Integrate MBPP benchmarks #1103

Merged: 3 commits, Apr 5, 2024
Changes from 1 commit
1 change: 1 addition & 0 deletions .gitignore
@@ -91,3 +91,4 @@ webapp/.next/

# locally saved datasets
gpt_engineer/benchmark/benchmarks/apps/dataset
gpt_engineer/benchmark/benchmarks/mbpp/dataset
2 changes: 2 additions & 0 deletions gpt_engineer/benchmark/benchmarks/load.py
@@ -12,12 +12,14 @@
from gpt_engineer.benchmark.benchmarks.apps.load import load_apps
from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng
from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme
from gpt_engineer.benchmark.benchmarks.mbpp.load import load_mbpp
from gpt_engineer.benchmark.types import Benchmark

BENCHMARKS = {
"gptme": load_gptme,
"gpteng": load_gpteng,
"apps": load_apps,
"mbpp": load_mbpp,
}


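For reference, a minimal sketch of how the new registry entry could be exercised, assuming callers resolve loaders by name from BENCHMARKS (the snippet is illustrative and not part of this diff):

    from gpt_engineer.benchmark.benchmarks.load import BENCHMARKS

    # "mbpp" now maps to load_mbpp; calling the loader loads (or downloads) the dataset
    load_benchmark = BENCHMARKS["mbpp"]
    benchmark = load_benchmark()
    print(benchmark.name, len(benchmark.tasks))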
114 changes: 114 additions & 0 deletions gpt_engineer/benchmark/benchmarks/mbpp/load.py
@@ -0,0 +1,114 @@
"""
Module for loading MBPP evaluation tasks.

This module provides functionality to load tasks for evaluating GPT-based models
on small, focused Python problems from the MBPP dataset. It defines a set of tasks
with predefined prompts and assertions to benchmark the performance of AI models.

Functions
---------
load_mbpp : function
    Loads the MBPP benchmark, which consists of a series of coding problems.
"""
from pathlib import Path
from subprocess import TimeoutExpired
from typing import Union

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

from gpt_engineer.benchmark.benchmarks.mbpp.problem import Problem
from gpt_engineer.benchmark.benchmarks.mbpp.problems import PROBLEM_IDS
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
from gpt_engineer.core.files_dict import FilesDict
from gpt_engineer.core.prompt import Prompt

DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/mbpp/dataset")
MAX_N_TEST_EXAMPLES = 10


class MbppAssertion:
def __init__(self, assertion: str):
self.assertion = assertion

def evaluate(self, assertable: Assertable) -> bool:
generated_code = assertable.files["main.py"]
code_with_assertion = f"{generated_code}\n{self.assertion}"

# Create new execution environment for every run to avoid side effects
env = DiskExecutionEnv()
env.upload(FilesDict({"main.py": code_with_assertion}))
pro = env.popen("python main.py")

try:
stdout, stderr = pro.communicate(timeout=2)
stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8")
except TimeoutExpired:
print("Execution Timeout")
return False

return not stderr


def _get_dataset() -> Union[Dataset, DatasetDict]:
try:
return load_from_disk(str(DATASET_PATH))
except FileNotFoundError:
print("Dataset not found locally, downloading...")

dataset = load_dataset("mbpp", "sanitized", trust_remote_code=True)
dataset.save_to_disk(DATASET_PATH)

return dataset


def load_mbpp():
"""
    Loads the MBPP benchmark, which consists of a series of coding problems.

Returns
-------
Benchmark
A Benchmark object containing a list of Task objects for the MBPP evaluation.
"""
dataset = _get_dataset()
tasks = []

problems = [
Problem(
source_file=problem["source_file"],
task_id=problem["task_id"],
prompt=problem["prompt"],
code=problem["code"],
test_imports=problem["test_imports"],
test_list=problem["test_list"],
)
for problem in dataset["test"]
if problem["task_id"] in PROBLEM_IDS
]

for problem in problems:
prompt = Prompt(
problem.prompt
+ "Please extend given function without changing it's declaration including arguments."
)

tasks.append(
Task(
name=str(problem.task_id),
initial_code=FilesDict({"main.py": problem.starting_code}),
command=None, # Explicitly setting `None` because each assertion runs code
prompt=prompt,
assertions={
f"correct assertion {i}": MbppAssertion(
assertion=assertion
).evaluate
for i, assertion in enumerate(problem.test_list)
},
)
)

return Benchmark(
name="MBPP",
tasks=tasks,
)
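For reference, a minimal sketch of what the loader produces, assuming Task exposes the fields it is constructed with above (name, initial_code, prompt, assertions); the snippet is illustrative only:

    from gpt_engineer.benchmark.benchmarks.mbpp.load import load_mbpp

    benchmark = load_mbpp()                  # fetches the "sanitized" MBPP split on first use
    task = benchmark.tasks[0]
    print(task.name)                         # the MBPP task_id, as a string
    print(task.initial_code["main.py"])      # truncated reference solution ending in a pass stub
    print(list(task.assertions))             # "correct assertion 0", "correct assertion 1", ...
    # Each assertion value is MbppAssertion.evaluate, which runs main.py plus one
    # assert statement in a fresh DiskExecutionEnv and passes iff stderr stays empty.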
25 changes: 25 additions & 0 deletions gpt_engineer/benchmark/benchmarks/mbpp/problem.py
@@ -0,0 +1,25 @@
from dataclasses import dataclass
from typing import List


@dataclass(frozen=True)
class Problem:
    source_file: str
    task_id: int
prompt: str
code: str
test_imports: str
test_list: List[str]

@property
def starting_code(self) -> str:
lines: List[str] = []

for line in self.code.split("\n"):
lines.append(line)

if line.startswith("def "):
lines.append("pass # TODO: Implement method\n")
break

return "\n".join(lines)
3 changes: 3 additions & 0 deletions gpt_engineer/benchmark/benchmarks/mbpp/problems.py
@@ -0,0 +1,3 @@
# TODO: Pick problems
# Temporary testing against these problems
PROBLEM_IDS = range(0, 100)