Integrate MBPP benchmarks #1103

Merged: 3 commits, Apr 5, 2024
Changes from 1 commit
1 change: 1 addition & 0 deletions .gitignore
@@ -91,3 +91,4 @@ webapp/.next/

# locally saved datasets
gpt_engineer/benchmark/benchmarks/apps/dataset
gpt_engineer/benchmark/benchmarks/mbpp/dataset
2 changes: 2 additions & 0 deletions gpt_engineer/benchmark/benchmarks/load.py
@@ -12,12 +12,14 @@
from gpt_engineer.benchmark.benchmarks.apps.load import load_apps
from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng
from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme
from gpt_engineer.benchmark.benchmarks.mbpp.load import load_mbpp
from gpt_engineer.benchmark.types import Benchmark

BENCHMARKS = {
"gptme": load_gptme,
"gpteng": load_gpteng,
"apps": load_apps,
"mbpp": load_mbpp,
}


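For reference, a minimal sketch of how the new registry entry could be exercised, assuming callers resolve loaders by name from BENCHMARKS (the snippet is illustrative and not part of this diff):

    from gpt_engineer.benchmark.benchmarks.load import BENCHMARKS

    # "mbpp" now maps to load_mbpp; calling the loader loads (or downloads) the dataset
    load_benchmark = BENCHMARKS["mbpp"]
    benchmark = load_benchmark()
    print(benchmark.name, len(benchmark.tasks))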
114 changes: 114 additions & 0 deletions gpt_engineer/benchmark/benchmarks/mbpp/load.py
@@ -0,0 +1,114 @@
"""
Module for loading MBPP evaluation tasks.

This module provides functionality to load tasks for evaluating GPT-based models
on small, focused Python problems from the MBPP dataset. It defines a set of tasks
with predefined prompts and assertions to benchmark the performance of AI models.

Functions
---------
load_mbpp : function
    Loads the MBPP benchmark, which consists of a series of coding problems.
"""
from pathlib import Path
from subprocess import TimeoutExpired
from typing import Union

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

from gpt_engineer.benchmark.benchmarks.mbpp.problem import Problem
from gpt_engineer.benchmark.benchmarks.mbpp.problems import PROBLEM_IDS
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
from gpt_engineer.core.files_dict import FilesDict
from gpt_engineer.core.prompt import Prompt

DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/mbpp/dataset")
MAX_N_TEST_EXAMPLES = 10


class MbppAssertion:
def __init__(self, assertion: str):
self.assertion = assertion

def evaluate(self, assertable: Assertable) -> bool:
generated_code = assertable.files["main.py"]
code_with_assertion = f"{generated_code}\n{self.assertion}"

# Create new execution environment for every run to avoid side effects
env = DiskExecutionEnv()
env.upload(FilesDict({"main.py": code_with_assertion}))
pro = env.popen("python main.py")

try:
stdout, stderr = pro.communicate(timeout=2)
stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8")
except TimeoutExpired:
print("Execution Timeout")
return False

return not stderr


def _get_dataset() -> Union[Dataset, DatasetDict]:
try:
return load_from_disk(str(DATASET_PATH))
except FileNotFoundError:
print("Dataset not found locally, downloading...")

dataset = load_dataset("mbpp", "sanitized", trust_remote_code=True)
dataset.save_to_disk(DATASET_PATH)

return dataset


def load_mbpp():
"""
    Loads the MBPP benchmark, which consists of a series of coding problems.

Returns
-------
Benchmark
A Benchmark object containing a list of Task objects for the MBPP evaluation.
"""
dataset = _get_dataset()
tasks = []

problems = [
Problem(
source_file=problem["source_file"],
task_id=problem["task_id"],
prompt=problem["prompt"],
code=problem["code"],
test_imports=problem["test_imports"],
test_list=problem["test_list"],
)
for problem in dataset["test"]
if problem["task_id"] in PROBLEM_IDS
]

for problem in problems:
prompt = Prompt(
problem.prompt
+ "Please extend given function without changing it's declaration including arguments."
)

tasks.append(
Task(
name=str(problem.task_id),
initial_code=FilesDict({"main.py": problem.starting_code}),
command=None, # Explicitly setting `None` because each assertion runs code
prompt=prompt,
assertions={
f"correct assertion {i}": MbppAssertion(
assertion=assertion
).evaluate
for i, assertion in enumerate(problem.test_list)
},
)
)

return Benchmark(
name="MBPP",
tasks=tasks,
)
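For reference, a minimal sketch of what the loader produces, assuming Task exposes the fields it is constructed with above (name, initial_code, prompt, assertions); the snippet is illustrative only:

    from gpt_engineer.benchmark.benchmarks.mbpp.load import load_mbpp

    benchmark = load_mbpp()                  # fetches the "sanitized" MBPP split on first use
    task = benchmark.tasks[0]
    print(task.name)                         # the MBPP task_id, as a string
    print(task.initial_code["main.py"])      # truncated reference solution ending in a pass stub
    print(list(task.assertions))             # "correct assertion 0", "correct assertion 1", ...
    # Each assertion value is MbppAssertion.evaluate, which runs main.py plus one
    # assert statement in a fresh DiskExecutionEnv and passes iff stderr stays empty.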
25 changes: 25 additions & 0 deletions gpt_engineer/benchmark/benchmarks/mbpp/problem.py
@@ -0,0 +1,25 @@
from dataclasses import dataclass
from typing import List


@dataclass(frozen=True)
class Problem:
    source_file: str
    task_id: int
prompt: str
code: str
test_imports: str
test_list: List[str]

@property
def starting_code(self) -> str:
lines: List[str] = []

for line in self.code.split("\n"):
lines.append(line)

if line.startswith("def "):
lines.append("pass # TODO: Implement method\n")
break

return "\n".join(lines)
3 changes: 3 additions & 0 deletions gpt_engineer/benchmark/benchmarks/mbpp/problems.py
@@ -0,0 +1,3 @@
# TODO: Pick problems
# Temporary testing against these problems
PROBLEM_IDS = range(0, 100)