From d02fcb44f2ccdb32cd19cca0d8c191055605dfca Mon Sep 17 00:00:00 2001
From: azrv
Date: Wed, 3 Apr 2024 23:48:19 +0300
Subject: [PATCH 1/3] Integrate MBPP benchmarks

---
 .gitignore                                |   1 +
 gpt_engineer/benchmark/benchmarks/load.py |   2 +
 .../benchmark/benchmarks/mbpp/load.py     | 114 ++++++++++++++++++
 .../benchmark/benchmarks/mbpp/problem.py  |  25 ++++
 .../benchmark/benchmarks/mbpp/problems.py |   3 +
 5 files changed, 145 insertions(+)
 create mode 100644 gpt_engineer/benchmark/benchmarks/mbpp/load.py
 create mode 100644 gpt_engineer/benchmark/benchmarks/mbpp/problem.py
 create mode 100644 gpt_engineer/benchmark/benchmarks/mbpp/problems.py

diff --git a/.gitignore b/.gitignore
index 7540c0236e..79745c28db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,4 @@ webapp/.next/
 
 # locally saved datasets
 gpt_engineer/benchmark/benchmarks/apps/dataset
+gpt_engineer/benchmark/benchmarks/mbpp/dataset
diff --git a/gpt_engineer/benchmark/benchmarks/load.py b/gpt_engineer/benchmark/benchmarks/load.py
index 2d3c266362..6ed8659a26 100644
--- a/gpt_engineer/benchmark/benchmarks/load.py
+++ b/gpt_engineer/benchmark/benchmarks/load.py
@@ -12,12 +12,14 @@
 from gpt_engineer.benchmark.benchmarks.apps.load import load_apps
 from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng
 from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme
+from gpt_engineer.benchmark.benchmarks.mbpp.load import load_mbpp
 from gpt_engineer.benchmark.types import Benchmark
 
 BENCHMARKS = {
     "gptme": load_gptme,
     "gpteng": load_gpteng,
     "apps": load_apps,
+    "mbpp": load_mbpp,
 }
 
 
diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/load.py b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
new file mode 100644
index 0000000000..da95108a19
--- /dev/null
+++ b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
@@ -0,0 +1,114 @@
+"""
+Module for loading MBPP evaluation tasks.
+
+This module provides functionality to load tasks for evaluating GPT-based models
+on smaller, more focused tasks. It defines a set of tasks with predefined prompts
+and assertions to benchmark the performance of AI models.
+
+Functions
+---------
+load_mbpp : function
+    Loads the MBPP benchmark, which consists of a series of coding problems.
+""" +from pathlib import Path +from subprocess import TimeoutExpired +from typing import Union + +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk + +from gpt_engineer.benchmark.benchmarks.mbpp.problem import Problem +from gpt_engineer.benchmark.benchmarks.mbpp.problems import PROBLEM_IDS +from gpt_engineer.benchmark.types import Assertable, Benchmark, Task +from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv +from gpt_engineer.core.files_dict import FilesDict +from gpt_engineer.core.prompt import Prompt + +DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/mbpp/dataset") +MAX_N_TEST_EXAMPLES = 10 + + +class MbppAssertion: + def __init__(self, assertion: str): + self.assertion = assertion + + def evaluate(self, assertable: Assertable) -> bool: + generated_code = assertable.files["main.py"] + code_with_assertion = f"{generated_code}\n{self.assertion}" + + # Create new execution environment for every run to avoid side effects + env = DiskExecutionEnv() + env.upload(FilesDict({"main.py": code_with_assertion})) + pro = env.popen("python main.py") + + try: + stdout, stderr = pro.communicate(timeout=2) + stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8") + except TimeoutExpired: + print("Execution Timeout") + return False + + return not stderr + + +def _get_dataset() -> Union[Dataset, DatasetDict]: + try: + return load_from_disk(str(DATASET_PATH)) + except FileNotFoundError: + print("Dataset not found locally, downloading...") + + dataset = load_dataset("mbpp", "sanitized", trust_remote_code=True) + dataset.save_to_disk(DATASET_PATH) + + return dataset + + +def load_mbpp(): + """ + Loads the MBPP benchmark, which consists of a series coding problems. + + Returns + ------- + Benchmark + A Benchmark object containing a list of Task objects for the MBPP evaluation. + """ + dataset = _get_dataset() + tasks = [] + + problems = [ + Problem( + source_file=problem["source_file"], + task_id=problem["task_id"], + prompt=problem["prompt"], + code=problem["code"], + test_imports=problem["test_imports"], + test_list=problem["test_list"], + ) + for problem in dataset["test"] + if problem["task_id"] in PROBLEM_IDS + ] + + for problem in problems: + prompt = Prompt( + problem.prompt + + "Please extend given function without changing it's declaration including arguments." 
+        )
+
+        tasks.append(
+            Task(
+                name=str(problem.task_id),
+                initial_code=FilesDict({"main.py": problem.starting_code}),
+                command=None,  # Explicitly set to `None` because each assertion runs the code itself
+                prompt=prompt,
+                assertions={
+                    f"correct assertion {i}": MbppAssertion(
+                        assertion=assertion
+                    ).evaluate
+                    for i, assertion in enumerate(problem.test_list)
+                },
+            )
+        )
+
+    return Benchmark(
+        name="MBPP",
+        tasks=tasks,
+    )
diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/problem.py b/gpt_engineer/benchmark/benchmarks/mbpp/problem.py
new file mode 100644
index 0000000000..ca3cd0ad4f
--- /dev/null
+++ b/gpt_engineer/benchmark/benchmarks/mbpp/problem.py
@@ -0,0 +1,25 @@
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass(frozen=True)
+class Problem:
+    source_file: str
+    task_id: int
+    prompt: str
+    code: str
+    test_imports: List[str]
+    test_list: List[str]
+
+    @property
+    def starting_code(self) -> str:
+        lines: List[str] = []
+
+        for line in self.code.split("\n"):
+            lines.append(line)
+
+            if line.startswith("def "):
+                lines.append("    pass  # TODO: Implement method\n")
+                break
+
+        return "\n".join(lines)
diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/problems.py b/gpt_engineer/benchmark/benchmarks/mbpp/problems.py
new file mode 100644
index 0000000000..c9dffd97a6
--- /dev/null
+++ b/gpt_engineer/benchmark/benchmarks/mbpp/problems.py
@@ -0,0 +1,3 @@
+# TODO: Pick problems
+# Temporarily testing against these problems
+PROBLEM_IDS = range(0, 100)

From 389586a216ecf0ffb815f43d64f2f4696d90ee07 Mon Sep 17 00:00:00 2001
From: azrv
Date: Thu, 4 Apr 2024 20:49:36 +0300
Subject: [PATCH 2/3] Load API key from .env when running benchmarks

---
 gpt_engineer/benchmark/__main__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gpt_engineer/benchmark/__main__.py b/gpt_engineer/benchmark/__main__.py
index 78f51b13d8..a1ca76525b 100644
--- a/gpt_engineer/benchmark/__main__.py
+++ b/gpt_engineer/benchmark/__main__.py
@@ -28,6 +28,7 @@
 from langchain.cache import SQLiteCache
 from langchain.globals import set_llm_cache
 
+from gpt_engineer.applications.cli.main import load_env_if_needed
 from gpt_engineer.benchmark.benchmarks.load import get_benchmark
 from gpt_engineer.benchmark.run import print_results, run
 
@@ -87,6 +88,7 @@ def main(
     None
     """
     set_llm_cache(SQLiteCache(database_path=".langchain.db"))
+    load_env_if_needed()
 
     benchmarks = benchmarks.split(",")
     for benchmark_name in benchmarks:

From 2649d8f59896526b218f1a0c79e182a41dac8886 Mon Sep 17 00:00:00 2001
From: azrv
Date: Thu, 4 Apr 2024 20:54:31 +0300
Subject: [PATCH 3/3] Use module directory to locate the cached dataset

---
 gpt_engineer/benchmark/benchmarks/apps/load.py | 2 +-
 gpt_engineer/benchmark/benchmarks/mbpp/load.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpt_engineer/benchmark/benchmarks/apps/load.py b/gpt_engineer/benchmark/benchmarks/apps/load.py
index 2908b34b0b..65cf515713 100644
--- a/gpt_engineer/benchmark/benchmarks/apps/load.py
+++ b/gpt_engineer/benchmark/benchmarks/apps/load.py
@@ -23,7 +23,7 @@
 from gpt_engineer.core.files_dict import FilesDict
 from gpt_engineer.core.prompt import Prompt
 
-DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/apps/dataset")
+DATASET_PATH = Path(__file__).parent / "dataset"
 MAX_N_TEST_EXAMPLES = 10
 
 
diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/load.py b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
index da95108a19..9aefef0d92 100644
--- a/gpt_engineer/benchmark/benchmarks/mbpp/load.py
+++ b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
@@ -23,7 +23,7 @@
 from gpt_engineer.core.files_dict import FilesDict
 from gpt_engineer.core.prompt import Prompt
 
-DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/mbpp/dataset")
+DATASET_PATH = Path(__file__).parent / "dataset"
 MAX_N_TEST_EXAMPLES = 10
 
 
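
Reviewer sketch (not part of the patch series): the snippet below mirrors the MbppAssertion pass/fail rule from mbpp/load.py outside of gpt-engineer, using only the Python standard library, so a single MBPP check can be tried in isolation. The helper name passes_assertion, the sample solution, and the assertion string are assumptions made for this example; the real benchmark executes each check through DiskExecutionEnv as shown above.

import subprocess
import sys
import tempfile
from pathlib import Path


def passes_assertion(generated_code: str, assertion: str, timeout: float = 2.0) -> bool:
    """Return True when the solution plus the assertion runs without writing to stderr."""
    code_with_assertion = f"{generated_code}\n{assertion}"

    # Write the combined program into a throwaway directory, mimicking the
    # fresh execution environment created for every assertion in the patch.
    with tempfile.TemporaryDirectory() as tmp_dir:
        main_py = Path(tmp_dir) / "main.py"
        main_py.write_text(code_with_assertion)

        proc = subprocess.Popen(
            [sys.executable, "main.py"],
            cwd=tmp_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        try:
            _stdout, stderr = proc.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.kill()
            return False  # Mirrors the patch: a timeout counts as a failure

    # A failing assert raises AssertionError, which lands on stderr,
    # so an empty stderr is treated as a pass.
    return not stderr


if __name__ == "__main__":
    # Hypothetical MBPP-style solution and assertion, for illustration only.
    solution = (
        "def similar_elements(test_tup1, test_tup2):\n"
        "    return tuple(set(test_tup1) & set(test_tup2))\n"
    )
    check = "assert set(similar_elements((3, 4, 5, 6), (5, 7, 4, 10))) == {4, 5}"
    print(passes_assertion(solution, check))  # Expected output: True

With patch 1 applied, the loader is registered under the "mbpp" key in BENCHMARKS and can be selected by that name through the existing benchmark runner; patch 2 ensures the API key is read from .env before the runs start.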