From d02fcb44f2ccdb32cd19cca0d8c191055605dfca Mon Sep 17 00:00:00 2001
From: azrv
Date: Wed, 3 Apr 2024 23:48:19 +0300
Subject: [PATCH 1/3] Integrate MBPP benchmarks

---
 .gitignore                                |   1 +
 gpt_engineer/benchmark/benchmarks/load.py |   2 +
 .../benchmark/benchmarks/mbpp/load.py     | 114 ++++++++++++++++++
 .../benchmark/benchmarks/mbpp/problem.py  |  25 ++++
 .../benchmark/benchmarks/mbpp/problems.py |   3 +
 5 files changed, 145 insertions(+)
 create mode 100644 gpt_engineer/benchmark/benchmarks/mbpp/load.py
 create mode 100644 gpt_engineer/benchmark/benchmarks/mbpp/problem.py
 create mode 100644 gpt_engineer/benchmark/benchmarks/mbpp/problems.py

diff --git a/.gitignore b/.gitignore
index 7540c0236e..79745c28db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,4 @@ webapp/.next/
 
 # locally saved datasets
 gpt_engineer/benchmark/benchmarks/apps/dataset
+gpt_engineer/benchmark/benchmarks/mbpp/dataset
diff --git a/gpt_engineer/benchmark/benchmarks/load.py b/gpt_engineer/benchmark/benchmarks/load.py
index 2d3c266362..6ed8659a26 100644
--- a/gpt_engineer/benchmark/benchmarks/load.py
+++ b/gpt_engineer/benchmark/benchmarks/load.py
@@ -12,12 +12,14 @@
 from gpt_engineer.benchmark.benchmarks.apps.load import load_apps
 from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng
 from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme
+from gpt_engineer.benchmark.benchmarks.mbpp.load import load_mbpp
 from gpt_engineer.benchmark.types import Benchmark
 
 BENCHMARKS = {
     "gptme": load_gptme,
     "gpteng": load_gpteng,
     "apps": load_apps,
+    "mbpp": load_mbpp,
 }
 
 
diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/load.py b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
new file mode 100644
index 0000000000..da95108a19
--- /dev/null
+++ b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
@@ -0,0 +1,114 @@
+"""
+Module for loading MBPP evaluation tasks.
+
+This module provides functionality to load tasks for evaluating GPT-based models
+on smaller, more focused tasks. It defines a set of tasks with predefined prompts
+and assertions to benchmark the performance of AI models.
+
+Functions
+---------
+load_mbpp : function
+    Loads the MBPP benchmark, which consists of a series of coding problems.
+""" +from pathlib import Path +from subprocess import TimeoutExpired +from typing import Union + +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk + +from gpt_engineer.benchmark.benchmarks.mbpp.problem import Problem +from gpt_engineer.benchmark.benchmarks.mbpp.problems import PROBLEM_IDS +from gpt_engineer.benchmark.types import Assertable, Benchmark, Task +from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv +from gpt_engineer.core.files_dict import FilesDict +from gpt_engineer.core.prompt import Prompt + +DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/mbpp/dataset") +MAX_N_TEST_EXAMPLES = 10 + + +class MbppAssertion: + def __init__(self, assertion: str): + self.assertion = assertion + + def evaluate(self, assertable: Assertable) -> bool: + generated_code = assertable.files["main.py"] + code_with_assertion = f"{generated_code}\n{self.assertion}" + + # Create new execution environment for every run to avoid side effects + env = DiskExecutionEnv() + env.upload(FilesDict({"main.py": code_with_assertion})) + pro = env.popen("python main.py") + + try: + stdout, stderr = pro.communicate(timeout=2) + stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8") + except TimeoutExpired: + print("Execution Timeout") + return False + + return not stderr + + +def _get_dataset() -> Union[Dataset, DatasetDict]: + try: + return load_from_disk(str(DATASET_PATH)) + except FileNotFoundError: + print("Dataset not found locally, downloading...") + + dataset = load_dataset("mbpp", "sanitized", trust_remote_code=True) + dataset.save_to_disk(DATASET_PATH) + + return dataset + + +def load_mbpp(): + """ + Loads the MBPP benchmark, which consists of a series coding problems. + + Returns + ------- + Benchmark + A Benchmark object containing a list of Task objects for the MBPP evaluation. + """ + dataset = _get_dataset() + tasks = [] + + problems = [ + Problem( + source_file=problem["source_file"], + task_id=problem["task_id"], + prompt=problem["prompt"], + code=problem["code"], + test_imports=problem["test_imports"], + test_list=problem["test_list"], + ) + for problem in dataset["test"] + if problem["task_id"] in PROBLEM_IDS + ] + + for problem in problems: + prompt = Prompt( + problem.prompt + + "Please extend given function without changing it's declaration including arguments." 
+        )
+
+        tasks.append(
+            Task(
+                name=str(problem.task_id),
+                initial_code=FilesDict({"main.py": problem.starting_code}),
+                command=None,  # Explicitly set to `None` because each assertion runs the code itself
+                prompt=prompt,
+                assertions={
+                    f"correct assertion {i}": MbppAssertion(
+                        assertion=assertion
+                    ).evaluate
+                    for i, assertion in enumerate(problem.test_list)
+                },
+            )
+        )
+
+    return Benchmark(
+        name="MBPP",
+        tasks=tasks,
+    )
diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/problem.py b/gpt_engineer/benchmark/benchmarks/mbpp/problem.py
new file mode 100644
index 0000000000..ca3cd0ad4f
--- /dev/null
+++ b/gpt_engineer/benchmark/benchmarks/mbpp/problem.py
@@ -0,0 +1,25 @@
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass(frozen=True)
+class Problem:
+    source_file: str
+    task_id: int
+    prompt: str
+    code: str
+    test_imports: List[str]
+    test_list: List[str]
+
+    @property
+    def starting_code(self) -> str:
+        lines: List[str] = []
+
+        for line in self.code.split("\n"):
+            lines.append(line)
+
+            if line.startswith("def "):
+                lines.append("    pass  # TODO: Implement method\n")
+                break
+
+        return "\n".join(lines)
diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/problems.py b/gpt_engineer/benchmark/benchmarks/mbpp/problems.py
new file mode 100644
index 0000000000..c9dffd97a6
--- /dev/null
+++ b/gpt_engineer/benchmark/benchmarks/mbpp/problems.py
@@ -0,0 +1,3 @@
+# TODO: Pick problems
+# Temporarily testing against these problems
+PROBLEM_IDS = range(0, 100)

From 389586a216ecf0ffb815f43d64f2f4696d90ee07 Mon Sep 17 00:00:00 2001
From: azrv
Date: Thu, 4 Apr 2024 20:49:36 +0300
Subject: [PATCH 2/3] Load API key from .env when running benchmarks

---
 gpt_engineer/benchmark/__main__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gpt_engineer/benchmark/__main__.py b/gpt_engineer/benchmark/__main__.py
index 78f51b13d8..a1ca76525b 100644
--- a/gpt_engineer/benchmark/__main__.py
+++ b/gpt_engineer/benchmark/__main__.py
@@ -28,6 +28,7 @@
 from langchain.cache import SQLiteCache
 from langchain.globals import set_llm_cache
 
+from gpt_engineer.applications.cli.main import load_env_if_needed
 from gpt_engineer.benchmark.benchmarks.load import get_benchmark
 from gpt_engineer.benchmark.run import print_results, run
 
@@ -87,6 +88,7 @@ def main(
     None
     """
     set_llm_cache(SQLiteCache(database_path=".langchain.db"))
+    load_env_if_needed()
 
     benchmarks = benchmarks.split(",")
     for benchmark_name in benchmarks:

From 2649d8f59896526b218f1a0c79e182a41dac8886 Mon Sep 17 00:00:00 2001
From: azrv
Date: Thu, 4 Apr 2024 20:54:31 +0300
Subject: [PATCH 3/3] Use module directory to locate the cached dataset

---
 gpt_engineer/benchmark/benchmarks/apps/load.py | 2 +-
 gpt_engineer/benchmark/benchmarks/mbpp/load.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpt_engineer/benchmark/benchmarks/apps/load.py b/gpt_engineer/benchmark/benchmarks/apps/load.py
index 2908b34b0b..65cf515713 100644
--- a/gpt_engineer/benchmark/benchmarks/apps/load.py
+++ b/gpt_engineer/benchmark/benchmarks/apps/load.py
@@ -23,7 +23,7 @@
 from gpt_engineer.core.files_dict import FilesDict
 from gpt_engineer.core.prompt import Prompt
 
-DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/apps/dataset")
+DATASET_PATH = Path(__file__).parent / "dataset"
 MAX_N_TEST_EXAMPLES = 10
 
 
diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/load.py b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
index da95108a19..9aefef0d92 100644
--- a/gpt_engineer/benchmark/benchmarks/mbpp/load.py
+++ b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
@@ -23,7 +23,7 @@
 from gpt_engineer.core.files_dict import FilesDict
 from gpt_engineer.core.prompt import Prompt
 
-DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/mbpp/dataset")
+DATASET_PATH = Path(__file__).parent / "dataset"
 MAX_N_TEST_EXAMPLES = 10
 
 
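
Reviewer sketch (not part of the patch series): the snippet below mirrors the MbppAssertion pass/fail rule from mbpp/load.py outside of gpt-engineer, using only the Python standard library, so a single MBPP check can be tried in isolation. The helper name passes_assertion, the sample solution, and the assertion string are assumptions made for this example; the real benchmark executes each check through DiskExecutionEnv as shown above.

import subprocess
import sys
import tempfile
from pathlib import Path


def passes_assertion(generated_code: str, assertion: str, timeout: float = 2.0) -> bool:
    """Return True when the solution plus the assertion runs without writing to stderr."""
    code_with_assertion = f"{generated_code}\n{assertion}"

    # Write the combined program into a throwaway directory, mimicking the
    # fresh execution environment created for every assertion in the patch.
    with tempfile.TemporaryDirectory() as tmp_dir:
        main_py = Path(tmp_dir) / "main.py"
        main_py.write_text(code_with_assertion)

        proc = subprocess.Popen(
            [sys.executable, "main.py"],
            cwd=tmp_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        try:
            _stdout, stderr = proc.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            proc.kill()
            return False  # Mirrors the patch: a timeout counts as a failure

    # A failing assert raises AssertionError, which lands on stderr,
    # so an empty stderr is treated as a pass.
    return not stderr


if __name__ == "__main__":
    # Hypothetical MBPP-style solution and assertion, for illustration only.
    solution = (
        "def similar_elements(test_tup1, test_tup2):\n"
        "    return tuple(set(test_tup1) & set(test_tup2))\n"
    )
    check = "assert set(similar_elements((3, 4, 5, 6), (5, 7, 4, 10))) == {4, 5}"
    print(passes_assertion(solution, check))  # Expected output: True

With patch 1 applied, the loader is registered under the "mbpp" key in BENCHMARKS and can be selected by that name through the existing benchmark runner; patch 2 ensures the API key is read from .env before the runs start.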