Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a testing module for backtesting #33

Merged
merged 21 commits into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 0 additions & 33 deletions .devcontainer/devcontainer.json

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v3
Expand Down
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,7 @@ __pycache__/
.env

docs/build
notebooks
notebooks
.mypy_cache
.devcontainer/*

2 changes: 1 addition & 1 deletion LICENCE
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright 2023 Phospho SAS

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/apidocs/phospho/phospho.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ phospho.collection
```
````

````{py:function} user_feedback(task_id: str, flag: typing.Optional[typing.Literal[success, failure]] = None, note: typing.Optional[str] = None, source: str = 'user', raw_flag: typing.Optional[str] = None, raw_flag_to_flag: typing.Optional[typing.Callable[[typing.Any], typing.Literal[success, failure]]] = None) -> phospho.tasks.Task
````{py:function} user_feedback(task_id: str, flag: typing.Optional[typing.Literal[success, failure]] = None, notes: typing.Optional[str] = None, source: str = 'user', raw_flag: typing.Optional[str] = None, raw_flag_to_flag: typing.Optional[typing.Callable[[typing.Any], typing.Literal[success, failure]]] = None) -> phospho.tasks.Task
:canonical: phospho.user_feedback

```{autodoc2-docstring} phospho.user_feedback
Expand Down
8 changes: 8 additions & 0 deletions docs/source/apidocs/phospho/phospho.tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@

````

````{py:method} update(metadata: typing.Optional[dict] = None, data: typing.Optional[dict] = None, notes: typing.Optional[str] = None, flag: typing.Optional[typing.Literal[success, failure]] = None, flag_source: typing.Optional[str] = None)
:canonical: phospho.tasks.Task.update

```{autodoc2-docstring} phospho.tasks.Task.update
```

````

`````

`````{py:class} TaskCollection(client)
Expand Down
108 changes: 0 additions & 108 deletions examples/agent_testing.py

This file was deleted.

Binary file not shown.
34 changes: 34 additions & 0 deletions examples/streamlit_santa_agent/phospho_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
This is an example of how to backtest an agent with phospho

1. Set up the API key and project ID as environment variables
2. Run the script

```bash
export PHOSPHO_API_KEY=your_api_key
export PHOSPHO_PROJECT_ID=your_project_id
python phospho_test.py
```
"""
import phospho

# This is the agent to test
from backend import SantaClausAgent

# NOTE(review): BASE_URL is overridden here to a local server address. This
# looks like a development setting — confirm whether people running this
# example against their own project should change or remove this line.
phospho.config.BASE_URL = "http://localhost:8000/v0"
# Test harness object: it is used below both to register the function under
# test (via its `test` decorator) and to launch the run (via `run`).
# executor_type="parallel" is passed to PhosphoTest unchanged.
phospho_test = phospho.PhosphoTest(executor_type="parallel")


@phospho_test.test
def test_santa(**inputs):
    """Answer one set of inputs with a freshly created SantaClausAgent.

    Every keyword argument is forwarded unchanged to
    ``SantaClausAgent.answer`` and that call's result is returned.
    """
    return SantaClausAgent().answer(**inputs)


# Launch the run for the function registered above. The active configuration
# uses the "backtest" source loader with a sample size of 5 and the
# "evaluate" metric.
phospho_test.run(
    # Alternative data source (disabled): use the "pandas" loader with the
    # file "golden_dataset.xlsx" instead of the "backtest" loader.
    # source_loader="pandas",
    # source_loader_params={"path": "golden_dataset.xlsx"},
    source_loader="backtest",
    source_loader_params={"sample_size": 5},
    metrics=["evaluate"],
)
18 changes: 15 additions & 3 deletions phospho/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from .agent import Agent
from .message import Message
from .client import Client
Expand All @@ -14,7 +15,8 @@
)
from .extractor import get_input_output, RawDataType
from ._version import __version__
from . import evals
from . import config
from .testing import PhosphoTest

__all__ = [
"Client",
Expand All @@ -37,7 +39,7 @@
"log",
"wrap",
"extractor",
"evals",
"PhosphoTest",
]

import pydantic
Expand Down Expand Up @@ -404,7 +406,7 @@ def log(
concatenate_raw_outputs_if_task_id_exists: bool = True,
stream: bool = False,
**kwargs: Dict[str, Any],
) -> Dict[str, object]:
) -> Optional[Dict[str, object]]:
"""Phospho's main all-purpose logging endpoint, with support for streaming.

Usage:
Expand Down Expand Up @@ -433,6 +435,11 @@ def log(
- log_event (Dict[str, object]):
The content of what has been logged.
"""
PHOSPHO_EXECUTION_MODE = os.getenv("PHOSPHO_EXECUTION_MODE")
if PHOSPHO_EXECUTION_MODE == "backtest":
# In backtest mode, don't log anything
return None

if stream:
# Implement the streaming logic over the output
# Note: The output must be mutable. Generators are not mutable
Expand Down Expand Up @@ -642,6 +649,11 @@ def wrapped_function(*func_args, **func_kwargs):
task_id=task_id,
)

PHOSPHO_EXECUTION_MODE = os.getenv("PHOSPHO_EXECUTION_MODE")
if PHOSPHO_EXECUTION_MODE == "backtest":
# In backtest mode, don't wrap the function
return __fn

return wrapped_function


Expand Down
22 changes: 20 additions & 2 deletions phospho/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from phospho.sessions import SessionCollection
from phospho.tasks import TaskCollection, Task
from phospho.evals import Comparison
from phospho.models import Comparison, Test


class Client:
Expand Down Expand Up @@ -100,7 +100,11 @@ def tasks(self) -> TaskCollection:
return TaskCollection(client=self)

def compare(
self, context_input: str, old_output: str, new_output: str
self,
context_input: str,
old_output: str,
new_output: str,
test_id: Optional[str] = None,
) -> Comparison:
"""
Compare the old and new answers to the context_input with an LLM
Expand All @@ -112,6 +116,7 @@ def compare(
"context_input": context_input,
"old_output": old_output,
"new_output": new_output,
"test_id": test_id,
},
)

Expand All @@ -138,3 +143,16 @@ def flag(
},
)
return Task(client=self, task_id=task_id, _content=response.json())

def create_test(self) -> Test:
    """
    Start a test for the current project.

    Posts the project id to the ``/tests/`` endpoint and builds a ``Test``
    from the JSON body of the response.
    """
    payload = {"project_id": self._project_id()}
    response = self._post("/tests/", payload=payload)
    return Test(**response.json())
19 changes: 9 additions & 10 deletions phospho/consumer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,11 @@ def __init__(
atexit.register(self.stop)

def run(self) -> None:
# If we are in backtest mode, we don't want to send logs
PHOSPHO_EXECUTION_MODE = os.getenv("PHOSPHO_EXECUTION_MODE")

while self.running:
if PHOSPHO_EXECUTION_MODE != "backtest":
self.send_batch()
self.send_batch()
time.sleep(self.tick)

if PHOSPHO_EXECUTION_MODE != "backtest":
self.send_batch()
self.send_batch()

def send_batch(self) -> None:
batch = self.log_queue.get_batch()
Expand All @@ -48,9 +43,13 @@ def send_batch(self) -> None:
logger.debug(f"Sending {len(batch)} log events to {self.client.base_url}")

try:
self.client._post(
f"/log/{self.client._project_id()}", {"batched_log_events": batch}
)
# If we are in backtest mode, we don't want to send logs
PHOSPHO_EXECUTION_MODE = os.getenv("PHOSPHO_EXECUTION_MODE")
if PHOSPHO_EXECUTION_MODE != "backtest":
self.client._post(
f"/log/{self.client._project_id()}",
{"batched_log_events": batch},
)
except Exception as e:
logger.warning(f"Error sending log events: {e}")

Expand Down
22 changes: 0 additions & 22 deletions phospho/evals.py

This file was deleted.

Loading