[RFC] Add LLMEvaluator to create LLM-as-a-judge evaluators (#831)
It's currently quite annoying to use LLM-as-a-judge evaluators in code,
and there is a bit of a disconnect between the SDK and the UI.

Our [off-the-shelf
evaluators](https://docs.smith.langchain.com/how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators)
don't even use tool calling.

In the UI, you can specify the prompt and output schema directly. In the SDK,
you have to call `.with_structured_output` inside a custom evaluator function,
which can be a lot of boilerplate for the user.
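
For context, here is a rough sketch of the kind of hand-rolled judge this replaces. The prompt wording, the `CorrectnessGrade` schema, and the model choice are all made up for illustration:

```python
# Hypothetical hand-rolled LLM-as-a-judge evaluator (names are illustrative only).
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field

from langsmith.schemas import Example, Run


class CorrectnessGrade(BaseModel):
    """Output schema the user has to define by hand."""

    score: str = Field(description="One of 'correct' or 'incorrect'.")
    explanation: str = Field(description="Why the answer received this score.")


_grader = ChatPromptTemplate.from_messages(
    [("human", "Question: {input}\nAnswer: {output}\nIs the answer correct?")]
) | ChatOpenAI(model="gpt-4o").with_structured_output(CorrectnessGrade)


def correctness(run: Run, example: Example) -> dict:
    """Custom evaluator to pass via `evaluate(..., evaluators=[correctness])`.

    Assumes the run has exactly one input key and one output key.
    """
    grade = _grader.invoke(
        {
            "input": next(iter(run.inputs.values())),
            "output": next(iter(run.outputs.values())),
        }
    )
    return {"key": "correctness", "value": grade.score, "comment": grade.explanation}
```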

Additionally, a raw JSON schema or generic Pydantic model is likely not the
best interface for letting people specify the score format for their LLM
evaluators. I opted for something more opinionated: `CategoricalScoreConfig`
and `ContinuousScoreConfig`.

**Important detail**: I map each score to its own tool, as opposed to making it
one argument of a larger tool. This allows other attributes, like an
explanation, to be extracted in the same call and mapped to the same feedback entry.
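
For illustration, roughly how the new pieces below fit together; the key, choices, and prompt text are made up, and the import path follows the new `python/langsmith/evaluation/llm_evaluator.py` module:

```python
from langsmith.evaluation.llm_evaluator import CategoricalScoreConfig, LLMEvaluator

# Made-up score config: one categorical score plus an explanation. Because the
# whole score maps to a single tool, the explanation comes back in the same call
# and lands in the same feedback entry as the score.
correctness = CategoricalScoreConfig(
    key="correctness",
    choices=["correct", "incorrect"],
    description="Whether the answer is factually correct.",
    include_explanation=True,
)

evaluator = LLMEvaluator(
    prompt_template="Question: {input}\nAnswer: {output}\nGrade the answer.",
    score_config=correctness,
    model_name="gpt-4o",      # default, shown for clarity
    model_provider="openai",  # default, shown for clarity
)

# evaluator.evaluate_run(run, example) returns an EvaluationResult with
# value="correct" / "incorrect" and comment=<explanation>; since LLMEvaluator is
# a RunEvaluator, it can also be passed directly to evaluate(...) as an evaluator.
```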

Future work:
* Allow people to load these from a file
* Create off-the-shelf evaluators based on `LLMEvaluator`
* async
agola11 authored Jul 19, 2024
2 parents 5cd607b + 726387d commit c594628
Showing 2 changed files with 501 additions and 0 deletions.
292 changes: 292 additions & 0 deletions python/langsmith/evaluation/llm_evaluator.py
@@ -0,0 +1,292 @@
"""Contains the LLMEvaluator class for building LLM-as-a-judge evaluators."""

from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

from pydantic import BaseModel

import langsmith.beta._utils as beta_utils
from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator
from langsmith.schemas import Example, Run


class CategoricalScoreConfig(BaseModel):
"""Configuration for a categorical score."""

key: str
choices: List[str]
description: str
include_explanation: bool = False
explanation_description: Optional[str] = None


class ContinuousScoreConfig(BaseModel):
"""Configuration for a continuous score."""

key: str
min: float = 0
max: float = 1
description: str
include_explanation: bool = False
explanation_description: Optional[str] = None


def _create_score_json_schema(
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
properties: Dict[str, Any] = {}
if isinstance(score_config, CategoricalScoreConfig):
properties["score"] = {
"type": "string",
"enum": score_config.choices,
"description": f"The score for the evaluation, one of "
f"{', '.join(score_config.choices)}.",
}
elif isinstance(score_config, ContinuousScoreConfig):
properties["score"] = {
"type": "number",
"minimum": score_config.min,
"maximum": score_config.max,
"description": f"The score for the evaluation, between "
f"{score_config.min} and {score_config.max}, inclusive.",
}
else:
raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

if score_config.include_explanation:
properties["explanation"] = {
"type": "string",
"description": (
"The explanation for the score."
if score_config.explanation_description is None
else score_config.explanation_description
),
}

return {
"title": score_config.key,
"description": score_config.description,
"type": "object",
"properties": properties,
"required": (
["score", "explanation"] if score_config.include_explanation else ["score"]
),
}


class LLMEvaluator(RunEvaluator):
"""A class for building LLM-as-a-judge evaluators."""

def __init__(
self,
*,
prompt_template: Union[str, List[Tuple[str, str]]],
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
model_name: str = "gpt-4o",
model_provider: str = "openai",
**kwargs,
):
"""Initialize the LLMEvaluator.
Args:
prompt_template (Union[str, List[Tuple[str, str]]): The prompt
template to use for the evaluation. If a string is provided, it is
assumed to be a human / user message.
score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
The configuration for the score, either categorical or continuous.
map_variables (Optional[Callable[[Run, Example], dict]], optional):
A function that maps the run and example to the variables in the
prompt. Defaults to None. If None, it is assumed that the prompt
only requires 'input', 'output', and 'expected'.
model_name (Optional[str], optional): The model to use for the evaluation.
Defaults to "gpt-4o".
model_provider (Optional[str], optional): The model provider to use
for the evaluation. Defaults to "openai".
"""
try:
from langchain.chat_models import init_chat_model
except ImportError as e:
raise ImportError(
"LLMEvaluator requires langchain to be installed. "
"Please install langchain by running `pip install langchain`."
) from e

chat_model = init_chat_model(
model=model_name, model_provider=model_provider, **kwargs
)

self._initialize(prompt_template, score_config, map_variables, chat_model)

@classmethod
def from_model(
cls,
model: Any,
*,
prompt_template: Union[str, List[Tuple[str, str]]],
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
):
"""Create an LLMEvaluator instance from a BaseChatModel instance.
Args:
model (BaseChatModel): The chat model instance to use for the evaluation.
prompt_template (Union[str, List[Tuple[str, str]]): The prompt
template to use for the evaluation. If a string is provided, it is
assumed to be a system message.
score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
The configuration for the score, either categorical or continuous.
map_variables (Optional[Callable[[Run, Example]], dict]], optional):
A function that maps the run and example to the variables in the
prompt. Defaults to None. If None, it is assumed that the prompt
only requires 'input', 'output', and 'expected'.
Returns:
LLMEvaluator: An instance of LLMEvaluator.
"""
instance = cls.__new__(cls)
instance._initialize(prompt_template, score_config, map_variables, model)
return instance

def _initialize(
self,
prompt_template: Union[str, List[Tuple[str, str]]],
score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
chat_model: Any,
):
"""Shared initialization code for __init__ and from_model.
Args:
prompt_template (Union[str, List[Tuple[str, str]]): The prompt template.
score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
The score configuration.
map_variables (Optional[Callable[[Run, Example]], dict]]):
Function to map variables.
chat_model (BaseChatModel): The chat model instance.
"""
try:
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
except ImportError as e:
raise ImportError(
"LLMEvaluator requires langchain-core to be installed. "
"Please install langchain-core by running `pip install langchain-core`."
) from e

if not (
isinstance(chat_model, BaseChatModel)
and hasattr(chat_model, "with_structured_output")
):
            raise ValueError(
                "chat_model must be an instance of "
                "BaseChatModel and support structured output."
            )

if isinstance(prompt_template, str):
self.prompt = ChatPromptTemplate.from_messages([("human", prompt_template)])
else:
self.prompt = ChatPromptTemplate.from_messages(prompt_template)

if set(self.prompt.input_variables) - {"input", "output", "expected"}:
if not map_variables:
raise ValueError(
"map_inputs must be provided if the prompt template contains "
"variables other than 'input', 'output', and 'expected'"
)
self.map_variables = map_variables

self.score_config = score_config
self.score_schema = _create_score_json_schema(self.score_config)

chat_model = chat_model.with_structured_output(self.score_schema)
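        # with_structured_output binds the score schema (tool calling for most
        # providers), so the score and optional explanation come back in one call.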
self.runnable = self.prompt | chat_model

@beta_utils.warn_beta
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
"""Evaluate a run."""
variables = self._prepare_variables(run, example)
output: dict = cast(dict, self.runnable.invoke(variables))
return self._parse_output(output)

@beta_utils.warn_beta
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
"""Asynchronously evaluate a run."""
variables = self._prepare_variables(run, example)
output: dict = cast(dict, await self.runnable.ainvoke(variables))
return self._parse_output(output)

def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
"""Prepare variables for model invocation."""
if self.map_variables:
return self.map_variables(run, example)

variables = {}
if "input" in self.prompt.input_variables:
if len(run.inputs) == 0:
raise ValueError(
"No input keys are present in run.inputs but the prompt "
"requires 'input'."
)
if len(run.inputs) != 1:
raise ValueError(
"Multiple input keys are present in run.inputs. Please provide "
"a map_variables function."
)
variables["input"] = list(run.inputs.values())[0]

if "output" in self.prompt.input_variables:
if not run.outputs:
raise ValueError(
"No output keys are present in run.outputs but the prompt "
"requires 'output'."
)
if len(run.outputs) != 1:
raise ValueError(
"Multiple output keys are present in run.outputs. Please "
"provide a map_variables function."
)
variables["output"] = list(run.outputs.values())[0]

if "expected" in self.prompt.input_variables:
if not example or not example.outputs:
raise ValueError(
"No example or example outputs is provided but the prompt "
"requires 'expected'."
)
if len(example.outputs) != 1:
raise ValueError(
"Multiple output keys are present in example.outputs. Please "
"provide a map_variables function."
)
variables["expected"] = list(example.outputs.values())[0]

return variables

def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
"""Parse the model output into an evaluation result."""
if isinstance(self.score_config, CategoricalScoreConfig):
value = output["score"]
explanation = output.get("explanation", None)
return EvaluationResult(
key=self.score_config.key, value=value, comment=explanation
)
elif isinstance(self.score_config, ContinuousScoreConfig):
score = output["score"]
explanation = output.get("explanation", None)
return EvaluationResult(
key=self.score_config.key, score=score, comment=explanation
)