[RFC] Add `LLMEvaluator` to create LLM-as-a-judge evaluators (#831)
It's currently quite annoying to use LLM-as-a-judge evaluators in code, and there is a bit of a disconnect between the SDK and the UI. Our [off-the-shelf evaluators](https://docs.smith.langchain.com/how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) don't even use tool calling. In the UI you can specify the prompt and output schema; with the SDK you have to use `.with_structured_output` inside a custom function, which is a lot of boilerplate for the user. Additionally, a JSON schema or a generic Pydantic model is likely not the best interface for specifying the score format of an LLM evaluator, so I opted for something more opinionated: `ContinuousScoreConfig` and `CategoricalScoreConfig` (see the usage sketch below).

**Important detail**: I map each score to a tool, as opposed to each argument of a tool. This allows other attributes, like an explanation, to be extracted and mapped to the same feedback entry.

Future work:
* Allow people to load these from a file
* Create off-the-shelf evaluators based on `LLMEvaluator`
* async support
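A minimal usage sketch of what this enables. Illustrative only: it assumes the new module is importable as `langsmith.evaluation.llm_evaluator` (the file path isn't shown on this page) and that `evaluate` accepts `RunEvaluator` instances; the target function and dataset name are made up.

```python
from langsmith.evaluation import evaluate
from langsmith.evaluation.llm_evaluator import CategoricalScoreConfig, LLMEvaluator

# One score config == one tool: the optional explanation is extracted from the
# same tool call and attached to the feedback entry as a comment.
correctness_evaluator = LLMEvaluator(
    prompt_template=(
        "Is the answer correct?\n\n"
        "Question: {input}\nAnswer: {output}\nReference: {expected}"
    ),
    score_config=CategoricalScoreConfig(
        key="correctness",
        choices=["Y", "N"],
        description="Whether the answer is correct.",
        include_explanation=True,
    ),
    model_name="gpt-4o",      # default
    model_provider="openai",  # default
)

results = evaluate(
    lambda inputs: {"answer": "42"},  # hypothetical target function
    data="my-dataset",                # hypothetical dataset name
    evaluators=[correctness_evaluator],
)
```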
Showing 2 changed files with 501 additions and 0 deletions.
@@ -0,0 +1,292 @@
"""Contains the LLMEvaluator class for building LLM-as-a-judge evaluators.""" | ||
|
||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast | ||
|
||
from pydantic import BaseModel | ||
|
||
import langsmith.beta._utils as beta_utils | ||
from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator | ||
from langsmith.schemas import Example, Run | ||
|
||
|
||
class CategoricalScoreConfig(BaseModel):
    """Configuration for a categorical score."""

    key: str
    choices: List[str]
    description: str
    include_explanation: bool = False
    explanation_description: Optional[str] = None


class ContinuousScoreConfig(BaseModel):
    """Configuration for a continuous score."""

    key: str
    min: float = 0
    max: float = 1
    description: str
    include_explanation: bool = False
    explanation_description: Optional[str] = None

def _create_score_json_schema(
    score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
    properties: Dict[str, Any] = {}
    if isinstance(score_config, CategoricalScoreConfig):
        properties["score"] = {
            "type": "string",
            "enum": score_config.choices,
            "description": f"The score for the evaluation, one of "
            f"{', '.join(score_config.choices)}.",
        }
    elif isinstance(score_config, ContinuousScoreConfig):
        properties["score"] = {
            "type": "number",
            "minimum": score_config.min,
            "maximum": score_config.max,
            "description": f"The score for the evaluation, between "
            f"{score_config.min} and {score_config.max}, inclusive.",
        }
    else:
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

    if score_config.include_explanation:
        properties["explanation"] = {
            "type": "string",
            "description": (
                "The explanation for the score."
                if score_config.explanation_description is None
                else score_config.explanation_description
            ),
        }

    return {
        "title": score_config.key,
        "description": score_config.description,
        "type": "object",
        "properties": properties,
        "required": (
            ["score", "explanation"] if score_config.include_explanation else ["score"]
        ),
    }
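# Illustrative output of the helper above (not additional logic): a
# CategoricalScoreConfig(key="correctness", choices=["Y", "N"],
# description="Whether the answer is correct.", include_explanation=True)
# yields the following tool schema:
# {
#     "title": "correctness",
#     "description": "Whether the answer is correct.",
#     "type": "object",
#     "properties": {
#         "score": {
#             "type": "string",
#             "enum": ["Y", "N"],
#             "description": "The score for the evaluation, one of Y, N.",
#         },
#         "explanation": {
#             "type": "string",
#             "description": "The explanation for the score.",
#         },
#     },
#     "required": ["score", "explanation"],
# }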


class LLMEvaluator(RunEvaluator):
    """A class for building LLM-as-a-judge evaluators."""

    def __init__(
        self,
        *,
        prompt_template: Union[str, List[Tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
        model_name: str = "gpt-4o",
        model_provider: str = "openai",
        **kwargs,
    ):
        """Initialize the LLMEvaluator.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]], optional):
                A function that maps the run and example to the variables in the
                prompt. Defaults to None. If None, it is assumed that the prompt
                only requires 'input', 'output', and 'expected'.
            model_name (str, optional): The model to use for the evaluation.
                Defaults to "gpt-4o".
            model_provider (str, optional): The model provider to use
                for the evaluation. Defaults to "openai".
            **kwargs: Additional keyword arguments forwarded to ``init_chat_model``.
        """
        try:
            from langchain.chat_models import init_chat_model
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain to be installed. "
                "Please install langchain by running `pip install langchain`."
            ) from e

        chat_model = init_chat_model(
            model=model_name, model_provider=model_provider, **kwargs
        )

        self._initialize(prompt_template, score_config, map_variables, chat_model)

    @classmethod
    def from_model(
        cls,
        model: Any,
        *,
        prompt_template: Union[str, List[Tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
    ):
        """Create an LLMEvaluator instance from a BaseChatModel instance.

        Args:
            model (BaseChatModel): The chat model instance to use for the evaluation.
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]], optional):
                A function that maps the run and example to the variables in the
                prompt. Defaults to None. If None, it is assumed that the prompt
                only requires 'input', 'output', and 'expected'.

        Returns:
            LLMEvaluator: An instance of LLMEvaluator.
        """
        instance = cls.__new__(cls)
        instance._initialize(prompt_template, score_config, map_variables, model)
        return instance

    def _initialize(
        self,
        prompt_template: Union[str, List[Tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
        chat_model: Any,
    ):
        """Shared initialization code for __init__ and from_model.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt template.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The score configuration.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                Function to map variables.
            chat_model (BaseChatModel): The chat model instance.
        """
        try:
            from langchain_core.language_models.chat_models import BaseChatModel
            from langchain_core.prompts import ChatPromptTemplate
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain-core to be installed. "
                "Please install langchain-core by running `pip install langchain-core`."
            ) from e

        if not (
            isinstance(chat_model, BaseChatModel)
            and hasattr(chat_model, "with_structured_output")
        ):
            raise ValueError(
                "chat_model must be an instance of "
                "BaseChatModel and support structured output."
            )

        if isinstance(prompt_template, str):
            self.prompt = ChatPromptTemplate.from_messages(
                [("human", prompt_template)]
            )
        else:
            self.prompt = ChatPromptTemplate.from_messages(prompt_template)

        if set(self.prompt.input_variables) - {"input", "output", "expected"}:
            if not map_variables:
                raise ValueError(
                    "map_variables must be provided if the prompt template contains "
                    "variables other than 'input', 'output', and 'expected'"
                )
        self.map_variables = map_variables

        self.score_config = score_config
        self.score_schema = _create_score_json_schema(self.score_config)

        # The score config is exposed to the model as a single structured-output
        # (tool) schema, so the score and its explanation come back together.
        chat_model = chat_model.with_structured_output(self.score_schema)
        self.runnable = self.prompt | chat_model

    @beta_utils.warn_beta
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate a run."""
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, self.runnable.invoke(variables))
        return self._parse_output(output)

    @beta_utils.warn_beta
    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Asynchronously evaluate a run."""
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, await self.runnable.ainvoke(variables))
        return self._parse_output(output)

    def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
        """Prepare variables for model invocation."""
        if self.map_variables:
            return self.map_variables(run, example)

        variables = {}
        if "input" in self.prompt.input_variables:
            if len(run.inputs) == 0:
                raise ValueError(
                    "No input keys are present in run.inputs but the prompt "
                    "requires 'input'."
                )
            if len(run.inputs) != 1:
                raise ValueError(
                    "Multiple input keys are present in run.inputs. Please provide "
                    "a map_variables function."
                )
            variables["input"] = list(run.inputs.values())[0]

        if "output" in self.prompt.input_variables:
            if not run.outputs:
                raise ValueError(
                    "No output keys are present in run.outputs but the prompt "
                    "requires 'output'."
                )
            if len(run.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in run.outputs. Please "
                    "provide a map_variables function."
                )
            variables["output"] = list(run.outputs.values())[0]

        if "expected" in self.prompt.input_variables:
            if not example or not example.outputs:
                raise ValueError(
                    "No example or example outputs are provided but the prompt "
                    "requires 'expected'."
                )
            if len(example.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in example.outputs. Please "
                    "provide a map_variables function."
                )
            variables["expected"] = list(example.outputs.values())[0]

        return variables
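    # Illustrative default mapping (assuming the prompt references all three
    # variables): for a run with inputs={"question": "What is 2+2?"} and
    # outputs={"answer": "4"}, plus an example with outputs={"answer": "4"},
    # _prepare_variables returns
    # {"input": "What is 2+2?", "output": "4", "expected": "4"}.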

    def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
        """Parse the model output into an evaluation result."""
        if isinstance(self.score_config, CategoricalScoreConfig):
            value = output["score"]
            explanation = output.get("explanation", None)
            return EvaluationResult(
                key=self.score_config.key, value=value, comment=explanation
            )
        elif isinstance(self.score_config, ContinuousScoreConfig):
            score = output["score"]
            explanation = output.get("explanation", None)
            return EvaluationResult(
                key=self.score_config.key, score=score, comment=explanation
            )
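        # Illustrative example of the parsing above: with a categorical config
        # keyed "correctness", a structured output of
        # {"score": "Y", "explanation": "Matches the reference answer."} becomes
        # EvaluationResult(key="correctness", value="Y",
        # comment="Matches the reference answer.").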