[RFC] Add `LLMEvaluator` to create LLM-as-a-judge evaluators (#831)
It's currently quite annoying to use LLM-as-a-judge evaluators in code, and there is a bit of a disconnect between the SDK and the UI. Our [off-the-shelf evaluators](https://docs.smith.langchain.com/how_to_guides/evaluation/use_langchain_off_the_shelf_evaluators) don't even use tool calling. In the UI you can specify the prompt and output schema; with the SDK you have to use `.with_structured_output` inside a custom function, which is a lot of boilerplate for the user. Additionally, a JSON schema or a generic Pydantic model is likely not the best interface for specifying the score format of an LLM evaluator, so I opted for something more opinionated: `ContinuousScoreConfig` and `CategoricalScoreConfig` (see the usage sketch below).

**Important detail**: I map each score to a tool, as opposed to each argument of a tool. This allows other attributes, like an explanation, to be extracted and mapped to the same feedback entry.

Future work:
* Allow people to load these from a file
* Create off-the-shelf evaluators based on `LLMEvaluator`
* async support
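A minimal usage sketch of what this enables. Illustrative only: it assumes the new module is importable as `langsmith.evaluation.llm_evaluator` (the file path isn't shown on this page) and that `evaluate` accepts `RunEvaluator` instances; the target function and dataset name are made up.

```python
from langsmith.evaluation import evaluate
from langsmith.evaluation.llm_evaluator import CategoricalScoreConfig, LLMEvaluator

# One score config == one tool: the optional explanation is extracted from the
# same tool call and attached to the feedback entry as a comment.
correctness_evaluator = LLMEvaluator(
    prompt_template=(
        "Is the answer correct?\n\n"
        "Question: {input}\nAnswer: {output}\nReference: {expected}"
    ),
    score_config=CategoricalScoreConfig(
        key="correctness",
        choices=["Y", "N"],
        description="Whether the answer is correct.",
        include_explanation=True,
    ),
    model_name="gpt-4o",      # default
    model_provider="openai",  # default
)

results = evaluate(
    lambda inputs: {"answer": "42"},  # hypothetical target function
    data="my-dataset",                # hypothetical dataset name
    evaluators=[correctness_evaluator],
)
```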
Showing 2 changed files with 501 additions and 0 deletions.
@@ -0,0 +1,292 @@
"""Contains the LLMEvaluator class for building LLM-as-a-judge evaluators.""" | ||
|
||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast | ||
|
||
from pydantic import BaseModel | ||
|
||
import langsmith.beta._utils as beta_utils | ||
from langsmith.evaluation import EvaluationResult, EvaluationResults, RunEvaluator | ||
from langsmith.schemas import Example, Run | ||
|
||
|
||
class CategoricalScoreConfig(BaseModel):
    """Configuration for a categorical score."""

    key: str
    choices: List[str]
    description: str
    include_explanation: bool = False
    explanation_description: Optional[str] = None


class ContinuousScoreConfig(BaseModel):
    """Configuration for a continuous score."""

    key: str
    min: float = 0
    max: float = 1
    description: str
    include_explanation: bool = False
    explanation_description: Optional[str] = None

def _create_score_json_schema(
    score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
) -> dict:
    properties: Dict[str, Any] = {}
    if isinstance(score_config, CategoricalScoreConfig):
        properties["score"] = {
            "type": "string",
            "enum": score_config.choices,
            "description": f"The score for the evaluation, one of "
            f"{', '.join(score_config.choices)}.",
        }
    elif isinstance(score_config, ContinuousScoreConfig):
        properties["score"] = {
            "type": "number",
            "minimum": score_config.min,
            "maximum": score_config.max,
            "description": f"The score for the evaluation, between "
            f"{score_config.min} and {score_config.max}, inclusive.",
        }
    else:
        raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

    if score_config.include_explanation:
        properties["explanation"] = {
            "type": "string",
            "description": (
                "The explanation for the score."
                if score_config.explanation_description is None
                else score_config.explanation_description
            ),
        }

    return {
        "title": score_config.key,
        "description": score_config.description,
        "type": "object",
        "properties": properties,
        "required": (
            ["score", "explanation"] if score_config.include_explanation else ["score"]
        ),
    }
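# Illustrative output of the helper above (not additional logic): a
# CategoricalScoreConfig(key="correctness", choices=["Y", "N"],
# description="Whether the answer is correct.", include_explanation=True)
# yields the following tool schema:
# {
#     "title": "correctness",
#     "description": "Whether the answer is correct.",
#     "type": "object",
#     "properties": {
#         "score": {
#             "type": "string",
#             "enum": ["Y", "N"],
#             "description": "The score for the evaluation, one of Y, N.",
#         },
#         "explanation": {
#             "type": "string",
#             "description": "The explanation for the score.",
#         },
#     },
#     "required": ["score", "explanation"],
# }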


class LLMEvaluator(RunEvaluator):
    """A class for building LLM-as-a-judge evaluators."""

    def __init__(
        self,
        *,
        prompt_template: Union[str, List[Tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
        model_name: str = "gpt-4o",
        model_provider: str = "openai",
        **kwargs,
    ):
        """Initialize the LLMEvaluator.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]], optional):
                A function that maps the run and example to the variables in the
                prompt. Defaults to None. If None, it is assumed that the prompt
                only requires 'input', 'output', and 'expected'.
            model_name (str, optional): The model to use for the evaluation.
                Defaults to "gpt-4o".
            model_provider (str, optional): The model provider to use
                for the evaluation. Defaults to "openai".
            **kwargs: Additional keyword arguments forwarded to ``init_chat_model``.
        """
        try:
            from langchain.chat_models import init_chat_model
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain to be installed. "
                "Please install langchain by running `pip install langchain`."
            ) from e

        chat_model = init_chat_model(
            model=model_name, model_provider=model_provider, **kwargs
        )

        self._initialize(prompt_template, score_config, map_variables, chat_model)

    @classmethod
    def from_model(
        cls,
        model: Any,
        *,
        prompt_template: Union[str, List[Tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]] = None,
    ):
        """Create an LLMEvaluator instance from a BaseChatModel instance.

        Args:
            model (BaseChatModel): The chat model instance to use for the evaluation.
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt
                template to use for the evaluation. If a string is provided, it is
                assumed to be a human / user message.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The configuration for the score, either categorical or continuous.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]], optional):
                A function that maps the run and example to the variables in the
                prompt. Defaults to None. If None, it is assumed that the prompt
                only requires 'input', 'output', and 'expected'.

        Returns:
            LLMEvaluator: An instance of LLMEvaluator.
        """
        instance = cls.__new__(cls)
        instance._initialize(prompt_template, score_config, map_variables, model)
        return instance

    def _initialize(
        self,
        prompt_template: Union[str, List[Tuple[str, str]]],
        score_config: Union[CategoricalScoreConfig, ContinuousScoreConfig],
        map_variables: Optional[Callable[[Run, Optional[Example]], dict]],
        chat_model: Any,
    ):
        """Shared initialization code for __init__ and from_model.

        Args:
            prompt_template (Union[str, List[Tuple[str, str]]]): The prompt template.
            score_config (Union[CategoricalScoreConfig, ContinuousScoreConfig]):
                The score configuration.
            map_variables (Optional[Callable[[Run, Optional[Example]], dict]]):
                Function to map variables.
            chat_model (BaseChatModel): The chat model instance.
        """
        try:
            from langchain_core.language_models.chat_models import BaseChatModel
            from langchain_core.prompts import ChatPromptTemplate
        except ImportError as e:
            raise ImportError(
                "LLMEvaluator requires langchain-core to be installed. "
                "Please install langchain-core by running `pip install langchain-core`."
            ) from e

        if not (
            isinstance(chat_model, BaseChatModel)
            and hasattr(chat_model, "with_structured_output")
        ):
            raise ValueError(
                "chat_model must be an instance of "
                "BaseChatModel and support structured output."
            )

        if isinstance(prompt_template, str):
            self.prompt = ChatPromptTemplate.from_messages(
                [("human", prompt_template)]
            )
        else:
            self.prompt = ChatPromptTemplate.from_messages(prompt_template)

        if set(self.prompt.input_variables) - {"input", "output", "expected"}:
            if not map_variables:
                raise ValueError(
                    "map_variables must be provided if the prompt template contains "
                    "variables other than 'input', 'output', and 'expected'"
                )
        self.map_variables = map_variables

        self.score_config = score_config
        self.score_schema = _create_score_json_schema(self.score_config)

        # The score config is exposed to the model as a single structured-output
        # (tool) schema, so the score and its explanation come back together.
        chat_model = chat_model.with_structured_output(self.score_schema)
        self.runnable = self.prompt | chat_model

    @beta_utils.warn_beta
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate a run."""
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, self.runnable.invoke(variables))
        return self._parse_output(output)

    @beta_utils.warn_beta
    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Asynchronously evaluate a run."""
        variables = self._prepare_variables(run, example)
        output: dict = cast(dict, await self.runnable.ainvoke(variables))
        return self._parse_output(output)

    def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
        """Prepare variables for model invocation."""
        if self.map_variables:
            return self.map_variables(run, example)

        variables = {}
        if "input" in self.prompt.input_variables:
            if len(run.inputs) == 0:
                raise ValueError(
                    "No input keys are present in run.inputs but the prompt "
                    "requires 'input'."
                )
            if len(run.inputs) != 1:
                raise ValueError(
                    "Multiple input keys are present in run.inputs. Please provide "
                    "a map_variables function."
                )
            variables["input"] = list(run.inputs.values())[0]

        if "output" in self.prompt.input_variables:
            if not run.outputs:
                raise ValueError(
                    "No output keys are present in run.outputs but the prompt "
                    "requires 'output'."
                )
            if len(run.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in run.outputs. Please "
                    "provide a map_variables function."
                )
            variables["output"] = list(run.outputs.values())[0]

        if "expected" in self.prompt.input_variables:
            if not example or not example.outputs:
                raise ValueError(
                    "No example or example outputs are provided but the prompt "
                    "requires 'expected'."
                )
            if len(example.outputs) != 1:
                raise ValueError(
                    "Multiple output keys are present in example.outputs. Please "
                    "provide a map_variables function."
                )
            variables["expected"] = list(example.outputs.values())[0]

        return variables
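    # Illustrative default mapping (assuming the prompt references all three
    # variables): for a run with inputs={"question": "What is 2+2?"} and
    # outputs={"answer": "4"}, plus an example with outputs={"answer": "4"},
    # _prepare_variables returns
    # {"input": "What is 2+2?", "output": "4", "expected": "4"}.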

    def _parse_output(self, output: dict) -> Union[EvaluationResult, EvaluationResults]:
        """Parse the model output into an evaluation result."""
        if isinstance(self.score_config, CategoricalScoreConfig):
            value = output["score"]
            explanation = output.get("explanation", None)
            return EvaluationResult(
                key=self.score_config.key, value=value, comment=explanation
            )
        elif isinstance(self.score_config, ContinuousScoreConfig):
            score = output["score"]
            explanation = output.get("explanation", None)
            return EvaluationResult(
                key=self.score_config.key, score=score, comment=explanation
            )
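        # Illustrative example of the parsing above: with a categorical config
        # keyed "correctness", a structured output of
        # {"score": "Y", "explanation": "Matches the reference answer."} becomes
        # EvaluationResult(key="correctness", value="Y",
        # comment="Matches the reference answer.").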