import { BaseLanguageModel } from "@langchain/core/language_models/base";
import { RunnableConfig } from "@langchain/core/runnables";
import { Example, Run } from "langsmith";
import { EvaluationResult, RunEvaluator } from "langsmith/evaluation";
import { Criteria as CriteriaType, type EmbeddingDistanceEvalChainInput } from "../evaluation/index.js";
import { LoadEvaluatorOptions } from "../evaluation/loader.js";
import { EvaluatorType } from "../evaluation/types.js";
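/**
 * The prepared data handed to an evaluator: an optional input, the model's
 * prediction, and an optional reference output. Values should most commonly
 * be strings (see `formatEvaluatorInputs` below for how raw run data is
 * converted into this shape).
 */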
export type EvaluatorInputs = {
input?: string | unknown;
prediction: string | unknown;
reference?: string | unknown;
};
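/**
 * Function that converts the raw run input, raw prediction, raw reference
 * output, and the run itself into the `EvaluatorInputs` consumed by an
 * evaluator. Supplied as `formatEvaluatorInputs` on a config.
 */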
export type EvaluatorInputFormatter = ({ rawInput, rawPrediction, rawReferenceOutput, run, }: {
rawInput: any;
rawPrediction: any;
rawReferenceOutput?: any;
run: Run;
}) => EvaluatorInputs;
export type DynamicRunEvaluatorParams<Input extends Record<string, unknown> = Record<string, unknown>, Prediction extends Record<string, unknown> = Record<string, unknown>, Reference extends Record<string, unknown> = Record<string, unknown>> = {
input: Input;
prediction?: Prediction;
reference?: Reference;
run: Run;
example?: Example;
};
/**
* Type of a function that can be coerced into a RunEvaluator function.
* While we have the class-based RunEvaluator, it's often more convenient to directly
* pass a function to the runner. This type allows us to do that.
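*
* A rough sketch of such a function (the feedback key "has_answer" and the
* check below are illustrative, not part of the API):
* @example
* ```ts
* const hasAnswer: RunEvaluatorLike = async ({ prediction }) => ({
*   key: "has_answer",
*   score: prediction?.answer != null ? 1 : 0,
* });
* ```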
*/
export type RunEvaluatorLike = ((props: DynamicRunEvaluatorParams, options: RunnableConfig) => Promise<EvaluationResult>) | ((props: DynamicRunEvaluatorParams, options: RunnableConfig) => EvaluationResult);
export declare function isOffTheShelfEvaluator<T extends keyof EvaluatorType, U extends RunEvaluator | RunEvaluatorLike>(evaluator: T | EvalConfig | U): evaluator is T | EvalConfig;
export declare function isCustomEvaluator<T extends keyof EvaluatorType, U extends RunEvaluator | RunEvaluatorLike>(evaluator: T | EvalConfig | U): evaluator is U;
export type RunEvalType<T extends keyof EvaluatorType = keyof EvaluatorType, U extends RunEvaluator | RunEvaluatorLike = RunEvaluator | RunEvaluatorLike> = T | EvalConfig | U;
/**
* Configuration class for running evaluations on datasets.
*
* @remarks
* RunEvalConfig defines the parameters and evaluators that are applied when a dataset is evaluated in LangSmith. It can include off-the-shelf evaluators referenced by name, evaluators configured via EvalConfig objects, custom evaluators, and a formatter that maps raw run data to the inputs, predictions, and references the evaluators consume.
*
* @typeparam T - The type of evaluators.
* @typeparam U - The type of custom evaluators.
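*
* A rough sketch of a combined configuration (the criteria name, the custom
* feedback key "nonempty", and the raw field names are illustrative):
* @example
* ```ts
* const evalConfig: RunEvalConfig = {
*   evaluators: [
*     // An off-the-shelf evaluator configured via a helper:
*     Criteria("conciseness"),
*     // A custom function evaluator:
*     async ({ run }) => ({
*       key: "nonempty",
*       score: run.outputs?.output ? 1 : 0,
*     }),
*   ],
*   formatEvaluatorInputs: ({ rawInput, rawPrediction, rawReferenceOutput }) => ({
*     input: rawInput.question,
*     prediction: rawPrediction.output,
*     reference: rawReferenceOutput?.answer,
*   }),
* };
* ```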
*/
export type RunEvalConfig<T extends keyof EvaluatorType = keyof EvaluatorType, U extends RunEvaluator | RunEvaluatorLike = RunEvaluator | RunEvaluatorLike> = {
/**
* Evaluators to apply to a dataset run.
* You can specify these either by name or by
* configuring them with an EvalConfig object, as shown below.
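*
* A sketch of both forms ("criteria" and "helpfulness" are built-in names;
* `formatEvaluatorInputs` refers to a formatter defined elsewhere):
* @example
* ```ts
* const evalConfig = {
*   evaluators: [
*     // Referenced by name:
*     "criteria",
*     // Configured with an EvalConfig object:
*     { evaluatorType: "criteria", criteria: "helpfulness", formatEvaluatorInputs },
*   ],
* };
* ```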
*/
evaluators?: RunEvalType<T, U>[];
/**
* Converts the raw evaluation data into the inputs expected by the evaluator.
* The returned values should most commonly be strings.
* Parameters are the raw input from the run, the raw output from the run, the raw reference output from the example, and the raw run itself.
* @example
* ```ts
* // Chain input: { input: "some string" }
* // Chain output: { output: "some output" }
* // Reference example output format: { output: "some reference output" }
* const formatEvaluatorInputs = ({
* rawInput,
* rawPrediction,
* rawReferenceOutput,
* }) => {
* return {
* input: rawInput.input,
* prediction: rawPrediction.output,
* reference: rawReferenceOutput.output,
* };
* };
* ```
* @returns The prepared data.
*/
formatEvaluatorInputs?: EvaluatorInputFormatter;
/**
* Custom evaluators to apply to a dataset run.
* Each evaluator is provided with a run trace containing the model
* outputs, as well as an "example" object representing a record
* in the dataset.
*
* @deprecated Use `evaluators` instead.
*/
customEvaluators?: U[];
};
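/**
 * Configuration for a single off-the-shelf (pre-built) evaluator, identified
 * by its `evaluatorType` along with any evaluator-specific options.
 * See the `Criteria`, `LabeledCriteria`, and `EmbeddingDistance` types below
 * for concrete variants.
 */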
export interface EvalConfig extends LoadEvaluatorOptions {
/**
* The name of the evaluator to use.
* Example: labeled_criteria, criteria, etc.
*/
evaluatorType: keyof EvaluatorType;
/**
* The feedback (or metric) name to use for the logged
* evaluation results. If none is provided, we default to
* the evaluationName.
*/
feedbackKey?: string;
/**
* Converts the raw evaluation data into the inputs expected by the evaluator.
* The returned values should most commonly be strings.
* Parameters are the raw input from the run, the raw output from the run, the raw reference output from the example, and the raw run itself.
* @example
* ```ts
* // Chain input: { input: "some string" }
* // Chain output: { output: "some output" }
* // Reference example output format: { output: "some reference output" }
* const formatEvaluatorInputs = ({
* rawInput,
* rawPrediction,
* rawReferenceOutput,
* }) => {
* return {
* input: rawInput.input,
* prediction: rawPrediction.output,
* reference: rawReferenceOutput.output,
* };
* };
* ```
* @returns The prepared data.
*/
formatEvaluatorInputs: EvaluatorInputFormatter;
}
/**
* Configuration to load a "CriteriaEvalChain" evaluator,
* which prompts an LLM to determine whether the model's
* prediction complies with the provided criteria.
* @param criteria - The criteria to use for the evaluator.
* @param llm - The language model to use for the evaluator.
* @returns The configuration for the evaluator.
* @example
* ```ts
* const evalConfig = {
* evaluators: [Criteria("helpfulness")],
* };
* ```
* @example
* ```ts
* const evalConfig = {
* evaluators: [
* Criteria({
* "isCompliant": "Does the submission comply with the requirements of XYZ"
* })
* ],
* };
* ```
* @example
* ```ts
* const evalConfig = {
* evaluators: [{
* evaluatorType: "criteria",
* criteria: "helpfulness"
* formatEvaluatorInputs: ...
* }]
* };
* ```
* @example
* ```ts
* const evalConfig = {
* evaluators: [{
* evaluatorType: "criteria",
* criteria: { "isCompliant": "Does the submission comply with the requirements of XYZ" },
* formatEvaluatorInputs: ...
* }]
* };
* ```
*/
export type Criteria = EvalConfig & {
evaluatorType: "criteria";
/**
* The "criteria" to insert into the prompt template
* used for evaluation. See the prompt at
* https://smith.langchain.com/hub/langchain-ai/criteria-evaluator
* for more information.
*/
criteria?: CriteriaType | Record<string, string>;
/**
* The language model to use as the evaluator, defaults to GPT-4
*/
llm?: BaseLanguageModel;
};
export type CriteriaEvalChainConfig = Criteria;
export declare function Criteria(criteria: CriteriaType | Record<string, string>, config?: Pick<Omit<Criteria, "evaluatorType">, "formatEvaluatorInputs" | "llm" | "feedbackKey">): EvalConfig;
/**
* Configuration to load a "LabeledCriteriaEvalChain" evaluator,
* which prompts an LLM to determine whether the model's
* prediction complies with the provided criteria and also
* provides a "ground truth" label for the evaluator to incorporate
* in its evaluation.
* @param criteria - The criteria to use for the evaluator.
* @param llm - The language model to use for the evaluator.
* @returns The configuration for the evaluator.
* @example
* ```ts
* const evalConfig = {
* evaluators: [LabeledCriteria("correctness")],
* };
* ```
* @example
* ```ts
* const evalConfig = {
* evaluators: [
* LabeledCriteria({
* "mentionsAllFacts": "Does the include all facts provided in the reference?"
* })
* ],
* };
* ```
* @example
* ```ts
* const evalConfig = {
* evaluators: [{
* evaluatorType: "labeled_criteria",
* criteria: "correctness",
* formatEvaluatorInputs: ...
* }],
* };
* ```
* @example
* ```ts
* const evalConfig = {
* evaluators: [{
* evaluatorType: "labeled_criteria",
* criteria: { "mentionsAllFacts": "Does the include all facts provided in the reference?" },
* formatEvaluatorInputs: ...
* }],
* };
* ```
*/
export type LabeledCriteria = EvalConfig & {
evaluatorType: "labeled_criteria";
/**
* The "criteria" to insert into the prompt template
* used for evaluation. See the prompt at
* https://smith.langchain.com/hub/langchain-ai/labeled-criteria
* for more information.
*/
criteria?: CriteriaType | Record<string, string>;
/**
* The language model to use as the evaluator, defaults to GPT-4
*/
llm?: BaseLanguageModel;
};
export declare function LabeledCriteria(criteria: CriteriaType | Record<string, string>, config?: Pick<Omit<LabeledCriteria, "evaluatorType">, "formatEvaluatorInputs" | "llm" | "feedbackKey">): LabeledCriteria;
/**
* Configuration to load a "EmbeddingDistanceEvalChain" evaluator,
* which embeds distances to score semantic difference between
* a prediction and reference.
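*
* A sketch of both configuration styles (the distance metric shown is
* "cosine"; the formatter is left elided as in the examples above):
* @example
* ```ts
* const evalConfig = {
*   evaluators: [
*     EmbeddingDistance("cosine"),
*     // or as a config object:
*     {
*       evaluatorType: "embedding_distance",
*       distanceMetric: "cosine",
*       formatEvaluatorInputs: ...
*     },
*   ],
* };
* ```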
*/
export type EmbeddingDistance = EvalConfig & EmbeddingDistanceEvalChainInput & {
evaluatorType: "embedding_distance";
};
export declare function EmbeddingDistance(distanceMetric: EmbeddingDistanceEvalChainInput["distanceMetric"], config?: Pick<Omit<EmbeddingDistance, "evaluatorType">, "formatEvaluatorInputs" | "embedding" | "feedbackKey">): EmbeddingDistance;