Evaluating Open Source Models#

For this code to work, please configure the LangSmith environment variables with your credentials.

import os
from getpass import getpass

# Prompt for any required credentials that aren't already set.
keys = ["LANGCHAIN_API_KEY", "FIREWORKS_API_KEY"]
for key in keys:
    if not os.environ.get(key):
        os.environ[key] = getpass(f"Set {key}")

Agent Factory#

For evaluation, we need an agent factory that creates a fresh agent executor instance for each evaluation run.

We'll define a custom AgentFactory for the LangChain benchmarks here; see the intro section to learn how to define your own.

We'll use the Fireworks API for this.

import json
from functools import partial
from typing import Sequence, Tuple, Union

from langchain.agents import AgentExecutor
from langchain_core.agents import AgentAction, AgentFinish
from langchain.output_parsers.json import parse_json_markdown
from langchain.prompts import ChatPromptTemplate
from langchain.tools import tool
from langchain_core.runnables import RunnableLambda

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.schema import BaseTask, RegisteredModel
from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter


@tool
def final_answer(answer: str) -> str:
    """The final answer to the question."""
    return answer
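
Since `final_answer` is a `@tool`-decorated function, it can be invoked directly as a quick sanity check (the answer string here is illustrative):

final_answer.invoke("Paris")
# -> 'Paris'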


def extract_first_json_object(text):
    # A small FSM to extract the first top-level JSON object (across newlines);
    # returns None if no complete object is found.
    OUTSIDE, INSIDE, IN_STRING = range(3)

    state = OUTSIDE
    nested_level = 0
    start_index = None

    def is_escaped(index):
        escape = False
        while index > 0 and text[index - 1] == "\\":
            escape = not escape
            index -= 1
        return escape

    for i, char in enumerate(text):
        if state == OUTSIDE:
            if char == "{":
                state = INSIDE
                nested_level = 1
                start_index = i

        elif state == INSIDE:
            if char == '"' and not is_escaped(i):
                state = IN_STRING
            elif char == "{":
                nested_level += 1
            elif char == "}":
                nested_level -= 1
                if nested_level == 0:
                    return text[start_index : i + 1]

        elif state == IN_STRING:
            if char == '"' and not is_escaped(i):
                state = INSIDE

    return None
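
A couple of hypothetical strings show what the extractor returns:

extract_first_json_object('noise {"a": {"b": 1}} trailing')
# -> '{"a": {"b": 1}}'
extract_first_json_object("no braces here")
# -> None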


def parse(message, prefix: str = "") -> Union[AgentAction, AgentFinish]:
    content = prefix + message.content.replace("\\_", "_")
    # Fall back to the raw content if no standalone JSON object is found.
    extracted = extract_first_json_object(content)
    if extracted is not None:
        content = extracted
    try:
        response = json.loads(content)
    except json.JSONDecodeError:
        response = parse_json_markdown(content)
    if response["action"] == "final_answer":
        return AgentFinish({"output": response["action_input"]}, content)
    else:
        return AgentAction(
            response["action"],
            response.get("action_input", {}),
            content,
        )
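
For instance, `parse` turns a raw model message into an agent action, or into a finish signal when the model calls `final_answer` (the tool name below is illustrative):

from langchain_core.messages import AIMessage

parse(AIMessage(content='{"action": "type_letter", "action_input": {"letter": "a"}}'))
# -> AgentAction(tool='type_letter', tool_input={'letter': 'a'}, log='...')
parse(AIMessage(content='{"action": "final_answer", "action_input": "done"}'))
# -> AgentFinish(return_values={'output': 'done'}, log='...')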


def format_intermediate_steps(
    intermediate_steps: Sequence[Tuple[AgentAction, str]],
) -> str:
    if not intermediate_steps:
        return ""

    response_tmpl = "{action}\n# Returned {observation}"
    serialized = "\n".join(
        [
            response_tmpl.format(
                action=agent_action.log.strip(), observation=observation
            )
            for agent_action, observation in intermediate_steps
        ]
    )
    return f"""
```log.txt
{serialized}
```
Consider previous steps above. What's your next step?
"""


def format_scratchpad(x):
    intermediate_steps = x["intermediate_steps"]
    return format_intermediate_steps(intermediate_steps)
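
To see what the model receives, here is the scratchpad rendered for a single prior step (the tool name and observation are made up):

example_steps = [
    (
        AgentAction(
            "type_letter",
            {"letter": "a"},
            '{"action": "type_letter", "action_input": {"letter": "a"}}',
        ),
        "OK",
    )
]
print(format_intermediate_steps(example_steps))
# Prints the prior action log followed by '# Returned OK' inside a
# ```log.txt block, then asks the model for its next step.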


class AgentFactory:
    def __init__(
        self, task: BaseTask, model: RegisteredModel, num_retries: int = 5
    ) -> None:
        self.task = task
        self.model = model
        self.num_retries = num_retries

    def create_agent_executor(self, env):
        tools = env.tools

        # Expose each tool's description (plus final_answer) in the system prompt.
        tools_str = "\n".join([t.description for t in tools + [final_answer]])
        messages = [
            (
                "system",
                f"Task Instructions: {self.task.instructions}\n\n"
                "The following tools are exposed via an API:\n"
                "{tools}\n\n"
                "Respond with one JSONL line to make your next action and call the API of a single tool."
                """ Format invocations like this:
{{"action": "tool name","action_input": {{TOOL BODY}}}}
\n\nUse the final_answer tool only once you know the correct answer and have called the tools required for the task.""",
            ),
            (
                "user",
                "{input}{agent_scratchpad}\n\nNote: Remember to respond in 1 JSONL line.",
            ),
        ]
        parse_fn = parse
        if self.model.type == "llm":
            # For completion-style models, pre-seed the assistant turn with "{"
            # to coax JSON output, then re-prepend it before parsing.
            messages += [("assistant", "{{")]
            parse_fn = partial(parse_fn, prefix="{")
        prompt = ChatPromptTemplate.from_messages(messages)
        prompt = prompt.partial(tools=tools_str)

        # Stop on the first blank line so the model emits a single action per turn.
        llm = self.model.get_model(model_params={"temperature": 0}).bind(stop=["\n\n"])
        if self.num_retries:
            llm = llm.with_retry(stop_after_attempt=self.num_retries)

        @RunnableLambda
        def empty_fallback(x):
            """Return an empty response to avoid misleading metrics."""
            return {
                "intermediate_steps": [],
                "state": None,
                "output": "ERROR",
            }

        agent = (
            {
                "input": lambda x: x["input"],
                "agent_scratchpad": format_scratchpad,
            }
            | prompt
            | llm
            | parse_fn
        )

        return AgentExecutor(
            agent=agent, tools=tools, return_intermediate_steps=True
        ).with_fallbacks([empty_fallback])

    def __call__(self):
        # This factory creates a new environment for every agent run.
        # The reason is that the environment may be associated with an environment state (e.g., typewriter)
        # which is changed by the actions of the agent.
        # At the end of the run, the environment state will be read.
        env = self.task.create_environment()
        executor = self.create_agent_executor(env)
        # Apply the adapters so that inputs and outputs match dataset schema
        # state_reader automatically adds the state of the environment at the end of the run.
        return apply_agent_executor_adapter(executor, state_reader=env.read_state)
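
Before launching the full benchmark, you can smoke-test the factory on a single task. This is a minimal sketch; the task and model names come from the registries used below, and it assumes the adapted executor accepts the dataset-style "question" input key:

from langchain_benchmarks.model_registration import model_registry

task = registry["Tool Usage - Typewriter (1 tool)"]
model = model_registry["mixtral-8x7b-fw-chat"]
agent = AgentFactory(task, model)()  # fresh executor + environment
agent.invoke({"question": "abc"})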

Evaluation#

Now let's evaluate an agent for each model on every tool usage task.

import uuid

from langsmith.client import Client

from langchain_benchmarks.model_registration import model_registry

experiment_uuid = uuid.uuid4().hex[:4]

client = Client()

task_names = [task.name for task in registry.filter(Type="ToolUsageTask")]
models = ["mixtral-8x7b-fw-chat", "mixtral-8x7b"]

for task_name in task_names:
    for model_name in models:
        print()
        model = model_registry[model_name]
        task = registry[task_name]
        clone_public_dataset(task.dataset_id, dataset_name=task.name)
        eval_config = task.get_eval_config()
        test_run = client.run_on_dataset(
            dataset_name=task.name,
            llm_or_chain_factory=AgentFactory(task, model),
            evaluation=eval_config,
            project_name=f"{model.name}-{experiment_uuid}-{task.name}",
            tags=[model.name],
            project_metadata={"id": experiment_uuid, **model.params},
            verbose=True,
        )
[------->                                          ] 3/20
View the evaluation results for project 'mixtral-8x7b-fw-chat-ece3-Tool Usage - Typewriter (1 tool)' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=2b92de52-2830-40cb-a396-4c08e0bf1c9b

View all tests for Dataset Tool Usage - Typewriter (1 tool) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3
[------------------------------------------------->] 20/20
View the evaluation results for project 'mixtral-8x7b-ece3-Tool Usage - Typewriter (1 tool)' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=ff797831-aee8-43db-a814-7727f9240006

View all tests for Dataset Tool Usage - Typewriter (1 tool) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3
[------------------------------------------------->] 20/20
View the evaluation results for project 'mixtral-8x7b-fw-chat-ece3-Tool Usage - Typewriter (26 tools)' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=1adbc135-93d9-46b2-a33a-e5470eded263

View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[------------------------------------------------->] 20/20
View the evaluation results for project 'mixtral-8x7b-ece3-Tool Usage - Typewriter (26 tools)' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=a8548cef-4afd-4f7e-9d21-7bd2fb3f9033

View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[------------------------------------------------->] 20/20
View the evaluation results for project 'mixtral-8x7b-fw-chat-ece3-Tool Usage - Relational Data' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=685df1fb-605d-40e3-b645-ae132a0a6229

View all tests for Dataset Tool Usage - Relational Data at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826
[------------------------------------------------->] 21/21
View the evaluation results for project 'mixtral-8x7b-ece3-Tool Usage - Relational Data' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=bb4d1ee4-bbc8-4969-a4f0-2b0732444785

View all tests for Dataset Tool Usage - Relational Data at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826
[------------------------------------------------->] 21/21
View the evaluation results for project 'mixtral-8x7b-fw-chat-ece3-Multiverse Math' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=ac7ec5aa-108d-4c5b-9c30-8e954fa132aa

View all tests for Dataset Multiverse Math at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0
[------------------------------------------------->] 10/10
View the evaluation results for project 'mixtral-8x7b-ece3-Multiverse Math' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=9d8573ee-847f-400a-8894-2e77c62e76ab

View all tests for Dataset Multiverse Math at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0
[------------------------------------------------->] 10/10