Benchmarking All Tasks#
Let's benchmark against all of the tool usage tasks.
Expand the tests list to benchmark with different models and agent architectures (a sketch of how to do this follows the model list below).
Note that this requires langsmith>=0.0.72 to run the visualization section at the end.
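If you want to confirm that your environment meets that requirement before running the notebook, a minimal check (assuming langsmith is already installed) might look like this:

# Minimal sketch: confirm the installed langsmith version is recent enough
# for the visualization section (>= 0.0.72).
from importlib.metadata import version

print(version("langsmith"))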
Setup#
Credentials#
First, let's set up the credentials and the models we want to test.
import os
from getpass import getpass

# This is just the default list below
required_env_vars = [
    "LANGCHAIN_API_KEY",
    "ANTHROPIC_API_KEY",
    "OPENAI_API_KEY",
    "MISTRAL_API_KEY",
]
for var in required_env_vars:
    if var not in os.environ:
        os.environ[var] = getpass(f"Provide the required {var}")
Instantiate the models#
from langchain_anthropic import ChatAnthropic
from langchain_core.tools import tool
from langchain_google_vertexai import ChatVertexAI
from langchain_mistralai import ChatMistralAI
from langchain_openai import ChatOpenAI

from langchain_benchmarks.tool_usage.agents import StandardAgentFactory

tests = [
    (
        "gemini-1.0-pro-002",
        ChatVertexAI(model_name="gemini-1.0-pro-002", temperature=0),
    ),
    (
        "gemini-1.5-pro-preview-0409",
        ChatVertexAI(model_name="gemini-1.5-pro-preview-0409", temperature=0),
    ),
    (
        "open-mixtral-8x22b-2404",
        ChatMistralAI(model="open-mixtral-8x22b-2404", temperature=0),
    ),
    ("mistral-large-2402", ChatMistralAI(model="mistral-large-2402", temperature=0)),
    (
        "claude-3-opus-20240229",
        ChatAnthropic(model="claude-3-opus-20240229", temperature=0),
    ),
    (
        "claude-3-haiku-20240307",
        ChatAnthropic(model="claude-3-haiku-20240307", temperature=0),
    ),
    (
        "claude-3-sonnet-20240229",
        ChatAnthropic(model="claude-3-sonnet-20240229", temperature=0),
    ),
    ("gpt-3.5-turbo-0125", ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)),
    (
        "gpt-4-turbo-2024-04-09",
        ChatOpenAI(model="gpt-4-turbo-2024-04-09", temperature=0),
    ),
]
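To benchmark an additional model or agent architecture, append another (name, model) pair to tests. The entry below is only an illustrative placeholder; the model name is an assumption, and any chat model with tool-calling support could be used instead.

# Hypothetical example of extending the benchmark matrix; swap in whichever
# chat model you want to evaluate (the model name here is just a placeholder).
tests.append(
    ("gpt-4o-2024-05-13", ChatOpenAI(model="gpt-4o-2024-05-13", temperature=0))
)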
Set up the experiment#
import datetime

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langsmith.client import Client

from langchain_benchmarks import (
    __version__,
    clone_public_dataset,
    model_registry,
    registry,
)
from langchain_benchmarks.rate_limiting import RateLimiter

# Create prompts for the agents
# Using two prompts because some chat models do not support SystemMessage.
without_system_message_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "human",
            "{instructions}\n{question}",
        ),  # Populated from task.instructions automatically
        MessagesPlaceholder("agent_scratchpad"),  # Workspace for the agent
    ]
)

with_system_message_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "{instructions}"),
        ("human", "{question}"),  # Populated from task.instructions automatically
        MessagesPlaceholder("agent_scratchpad"),  # Workspace for the agent
    ]
)
Generate an experiment ID.
We can tag our runs with this experiment ID and later use it to pull the data back out of LangSmith.
import uuid

experiment_uuid = "sky25"  # Or generate a random one using uuid.uuid4().hex[:4]
# experiment_uuid = uuid.uuid4().hex[:4]
Run#
client = Client()  # Launch langsmith client for cloning datasets
today = datetime.date.today().isoformat()

for task in registry.tasks:
    if task.type != "ToolUsageTask":
        continue

    # This is a small test dataset that can be used to verify
    # that everything is set up correctly prior to running over
    # all results. We may remove it in the future.
    if task.name == "Multiverse Math (Tiny)":
        continue

    dataset_name = task.name + f" ({today})"
    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

    for model_name, model in tests:
        if model_name.startswith("gemini"):
            # google models don't use system prompt
            prompt = without_system_message_prompt
            rate_limiter = RateLimiter(requests_per_second=0.1)
        else:
            prompt = with_system_message_prompt
            rate_limiter = RateLimiter(requests_per_second=1)

        print()
        print(f"Benchmarking {task.name} with model: {model_name}")
        eval_config = task.get_eval_config()

        agent_factory = StandardAgentFactory(
            task, model, prompt, rate_limiter=rate_limiter
        )

        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            project_name=f"{model_name}-{task.name}-{today}-{experiment_uuid}",
            concurrency_level=5,
            project_metadata={
                "model": model_name,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
            },
        )
Inspect#
Note that if the queue is heavily loaded, you may want to wait a while before running the steps below, to make sure all runs are in the database and all statistics have been computed correctly.
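One way to check whether the runs have landed is to list the projects attached to one of the cloned datasets and look at their run counts and end times. This is a small optional sketch that reuses the `today` variable and the LangSmith calls from the run loop above, so it assumes the same notebook session.

# Optional sanity check: list the tracing projects attached to one cloned dataset
# and print how many runs each has and whether it has finished.
from langsmith.client import Client

client = Client()
for project in client.list_projects(reference_dataset_name=f"Multiverse Math ({today})"):
    print(project.name, project.run_count, project.end_time)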
!pip install --quiet -U pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from langsmith.client import Client
Let's fetch all of the data that shares the same experiment ID and put it into a dataframe.
experiment_ids = ["sky25"]
dataset_names = [
    "Tool Usage - Typewriter (1 tool)",
    "Tool Usage - Typewriter (26 tools)",
    "Tool Usage - Relational Data",
    "Multiverse Math",
]

client = Client()
projects = []

for dataset_name in dataset_names:
    dataset_name_ = dataset_name + f" ({today})"
    for project in client.list_projects(reference_dataset_name=dataset_name_):
        if (
            project.metadata.get("id") in experiment_ids
            and project.end_time is not None
        ):
            projects.append(project)

dfs = []
keys = set()

for project in projects:
    # Temporary way to get tag information
    try:
        test_results = client.get_test_results(project_name=project.name)
    except Exception as e:
        print(e, project.run_count)
        continue

    for k, v in project.metadata.items():
        test_results[k] = v

    keys.update(test_results.columns)
    dfs.append(test_results)

for df in dfs:
    missing = list(keys - set(df.columns))
    for key in missing:
        df[key] = None

df = pd.concat(dfs)
Compute a standardized "correct" column. It uses "correct final state" for the tool usage (Typewriter) tasks and "correctness (based on output)" for the other tasks.
correct = []
for r in df.to_dict(orient="records"):
    if "Typewriter" in r["task"]:
        correct.append(r["feedback.correct final state"])
    else:
        correct.append(r["feedback.correctness"])

df["correct"] = correct
df["correct"] = df["correct"].fillna(0)
Compute some statistics. We use the standard error of the estimated mean, assuming a Bernoulli process.
num_correct = df.groupby(["model", "task"])["correct"].sum().to_frame("num_correct")
total = df.groupby(["task", "model"]).size().to_frame("total")
stats_df = total.join(num_correct)
stats_df["% correct"] = stats_df["num_correct"] / stats_df["total"]
stats_df["error"] = np.sqrt(
stats_df["% correct"] * (1 - stats_df["% correct"]) / stats_df["total"]
)
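As a quick sanity check on the error formula (a toy example with made-up counts, not benchmark data): with 15 correct answers out of 20, p = 0.75 and the standard error is sqrt(p * (1 - p) / n) ≈ 0.097.

# Toy illustration of the Bernoulli standard-error formula used above
# (the counts are made up, not taken from the benchmark).
import numpy as np

p, n = 15 / 20, 20
print(p, np.sqrt(p * (1 - p) / n))  # 0.75 0.0968...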
tasks = [
    "Tool Usage - Typewriter (1 tool)",
    "Tool Usage - Typewriter (26 tools)",
    "Multiverse Math",
    "Tool Usage - Relational Data",
]
stats_df = stats_df.reset_index()
models = stats_df["model"].unique()
models
array(['claude-3-haiku-20240307', 'claude-3-opus-20240229',
'claude-3-sonnet-20240229', 'gemini-1.0-pro-002',
'gemini-1.5-pro-preview-0409', 'gpt-3.5-turbo-0125',
'gpt-4-turbo-2024-04-09', 'mistral-large-2402',
'open-mixtral-8x22b-2404'], dtype=object)
Plot the results.
from itertools import product

x = np.arange(len(tasks))  # the label locations
width = 0.06  # the width of the bars
multiplier = 1.1

fig, ax = plt.subplots(layout="constrained", figsize=(20, 4))
colormap = plt.get_cmap("Set3").colors
idx = 0

for model in models:
    try:
        results = stats_df.set_index("model").loc[model]
    except KeyError:
        # Skip models that have no results in the dataframe
        continue
    if len(results) == 0:
        continue
    color = colormap[idx]
    idx += 1
    results = results.set_index("task").loc[tasks]
    measurement = results["% correct"]
    values = [round(m, 2) for m in measurement]
    offset = width * multiplier * 1.4
    rects = ax.bar(
        x + offset,
        values,
        width,
        label=f"{model}",
        yerr=results["error"],
        color=color,
    )
    ax.bar_label(rects, padding=3)
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("% Questions Answered Correctly")
ax.set_title("Tool Usage Performance")
ax.set_xticks(x + width + 0.3, tasks)
ax.legend(
    loc="center left", ncols=1, bbox_to_anchor=(1.0, 0.5), frameon=False, title="Model"
)
ax.set_ylim(0, 1.10)

plt.savefig("overall_perf.png", dpi=300, bbox_inches="tight")
plt.show()