Benchmarking All Tasks#
Let's benchmark against all of the tool usage tasks.
Expand the tests list to benchmark with different models and agent architectures (a sketch of how to do this follows the model list below).
Note that this requires langsmith>=0.0.72 to run the visualization section at the end.
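If you want to confirm that your environment meets that requirement before running the notebook, a minimal check (assuming langsmith is already installed) might look like this:

# Minimal sketch: confirm the installed langsmith version is recent enough
# for the visualization section (>= 0.0.72).
from importlib.metadata import version

print(version("langsmith"))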
Setup#
Credentials#
First, let's set up the credentials and the models we want to test.
import os
from getpass import getpass

# This is just the default list below
required_env_vars = [
    "LANGCHAIN_API_KEY",
    "ANTHROPIC_API_KEY",
    "OPENAI_API_KEY",
    "MISTRAL_API_KEY",
]
for var in required_env_vars:
    if var not in os.environ:
        os.environ[var] = getpass(f"Provide the required {var}")
Instantiate the models#
from langchain_anthropic import ChatAnthropic
from langchain_core.tools import tool
from langchain_google_vertexai import ChatVertexAI
from langchain_mistralai import ChatMistralAI
from langchain_openai import ChatOpenAI

from langchain_benchmarks.tool_usage.agents import StandardAgentFactory

tests = [
    (
        "gemini-1.0-pro-002",
        ChatVertexAI(model_name="gemini-1.0-pro-002", temperature=0),
    ),
    (
        "gemini-1.5-pro-preview-0409",
        ChatVertexAI(model_name="gemini-1.5-pro-preview-0409", temperature=0),
    ),
    (
        "open-mixtral-8x22b-2404",
        ChatMistralAI(model="open-mixtral-8x22b-2404", temperature=0),
    ),
    ("mistral-large-2402", ChatMistralAI(model="mistral-large-2402", temperature=0)),
    (
        "claude-3-opus-20240229",
        ChatAnthropic(model="claude-3-opus-20240229", temperature=0),
    ),
    (
        "claude-3-haiku-20240307",
        ChatAnthropic(model="claude-3-haiku-20240307", temperature=0),
    ),
    (
        "claude-3-sonnet-20240229",
        ChatAnthropic(model="claude-3-sonnet-20240229", temperature=0),
    ),
    ("gpt-3.5-turbo-0125", ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)),
    (
        "gpt-4-turbo-2024-04-09",
        ChatOpenAI(model="gpt-4-turbo-2024-04-09", temperature=0),
    ),
]
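To benchmark an additional model or agent architecture, append another (name, model) pair to tests. The entry below is only an illustrative placeholder; the model name is an assumption, and any chat model with tool-calling support could be used instead.

# Hypothetical example of extending the benchmark matrix; swap in whichever
# chat model you want to evaluate (the model name here is just a placeholder).
tests.append(
    ("gpt-4o-2024-05-13", ChatOpenAI(model="gpt-4o-2024-05-13", temperature=0))
)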
Set up the experiment#
import datetime

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langsmith.client import Client

from langchain_benchmarks import (
    __version__,
    clone_public_dataset,
    model_registry,
    registry,
)
from langchain_benchmarks.rate_limiting import RateLimiter

# Create prompts for the agents
# Using two prompts because some chat models do not support SystemMessage.
without_system_message_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "human",
            "{instructions}\n{question}",
        ),  # Populated from task.instructions automatically
        MessagesPlaceholder("agent_scratchpad"),  # Workspace for the agent
    ]
)

with_system_message_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "{instructions}"),
        ("human", "{question}"),  # Populated from task.instructions automatically
        MessagesPlaceholder("agent_scratchpad"),  # Workspace for the agent
    ]
)
Generate an experiment ID.
We can tag our runs with this experiment ID and later use it to pull the data back out of LangSmith.
import uuid

experiment_uuid = "sky25"  # Or generate a random one using uuid.uuid4().hex[:4]
# experiment_uuid = uuid.uuid4().hex[:4]
Run#
client = Client()  # Launch langsmith client for cloning datasets
today = datetime.date.today().isoformat()

for task in registry.tasks:
    if task.type != "ToolUsageTask":
        continue

    # This is a small test dataset that can be used to verify
    # that everything is set up correctly prior to running over
    # all results. We may remove it in the future.
    if task.name == "Multiverse Math (Tiny)":
        continue

    dataset_name = task.name + f" ({today})"
    clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

    for model_name, model in tests:
        if model_name.startswith("gemini"):
            # google models don't use system prompt
            prompt = without_system_message_prompt
            rate_limiter = RateLimiter(requests_per_second=0.1)
        else:
            prompt = with_system_message_prompt
            rate_limiter = RateLimiter(requests_per_second=1)

        print()
        print(f"Benchmarking {task.name} with model: {model_name}")
        eval_config = task.get_eval_config()

        agent_factory = StandardAgentFactory(
            task, model, prompt, rate_limiter=rate_limiter
        )

        client.run_on_dataset(
            dataset_name=dataset_name,
            llm_or_chain_factory=agent_factory,
            evaluation=eval_config,
            verbose=False,
            project_name=f"{model_name}-{task.name}-{today}-{experiment_uuid}",
            concurrency_level=5,
            project_metadata={
                "model": model_name,
                "id": experiment_uuid,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
            },
        )
Inspect#
Note that if the queue is heavily loaded, you may want to wait a while before running the steps below, to make sure all runs are in the database and all statistics have been computed correctly.
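One way to check whether the runs have landed is to list the projects attached to one of the cloned datasets and look at their run counts and end times. This is a small optional sketch that reuses the `today` variable and the LangSmith calls from the run loop above, so it assumes the same notebook session.

# Optional sanity check: list the tracing projects attached to one cloned dataset
# and print how many runs each has and whether it has finished.
from langsmith.client import Client

client = Client()
for project in client.list_projects(reference_dataset_name=f"Multiverse Math ({today})"):
    print(project.name, project.run_count, project.end_time)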
!pip install --quiet -U pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from langsmith.client import Client
Let's fetch all of the data that shares the same experiment ID and put it into a dataframe.
experiment_ids = ["sky25"]
dataset_names = [
    "Tool Usage - Typewriter (1 tool)",
    "Tool Usage - Typewriter (26 tools)",
    "Tool Usage - Relational Data",
    "Multiverse Math",
]

client = Client()
projects = []

for dataset_name in dataset_names:
    dataset_name_ = dataset_name + f" ({today})"
    for project in client.list_projects(reference_dataset_name=dataset_name_):
        if (
            project.metadata.get("id") in experiment_ids
            and project.end_time is not None
        ):
            projects.append(project)

dfs = []
keys = set()

for project in projects:
    # Temporary way to get tag information
    try:
        test_results = client.get_test_results(project_name=project.name)
    except Exception as e:
        print(e, project.run_count)
        continue

    for k, v in project.metadata.items():
        test_results[k] = v

    keys.update(test_results.columns)
    dfs.append(test_results)

for df in dfs:
    missing = list(keys - set(df.columns))
    for key in missing:
        df[key] = None

df = pd.concat(dfs)
Compute a standardized "correct" column. It uses "correct final state" for the tool usage (Typewriter) tasks and "correctness (based on output)" for the other tasks.
correct = []
for r in df.to_dict(orient="records"):
    if "Typewriter" in r["task"]:
        correct.append(r["feedback.correct final state"])
    else:
        correct.append(r["feedback.correctness"])

df["correct"] = correct
df["correct"] = df["correct"].fillna(0)
Compute some statistics. We use the standard error of the estimated mean, assuming a Bernoulli process.
num_correct = df.groupby(["model", "task"])["correct"].sum().to_frame("num_correct")
total = df.groupby(["task", "model"]).size().to_frame("total")
stats_df = total.join(num_correct)
stats_df["% correct"] = stats_df["num_correct"] / stats_df["total"]
stats_df["error"] = np.sqrt(
stats_df["% correct"] * (1 - stats_df["% correct"]) / stats_df["total"]
)
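As a quick sanity check on the error formula (a toy example with made-up counts, not benchmark data): with 15 correct answers out of 20, p = 0.75 and the standard error is sqrt(p * (1 - p) / n) ≈ 0.097.

# Toy illustration of the Bernoulli standard-error formula used above
# (the counts are made up, not taken from the benchmark).
import numpy as np

p, n = 15 / 20, 20
print(p, np.sqrt(p * (1 - p) / n))  # 0.75 0.0968...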
tasks = [
    "Tool Usage - Typewriter (1 tool)",
    "Tool Usage - Typewriter (26 tools)",
    "Multiverse Math",
    "Tool Usage - Relational Data",
]
stats_df = stats_df.reset_index()
models = stats_df["model"].unique()
models
array(['claude-3-haiku-20240307', 'claude-3-opus-20240229',
'claude-3-sonnet-20240229', 'gemini-1.0-pro-002',
'gemini-1.5-pro-preview-0409', 'gpt-3.5-turbo-0125',
'gpt-4-turbo-2024-04-09', 'mistral-large-2402',
'open-mixtral-8x22b-2404'], dtype=object)
Plot the results.
from itertools import product

x = np.arange(len(tasks))  # the label locations
width = 0.06  # the width of the bars
multiplier = 1.1

fig, ax = plt.subplots(layout="constrained", figsize=(20, 4))
colormap = plt.get_cmap("Set3").colors
idx = 0

for model in models:
    try:
        results = stats_df.set_index("model").loc[model]
    except KeyError:
        # Skip models that have no results in the dataframe
        continue
    if len(results) == 0:
        continue
    color = colormap[idx]
    idx += 1
    results = results.set_index("task").loc[tasks]
    measurement = results["% correct"]
    values = [round(m, 2) for m in measurement]
    offset = width * multiplier * 1.4
    rects = ax.bar(
        x + offset,
        values,
        width,
        label=f"{model}",
        yerr=results["error"],
        color=color,
    )
    ax.bar_label(rects, padding=3)
    multiplier += 1

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("% Questions Answered Correctly")
ax.set_title("Tool Usage Performance")
ax.set_xticks(x + width + 0.3, tasks)
ax.legend(
    loc="center left", ncols=1, bbox_to_anchor=(1.0, 0.5), frameon=False, title="Model"
)
ax.set_ylim(0, 1.10)

plt.savefig("overall_perf.png", dpi=300, bbox_inches="tight")
plt.show()