Evaluating Open-Source Models#
For this code to work, configure the LangSmith environment variables with your credentials. You will also need a Fireworks API key, since the models below are served through the Fireworks API.
import getpass
import os

# os.environ["LANGCHAIN_API_KEY"] = "ls_.."  # Set your LangSmith API key directly, or
# let the prompt below collect any keys missing from the environment.
keys = ["LANGCHAIN_API_KEY", "FIREWORKS_API_KEY"]
for key in keys:
    if not os.environ.get(key):
        os.environ[key] = getpass.getpass(f"Set {key}")
Agent Factory#
For evaluation, we need an agent factory that creates a new agent executor instance for each evaluation run.
We will use a custom AgentFactory with the LangChain benchmarks; see the intro section for how to define your own.
We will use the Fireworks API to serve the models.
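For orientation, here is a minimal, illustrative sketch of the contract a factory has to satisfy: any zero-argument callable that returns a fresh runnable on each call can be passed as llm_or_chain_factory to run_on_dataset. The class below is a toy stand-in, not part of the benchmark.
# Toy sketch of the factory contract (illustrative only): a zero-argument callable
# that returns a fresh runnable each time it is called.
from langchain_core.runnables import RunnableLambda


class EchoFactory:
    def __call__(self):
        # A new runnable per call, so no state leaks between dataset examples.
        return RunnableLambda(lambda inputs: {"output": str(inputs)})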
import json
from functools import partial
from typing import Sequence, Tuple

from langchain.agents import AgentExecutor
from langchain.output_parsers.json import parse_json_markdown
from langchain.prompts import ChatPromptTemplate
from langchain.tools import tool
from langchain_core.agents import AgentAction, AgentFinish
from langchain_core.runnables import RunnableLambda

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.schema import BaseTask, RegisteredModel
from langchain_benchmarks.tool_usage.agents import apply_agent_executor_adapter
@tool
def final_answer(answer: str) -> str:
    """The final answer to the question."""
    return answer
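As a quick, illustrative check: the tool decorator derives the tool name from the function name and the description from the docstring (the exact description text varies by langchain version); the description is what gets joined into the tool listing in the prompt further down.
# Illustrative check; the exact description format depends on the langchain version.
print(final_answer.name)         # final_answer
print(final_answer.description)  # docstring-derived text used in the prompt below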
def extract_first_json_object(text):
    # A hacky FSM to get the first JSON object across newlines
    OUTSIDE, INSIDE, IN_STRING = range(3)
    state = OUTSIDE
    nested_level = 0
    start_index = None

    def is_escaped(index):
        # Count the backslashes immediately preceding this index.
        escape = False
        while index > 0 and text[index - 1] == "\\":
            escape = not escape
            index -= 1
        return escape

    for i, char in enumerate(text):
        if state == OUTSIDE:
            if char == "{":
                state = INSIDE
                nested_level = 1
                start_index = i
        elif state == INSIDE:
            if char == '"' and not is_escaped(i):
                state = IN_STRING
            elif char == "{":
                nested_level += 1
            elif char == "}":
                nested_level -= 1
                if nested_level == 0:
                    return text[start_index : i + 1]
        elif state == IN_STRING:
            if char == '"' and not is_escaped(i):
                state = INSIDE
    return None
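A quick sanity check of the extractor on the kind of noisy output a model might produce (the sample string below is made up):
# Made-up sample with prose before and after the first JSON object.
sample = 'Sure! {"action": "type_letter", "action_input": {"letter": "a"}} trailing text'
print(extract_first_json_object(sample))
# -> {"action": "type_letter", "action_input": {"letter": "a"}}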
def parse(message, prefix: str = "") -> dict:
    # Mixtral tends to escape underscores in tool names; undo that before parsing.
    content = prefix + message.content.replace("\\_", "_")
    content = extract_first_json_object(content)
    try:
        response = json.loads(content)
    except json.JSONDecodeError:
        response = parse_json_markdown(content)
    if response["action"] == "final_answer":
        return AgentFinish({"output": response["action_input"]}, content)
    else:
        return AgentAction(
            response["action"],
            response.get("action_input", {}),
            content,
        )
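And a quick illustration of the parser on a hand-written message (AIMessage is imported here only for the example):
# Hand-written example message, not produced by a real model call.
from langchain_core.messages import AIMessage

example = AIMessage(content='{"action": "final_answer", "action_input": "Paris"}')
print(parse(example))
# -> AgentFinish(return_values={'output': 'Paris'}, log='{"action": "final_answer", "action_input": "Paris"}')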
def format_intermediate_steps(
    intermediate_steps: Sequence[Tuple[AgentAction, str]],
) -> str:
    if not intermediate_steps:
        return ""
    # Alternative template: "{action}\n{{\"response\": \"{observation}\"}}"
    response_tmpl = "{action}\n# Returned {observation}"
    serialized = "\n".join(
        [
            response_tmpl.format(
                action=agent_action.log.strip(), observation=observation
            )
            for agent_action, observation in intermediate_steps
        ]
    )
    return f"""
```log.txt
{serialized}
```
Consider previous steps above. What's your next step?
"""


def format_scratchpad(x):
    intermediate_steps = x["intermediate_steps"]
    return format_intermediate_steps(intermediate_steps)
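For reference, this is roughly what the scratchpad looks like after a single step (the action and observation below are made up):
# Made-up single step to show the scratchpad format fed back to the model.
fake_step = (
    AgentAction(
        tool="type_letter",
        tool_input={"letter": "a"},
        log='{"action": "type_letter", "action_input": {"letter": "a"}}',
    ),
    "OK",
)
print(format_scratchpad({"intermediate_steps": [fake_step]}))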
class AgentFactory:
    def __init__(
        self, task: BaseTask, model: RegisteredModel, num_retries: int = 5
    ) -> None:
        self.task = task
        self.model = model
        self.num_retries = num_retries

    def create_this_ugly_thing(self, env):
        tools = env.tools
        # Alternative: pass full OpenAI-style function schemas instead of plain descriptions.
        # schemas = []
        # for tool in tools + [final_answer]:
        #     function_def = convert_to_openai_function(tool.args_schema)
        #     function_def["name"] = tool.name
        #     schemas.append(function_def)
        # tools_str = "\n".join([json.dumps(sc) for sc in schemas])
        tools_str = "\n".join([tool.description for tool in tools + [final_answer]])
        messages = [
            (
                "system",
                f"Task Instructions: {self.task.instructions}\n\n"
                "The following tools are exposed via an API:\n"
                "{tools}\n\n"
                "Respond with one JSONL line to make your next action and call the API of a single tool."
                """ Format invocations like this:
{{"action": "tool name","action_input": {{TOOL BODY}}}}
\n\nUse the final_answer tool only once you know the correct answer and have called the tools required for the task.""",
            ),
            (
                "user",
                "{input}{agent_scratchpad}\n\nNote: Remember to respond in 1 JSONL line.",
            ),
        ]
        parse_fn = parse
        if self.model.type == "llm":
            # Completion models get a leading "{" to force JSON output;
            # prepend it back before parsing.
            messages += [("assistant", "{{")]
            parse_fn = partial(parse_fn, prefix="{")
        prompt = ChatPromptTemplate.from_messages(messages)
        prompt = prompt.partial(tools=tools_str)
        # Stop at a blank line so the model emits a single action per turn.
        llm = self.model.get_model(model_params={"temperature": 0}).bind(stop=["\n\n"])
        if self.num_retries:
            llm = llm.with_retry(stop_after_attempt=self.num_retries)

        @RunnableLambda
        def empty_fallback(x):
            """Return an empty response to avoid misleading metrics."""
            return {
                "intermediate_steps": [],
                "state": None,
                "output": "ERROR",
            }

        agent = (
            {
                "input": lambda x: x["input"],
                "agent_scratchpad": format_scratchpad,
            }
            | prompt
            | llm
            | parse_fn
        )
        return AgentExecutor(
            agent=agent, tools=tools, return_intermediate_steps=True
        ).with_fallbacks([empty_fallback])

    def __call__(self):
        # The factory creates a new environment for every agent run because the
        # environment may carry state (e.g., the typewriter) that is changed by
        # the agent's actions; that state is read back at the end of the run.
        env = self.task.create_environment()
        executor = self.create_this_ugly_thing(env)
        # Apply the adapter so that inputs and outputs match the dataset schema;
        # state_reader records the environment state at the end of the run.
        return apply_agent_executor_adapter(executor, state_reader=env.read_state)
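Before launching the full sweep, it can help to smoke-test a single factory instance on one input. The task and model names below come from the run further down; the "question" input key is an assumption based on the tool-usage dataset schema.
# Illustrative smoke test; requires the Fireworks and LangSmith keys set above.
from langchain_benchmarks.model_registration import model_registry

smoke_task = registry["Tool Usage - Typewriter (1 tool)"]
smoke_model = model_registry["mixtral-8x7b-fw-chat"]
smoke_agent = AgentFactory(smoke_task, smoke_model)()
print(smoke_agent.invoke({"question": "abc"}))  # the "question" key is assumed here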
Evaluation#
Now let's evaluate the agents.
import uuid

from langsmith.client import Client

from langchain_benchmarks.model_registration import model_registry

experiment_uuid = uuid.uuid4().hex[:4]

client = Client()
task_names = [task.name for task in registry.filter(Type="ToolUsageTask")]
models = ["mixtral-8x7b-fw-chat", "mixtral-8x7b"]
for task_name in task_names:
    for model_name in models:
        print()
        model = model_registry[model_name]
        task = registry[task_name]
        clone_public_dataset(task.dataset_id, dataset_name=task.name)
        eval_config = task.get_eval_config()
        test_run = client.run_on_dataset(
            dataset_name=task.name,
            llm_or_chain_factory=AgentFactory(task, model),
            evaluation=eval_config,
            project_name=f"{model.name}-{experiment_uuid}-{task.name}",
            tags=[model.name],
            project_metadata={"id": experiment_uuid, **model.params},
            verbose=True,
        )
View the evaluation results for project 'mixtral-8x7b-fw-chat-ece3-Tool Usage - Typewriter (1 tool)' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=2b92de52-2830-40cb-a396-4c08e0bf1c9b
View all tests for Dataset Tool Usage - Typewriter (1 tool) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3
[------------------------------------------------->] 20/20
View the evaluation results for project 'mixtral-8x7b-ece3-Tool Usage - Typewriter (1 tool)' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3/compare?selectedSessions=ff797831-aee8-43db-a814-7727f9240006
View all tests for Dataset Tool Usage - Typewriter (1 tool) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/82ca6840-cf23-4bb0-a9be-55237ebbe9d3
[------------------------------------------------->] 20/20
View the evaluation results for project 'mixtral-8x7b-fw-chat-ece3-Tool Usage - Typewriter (26 tools)' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=1adbc135-93d9-46b2-a33a-e5470eded263
View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[------------------------------------------------->] 20/20
View the evaluation results for project 'mixtral-8x7b-ece3-Tool Usage - Typewriter (26 tools)' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478/compare?selectedSessions=a8548cef-4afd-4f7e-9d21-7bd2fb3f9033
View all tests for Dataset Tool Usage - Typewriter (26 tools) at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/2f462c7a-f9b9-46e7-b96b-7469e965f478
[------------------------------------------------->] 20/20
View the evaluation results for project 'mixtral-8x7b-fw-chat-ece3-Tool Usage - Relational Data' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=685df1fb-605d-40e3-b645-ae132a0a6229
View all tests for Dataset Tool Usage - Relational Data at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826
[------------------------------------------------->] 21/21
View the evaluation results for project 'mixtral-8x7b-ece3-Tool Usage - Relational Data' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826/compare?selectedSessions=bb4d1ee4-bbc8-4969-a4f0-2b0732444785
View all tests for Dataset Tool Usage - Relational Data at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/df6be6c9-05b3-445e-8836-ebb4aba63826
[------------------------------------------------->] 21/21
View the evaluation results for project 'mixtral-8x7b-fw-chat-ece3-Multiverse Math' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=ac7ec5aa-108d-4c5b-9c30-8e954fa132aa
View all tests for Dataset Multiverse Math at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0
[------------------------------------------------->] 10/10
View the evaluation results for project 'mixtral-8x7b-ece3-Multiverse Math' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0/compare?selectedSessions=9d8573ee-847f-400a-8894-2e77c62e76ab
View all tests for Dataset Multiverse Math at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/108bdc68-1808-4b60-92ef-fbd9bd7e1ad0
[------------------------------------------------->] 10/10