半结构化评估:多向量#
Semi-structured Reports 是一个公共数据集,其中的问答对基于同时包含文本和表格的文档构建。
问答对源于表格以及文档中的一些段落。
我们使用多向量检索器来评估 RAG 的性能。
先决条件#
# %pip install -U langchain langsmith langchain_benchmarks
# %pip install --quiet chromadb openai pypdf tiktoken
import getpass
import os
# Point the LangSmith client at the hosted API endpoint.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

# Ask interactively for any required API key that is not already set.
env_vars = ["LANGCHAIN_API_KEY", "OPENAI_API_KEY"]
for var in env_vars:
    if var in os.environ:
        continue
    os.environ[var] = getpass.getpass(prompt=f"Enter your {var}: ")
数据集#
从远程缓存获取与数据集关联的 PDF,以便我们可以执行摄取。
import os
from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names
# Benchmark task, looked up by name in the langchain_benchmarks registry
task = registry["Semi-structured Reports"]

# Files used by the task: keep the path objects and a plain-string copy
# (the string form is what the PDF loader below is given).
paths = list(get_file_names())
files = list(map(str, paths))
克隆数据集,以便它在我们的 LangSmith 数据集中可用。
# Clone the task's public dataset into our LangSmith datasets under the task's name
clone_public_dataset(task.dataset_id, dataset_name=task.name)
加载和索引#
我们构建了一个专注于表格的检索器。
为此,我们使用 LLM 扫描每个页面并总结页面中的任何表格。
然后我们索引这些摘要以供检索,并使用多向量检索器存储包含表格的原始页面文本。
最后,我们使用集成检索器将检索到的表格块与原始文本块混合。
集成检索器(EnsembleRetriever)将不同检索器的排名组合成一个统一的排名。
每个检索器都提供一个基于与查询的相关性进行排名的文档(或搜索结果)列表。
权重表示您赋予每个检索器结果的相对重要性或信任度。
权重用于缩放每个检索器对最终组合排名的贡献。
RRF(Reciprocal Rank Fusion,倒数排名融合)方法使用检索器提供的列表中每个项目的排名。
基本思想是给在列表中排名更高(即排名数字更低)的项目赋予更高的分数。
import uuid
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
def prepare_documents(docs):
    """
    Prepare documents for the prompt.

    Plain strings are used as-is; any other item is treated as a
    Document-like object and contributes its ``page_content``. Duck-typing
    on ``page_content`` (rather than checking for one concrete Document
    class) keeps this helper usable with any retriever output that exposes
    that attribute.

    :param docs: An iterable of str or Document-like objects.
    :return: A single string with all contents joined by two newlines
        (an empty string when ``docs`` is empty).
    """
    processed_docs = [
        doc if isinstance(doc, str) else doc.page_content for doc in docs
    ]
    return "\n\n".join(processed_docs)
def create_multi_vector_retriever(vectorstore, text_summaries, texts):
    """
    Create a retriever that indexes summaries but returns the raw texts.

    Each summary is added to ``vectorstore`` tagged with a freshly generated
    id (metadata key ``"doc_id"``); the raw text at the same position is
    stored in an in-memory docstore under that same id.

    :param vectorstore: Vector store that will hold the summary documents.
    :param text_summaries: List of summary strings, one per entry in ``texts``.
    :param texts: List of raw texts, aligned by position with ``text_summaries``.
    :return: The populated MultiVectorRetriever.
    :raises ValueError: If ``text_summaries`` and ``texts`` differ in length.
    """
    # Summaries and raw texts are paired by position; a mismatch would
    # otherwise either fail with an opaque IndexError or silently leave
    # raw texts in the docstore that no summary points to.
    if len(text_summaries) != len(texts):
        raise ValueError(
            f"text_summaries and texts must have the same length "
            f"(got {len(text_summaries)} and {len(texts)})"
        )

    id_key = "doc_id"
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=InMemoryStore(),
        id_key=id_key,
    )

    # One shared id links each summary (vectorstore) to its raw text (docstore)
    doc_ids = [str(uuid.uuid4()) for _ in texts]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, summary in zip(doc_ids, text_summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(doc_ids, texts)))
    return retriever
def generate_doc_summary(file):
    """
    Create a document-level summary for a PDF file.

    Loads every page of the PDF, joins the page texts with single spaces and
    asks the chat model for the company the document focuses on and the
    units used by its tables.

    :param file: Path of the PDF file to summarize.
    :return: The model's answer as a string.
    """
    # Prompt
    prompt_text = (
        "You are an assistant tasked extracting two attributes "
        "from financial documents. (1) Tell me the company that the document is "
        "focused on. (2) Look at any tables in the document and tell me the units "
        "of the table. Many table will have '(In thousands)' or '(in millions)' prior "
        "to the table text. Provide these two for the document: \n\n {document} "
    )
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain: the whole chain input becomes the prompt's `document`
    model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
    summarize_chain = {"document": lambda x: x} | prompt | model | StrOutputParser()

    # Load doc
    pdf_pages = PyPDFLoader(file).load()
    text_string = " ".join(page.page_content for page in pdf_pages)

    # Invoke with the raw text. The first chain step already maps its input
    # onto the "document" key, so wrapping the text in {"document": ...} here
    # would render the dict's repr into the prompt instead of the text.
    return summarize_chain.invoke(text_string)
def generate_table_summaries(texts):
    """
    Summarize the tables found in each text element.

    Every element is sent to the chat model with a prompt asking for a brief
    table summary (row and column names, no quantitative results), or the
    literal answer "No table" when the element contains none.

    :param texts: List of str, one entry per text element.
    :return: List of summary strings produced by the model, one per input.
    """
    # Prompt
    prompt_text = (
        "You are an assistant tasked with summarizing tables within a provided text chunk. "
        "If the text chunk contains tables, then give a brief summary of the table and list the row and column "
        "names to identify what is captured in the table. Do not sumnmarize quantitative results in the table. "
        'If there is no table present, then just return "No table". \n\n Text: {element} '
    )
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain: each input string becomes the prompt's `element`
    model = ChatOpenAI(temperature=0, model="gpt-4")
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Summarize all elements, at most 5 requests in flight at a time
    return summarize_chain.batch(texts, {"max_concurrency": 5})
def load_and_split(file, token_count, split_document=True):
    """
    Load a PDF and return its text, either per page or per token-sized chunk.

    Args:
        file (str): Path of the PDF file.
        token_count (int): Chunk size in tokens; only used when splitting.
        split_document (bool): When true, re-split the pages into chunks of
            ``token_count`` tokens with an overlap of 50; otherwise return
            one text per page.

    Returns:
        list: The text content of every resulting element.
    """
    pages = PyPDFLoader(file).load()

    if not split_document:
        # Keep the loader's page granularity
        elements = pages
    else:
        splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=token_count, chunk_overlap=50
        )
        elements = splitter.split_documents(pages)

    texts = [element.page_content for element in elements]
    print(f"There are {len(texts)} text elements")
    return texts
def load_files(files, token_count, split_document):
    """
    Load files, collecting their text elements and a document summary.

    Args:
        files (list): List of file names.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting documents.

    Returns:
        tuple: ``(texts, doc_summary)`` where ``texts`` holds the text
        elements of all files in order and ``doc_summary`` is the summary of
        the LAST file processed (an empty string when ``files`` is empty).
    """
    texts = []
    # Default for an empty file list, so the return below never hits an
    # unbound local.
    doc_summary = ""
    for fi in files:
        # NOTE(review): the summary is overwritten on every iteration, so
        # with several files only the last one's summary is returned.
        doc_summary = generate_doc_summary(fi)
        texts.extend(load_and_split(fi, token_count, split_document))
    return texts, doc_summary
def rag_chain(retriever):
    """
    Build the question-answering RAG chain on top of a retriever.

    Args:
        retriever: The retriever supplying context for each question.

    Returns:
        A runnable that takes a question and returns the model's answer text.
    """
    # Prompt: retrieved context first, then the user's question
    template = (
        "Answer the question based only on the following context, "
        "which can include text and tables:\n"
        "{context}\n"
        "Question: {question}\n"
    )
    prompt = ChatPromptTemplate.from_template(template)

    # LLM
    llm = ChatOpenAI(temperature=0, model="gpt-4")

    # The incoming question goes to the retriever (whose results are
    # flattened into one string) and is also passed through unchanged.
    context_builder = retriever | RunnableLambda(prepare_documents)
    inputs = {
        "context": context_builder,
        "question": RunnablePassthrough(),
    }

    return inputs | prompt | llm | StrOutputParser()
# Experiment configurations: (token_count, split_document, experiment name)
experiments = [
    (None, False, "page_split_multivector"),
]

# Build one RAG chain per experiment, keyed by experiment name
stor_chain = {}
for token_count, split_document, expt in experiments:
    doc_texts = []
    doc_text_summaries = []

    # Process the files one at a time so that every table summary is
    # prefixed with the summary of the document it actually came from
    # (a single shared summary would label all tables with one document).
    for fi in files:
        # Get doc summary and texts for this file
        doc_summary = generate_doc_summary(fi)
        file_texts = load_and_split(fi, token_count, split_document)

        # Get table summaries for this file's texts
        doc_table_summaries = generate_table_summaries(file_texts)

        # Add doc summary to table summary to preserve context
        doc_texts.extend(file_texts)
        doc_text_summaries.extend(
            "Here is a summary of the doc: \n\n"
            + doc_summary
            + "\n\n Here is a summary of a table within this doc: \n\n"
            + t
            for t in doc_table_summaries
        )

    # The vectorstore to use to index the summaries
    vectorstore = Chroma(collection_name=expt, embedding_function=OpenAIEmbeddings())

    # Create our table retriever over the context-enriched summaries
    # (these were previously computed but never indexed).
    table_retriever = create_multi_vector_retriever(
        vectorstore, doc_text_summaries, doc_texts
    )

    # Create our docs retriever over the raw texts
    vectorstore_docs = Chroma.from_texts(
        texts=doc_texts, collection_name=expt + "docs", embedding=OpenAIEmbeddings()
    )
    docs_retriever = vectorstore_docs.as_retriever()

    # Initialize ensemble retriever, weighting table hits above raw-text hits
    ensemble_retriever = EnsembleRetriever(
        retrievers=[table_retriever, docs_retriever], weights=[0.75, 0.25]
    )

    # Chain
    stor_chain[expt] = rag_chain(ensemble_retriever)
评估#
在我们的数据集 Semi-structured Reports 上运行评估。
import uuid
from langchain.smith import RunEvalConfig
from langsmith.client import Client
# LangSmith client plus the evaluator configuration shared by every run
client = Client()
eval_config = RunEvalConfig(evaluators=["cot_qa"])

# Experiments: project label -> chain under test
chain_map = {
    "page_split_multivector_emsemble": stor_chain["page_split_multivector"],
}

# Short random hex prefix so repeated runs get distinct project names
run_id = uuid.uuid4().hex[:4]

# Run evaluation, collecting each run's result under its project label
test_runs = {}
for project_name, chain in chain_map.items():
    run_options = {
        "dataset_name": task.name,
        # Each example is a dict; only its "Question" field is fed to the chain
        "llm_or_chain_factory": lambda: (lambda x: x["Question"]) | chain,
        "evaluation": eval_config,
        "verbose": True,
        "project_name": f"{run_id}-{project_name}",
        "project_metadata": {"chain": project_name},
    }
    test_runs[project_name] = client.run_on_dataset(**run_options)