Semi-structured eval: Multi-vector#

Semi-structured Reports is a public dataset of question-answer pairs drawn from documents that contain both text and tables.

The question-answer pairs are derived from the tables as well as from some of the paragraphs in the documents.

We use a multi-vector retriever to evaluate RAG performance on it.

Pre-requisites#

# %pip install -U langchain langsmith langchain_benchmarks
# %pip install --quiet chromadb openai pypdf tiktoken
import getpass
import os

os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
env_vars = ["LANGCHAIN_API_KEY", "OPENAI_API_KEY"]
for var in env_vars:
    if var not in os.environ:
        os.environ[var] = getpass.getpass(prompt=f"Enter your {var}: ")

Dataset#

Fetch the PDFs associated with the dataset from the remote cache so that we can run ingestion.

import os

from langchain_benchmarks import clone_public_dataset, registry
from langchain_benchmarks.rag.tasks.semi_structured_reports import get_file_names

# Task
task = registry["Semi-structured Reports"]

# Files used
paths = list(get_file_names())
files = [str(p) for p in paths]

Clone the dataset so that it is available in our LangSmith datasets.

clone_public_dataset(task.dataset_id, dataset_name=task.name)

Load and index#

We build a retriever with a focus on tables.

To do this, we use an LLM to scan each page and summarize any tables on the page.

We then index these summaries for retrieval, and use a multi-vector retriever to store the raw text of the pages that contain the tables.

Finally, we use an ensemble retriever to blend the retrieved table chunks with the raw text chunks.

  • It combines the rankings from different retrievers into a single unified ranking.

  • Each retriever supplies a list of documents (or search results) ranked by relevance to the query.

  • The weights express the relative importance or trust you place in each retriever's results.

  • The weights are used to scale each retriever's contribution to the final combined ranking.

  • The Reciprocal Rank Fusion (RRF) method uses the rank of each item in the list supplied by its retriever.

  • The basic idea is to give a higher score to items that rank higher in a list (i.e., have a lower rank number); a small sketch of this scoring follows the list.
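
To make the weighting concrete, here is a minimal sketch of weighted Reciprocal Rank Fusion. It is illustrative only (the actual fusion happens inside EnsembleRetriever); the document ids, the example weights, and the constant k=60 are assumptions made for this example.

def weighted_rrf(ranked_lists, weights, k=60):
    """Fuse several ranked lists of doc ids into one ranking.

    ranked_lists: one list of doc ids per retriever, ordered by relevance.
    weights: one weight per retriever (e.g., [0.75, 0.25]).
    """
    scores = {}
    for docs, weight in zip(ranked_lists, weights):
        for rank, doc_id in enumerate(docs, start=1):
            # Each retriever contributes weight / (k + rank) for every doc it returns
            scores[doc_id] = scores.get(doc_id, 0.0) + weight / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)


# Hypothetical example: the table retriever prefers doc_a, the docs retriever prefers doc_b
weighted_rrf([["doc_a", "doc_b"], ["doc_b", "doc_c"]], weights=[0.75, 0.25])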

import uuid

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


def prepare_documents(docs):
    """
    Prepare documents for prompt. Concatenates Document objects (after extracting their page_content)
    and strings into a single string, separated by two newlines.

    :param docs: A list of str or Document objects.
    :return: A single string containing all documents.
    """
    # Process each document and append it to the list
    processed_docs = [
        doc.page_content if isinstance(doc, Document) else doc for doc in docs
    ]

    # Join all processed documents into a single string
    return "\n\n".join(processed_docs)


def create_multi_vector_retriever(vectorstore, text_summaries, texts):
    """
    Create a retriever that indexes summaries, but returns the raw texts
    """

    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=s, metadata={id_key: doc_ids[i]})
            for i, s in enumerate(doc_summaries)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add the table summaries (indexed) and the raw text chunks (returned on retrieval)
    add_documents(retriever, text_summaries, texts)
    return retriever


def generate_doc_summary(file):
    """
    Create a doc summary
    """

    # Prompt
    prompt_text = """You are an assistant tasked extracting two attributes \
    from financial documents. (1) Tell me the company that the document is \
    focused on. (2) Look at any tables in the document and tell me the units \ 
    of the table. Many table will have '(In thousands)' or '(in millions)' prior \
    to the table text. Provide these two for the document: \n\n {document} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4-1106-preview")
    summarize_chain = {"document": lambda x: x} | prompt | model | StrOutputParser()

    # Load doc
    loader = PyPDFLoader(file)
    pdf_pages = loader.load()
    texts = [t.page_content for t in pdf_pages]
    text_string = " ".join(texts)
    summary = summarize_chain.invoke(text_string)
    return summary


def generate_table_summaries(texts):
    """
    Summarize text elements
    texts: List of str
    """

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables within a provided text chunk. \
    If the text chunk contains tables, then give a brief summary of the table and list the row and column \
    names to identify what is captured in the table. Do not sumnmarize quantitative results in the table. \ 
    If there is no table present, then just return "No table". \n\n Text: {element} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4")
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Summarize all text chunks in parallel (up to 5 concurrent requests)
    text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

    return text_summaries


def load_and_split(file, token_count, split_document=True):
    """
    Load and optionally split PDF files.

    Args:
        file (str): File path.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting or returning pages.
    """

    loader = PyPDFLoader(file)
    pdf_pages = loader.load()

    if split_document:
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=token_count, chunk_overlap=50
        )

        docs = text_splitter.split_documents(pdf_pages)
        texts = [d.page_content for d in docs]
    else:
        texts = [d.page_content for d in pdf_pages]

    print(f"There are {len(texts)} text elements")
    return texts


def load_files(files, token_count, split_document):
    """
    Load files and generate a document summary.

    Args:
        files (list): List of file paths.
        token_count (int): Token count for splitting.
        split_document (bool): Flag for splitting documents.

    Returns:
        A tuple (texts, doc_summary), where texts holds the text chunks from all
        files and doc_summary is the summary of the last file processed.
    """

    texts = []
    for fi in files:
        doc_summary = generate_doc_summary(fi)
        texts.extend(load_and_split(fi, token_count, split_document))
    return texts, doc_summary


def rag_chain(retriever):
    """
    RAG chain.

    Args:
        retriever: The retriever to use.
    """

    # Prompt template
    template = """Answer the question based only on the following context, which can include text and tables:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    # LLM
    model = ChatOpenAI(temperature=0, model="gpt-4")

    # RAG pipeline
    chain = (
        {
            "context": retriever | RunnableLambda(prepare_documents),
            "question": RunnablePassthrough(),
        }
        | prompt
        | model
        | StrOutputParser()
    )
    return chain


# Experiment configurations
experiments = [
    (None, False, "page_split_multivector"),
]

# Run
stor_chain = {}
for token_count, split_document, expt in experiments:
    # Get texts and doc summary
    doc_texts, doc_summary = load_files(files, token_count, split_document)

    # Get table summaries
    doc_table_summaries = generate_table_summaries(doc_texts)

    # Add doc summary to table summary to preserve context
    doc_text_summaries = [
        "Here is a summary of the doc: \n\n"
        + doc_summary
        + "\n\n Here is a summary of a table within this doc: \n\n"
        + t
        for t in doc_table_summaries
    ]

    # The vectorstore to use to index the summaries
    vectorstore = Chroma(collection_name=expt, embedding_function=OpenAIEmbeddings())

    # Create our table retriever
    table_retriever = create_multi_vector_retriever(
        vectorstore, doc_table_summaries, doc_texts
    )

    # Create our docs retriever
    vectorstore_docs = Chroma.from_texts(
        texts=doc_texts, collection_name=expt + "docs", embedding=OpenAIEmbeddings()
    )
    docs_retriever = vectorstore_docs.as_retriever()

    # Initialize ensemble retriever
    ensemble_retriever = EnsembleRetriever(
        retrievers=[table_retriever, docs_retriever], weights=[0.75, 0.25]
    )

    # Chain
    stor_chain[expt] = rag_chain(ensemble_retriever)
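
Optionally, we can sanity-check one of the chains with an ad-hoc question before running the full evaluation. The question below is only a hypothetical example; replace it with one that matches your PDFs.

# Quick sanity check (hypothetical question; adjust to your documents)
example_question = "What were the company's total operating expenses, in thousands?"
stor_chain["page_split_multivector"].invoke(example_question)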

Eval#

Run evaluation on our dataset, Semi-structured Reports.

import uuid

from langchain.smith import RunEvalConfig
from langsmith.client import Client

# Config
client = Client()
eval_config = RunEvalConfig(
    evaluators=["cot_qa"],
)

# Experiments
chain_map = {
    "page_split_multivector_emsemble": stor_chain["page_split_multivector"],
}

# Run evaluation
run_id = uuid.uuid4().hex[:4]
test_runs = {}
for project_name, chain in chain_map.items():
    test_runs[project_name] = client.run_on_dataset(
        dataset_name=task.name,
        llm_or_chain_factory=lambda: (lambda x: x["Question"]) | chain,
        evaluation=eval_config,
        verbose=True,
        project_name=f"{run_id}-{project_name}",
        project_metadata={"chain": project_name},
    )