ChatDoc:功能逻辑

  • 读取 pdf、excel(.xlsx)、word(.docx)三种常见的文档格式
  • 根据文档内容,智能抽取内容并输出相应格式

1. 将文档向量化并索引入库
2. 使用自然语言找出相关文本块


#导入必须的包
from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader
from langchain.text_splitter import  CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import  Chroma


#定义chatdoc
class ChatDoc():
    """Load a document, split it into chunks, embed them and retrieve by question.

    Typical use: assign a file path to ``doc``, call ``splitSentences()``,
    then call ``askAndFindFiles(question)``.
    """

    def __init__(self):
        self.doc = None  # path of the document to load; assigned by the caller
        self.splitText = []  # chunks produced by splitSentences()
        self._db = None  # cached vector store built from splitText
        self._db_chunks = None  # the splitText list object that _db was built from

    def getFile(self):
        """Load ``self.doc`` with the loader that matches its file extension.

        Returns:
            The value returned by the loader's ``load()``, or ``None`` when
            no path is set, the extension is unsupported, or loading fails.
            A message is printed in each of the ``None`` cases.
        """
        doc = self.doc
        if not doc:
            # Without this guard an unset path fails with AttributeError on .split().
            print("No document path set")
            return None

        loaders = {
            "docx": Docx2txtLoader,
            "pdf": PyPDFLoader,
            "xlsx": UnstructuredExcelLoader,
        }
        # Lower-case the extension so "REPORT.PDF" is treated like "report.pdf".
        file_extension = doc.split(".")[-1].lower()
        loader_class = loaders.get(file_extension)
        if loader_class is None:
            print(f"Unsupported file extension: {file_extension}")
            return None

        try:
            return loader_class(doc).load()
        except Exception as e:
            # Loader errors are reported rather than raised so the caller
            # only has to check for None.
            print(f"Error loading {file_extension} files:{e}")
            return None

    def splitSentences(self):
        """Load the document and store its chunks in ``self.splitText``.

        ``self.splitText`` is left unchanged when the document cannot be loaded.
        """
        full_text = self.getFile()
        if full_text is None:
            return
        text_split = CharacterTextSplitter(
            chunk_size=150,
            chunk_overlap=20,
        )
        self.splitText = text_split.split_documents(full_text)

    def embeddingAndVectorDB(self):
        """Embed ``self.splitText`` and return a Chroma vector store.

        The store is cached and reused until ``self.splitText`` is rebound to
        a different list, so repeated questions do not re-embed every chunk.
        In-place mutation of the list is not detected.
        """
        if self._db is None or self._db_chunks is not self.splitText:
            embeddings = OpenAIEmbeddings(
                model="text-embedding-3-small"
            )
            self._db = Chroma.from_documents(
                documents=self.splitText,
                embedding=embeddings,
            )
            self._db_chunks = self.splitText
        return self._db

    def askAndFindFiles(self, question):
        """Return the chunks the retriever finds for ``question``.

        Returns an empty list, without building a vector store, when no
        chunks have been loaded.
        """
        if not self.splitText:
            return []
        db = self.embeddingAndVectorDB()
        retriever = db.as_retriever()
        return retriever.invoke(question)

# Demo: point ChatDoc at a sample .docx file, split it into chunks, then
# retrieve the chunks relevant to a question.
chat_doc = ChatDoc()
chat_doc.doc = "example/fake.docx"
chat_doc.splitSentences()
# The result is neither stored nor printed here.
# NOTE(review): presumably this ran as a notebook cell that displays the
# value of the last expression - confirm.
chat_doc.askAndFindFiles("这家公司叫什么名字?")


提高文档检索精度

使用多重查询提高文档检索精确度


#提问并找到相关的文本块
    def askAndFindFiles(self, question):
        """Retrieve chunks for ``question`` through a multi-query retriever.

        The question is handed to an LLM, which expands it from several
        angles before the vector store is searched.

        NOTE(review): ``ChatOpenAI`` and ``MultiQueryRetriever`` are not
        imported in the visible snippet; they must be in scope where this
        method is defined.
        """
        db = self.embeddingAndVectorDB()
        # Hand the question to the LLM for multi-angle expansion.
        llm = ChatOpenAI(temperature=0)
        retriever_from_llm = MultiQueryRetriever.from_llm(
            retriever=db.as_retriever(),
            llm=llm,
        )
        # invoke() replaces get_relevant_documents(), which is deprecated in
        # current LangChain releases.
        return retriever_from_llm.invoke(question)

使用上下文压缩检索降低冗余信息


 #提问并找到相关的文本块
    def askAndFindFiles(self, question):
        """Retrieve chunks for ``question`` through a contextual-compression retriever.

        The base vector-store retriever is wrapped with an LLM-backed
        extractor, with the aim of reducing redundant information in the
        returned documents.

        NOTE(review): ``OpenAI``, ``LLMChainExtractor`` and
        ``ContextualCompressionRetriever`` are not imported in the visible
        snippet; they must be in scope where this method is defined.
        """
        db = self.embeddingAndVectorDB()
        retriever = db.as_retriever()
        llm = OpenAI(temperature=0)
        compressor = LLMChainExtractor.from_llm(
            llm=llm,
        )
        compressor_retriever = ContextualCompressionRetriever(
            base_retriever=retriever,
            base_compressor=compressor,
        )
        # invoke() replaces get_relevant_documents(), which is deprecated in
        # current LangChain releases.
        return compressor_retriever.invoke(question)

在向量存储里使用最大边际相关性(MMR)和相似度分数阈值进行检索


#提问并找到相关的文本块
    def askAndFindFiles(self, question):
        """Retrieve chunks for ``question`` using a similarity-score threshold.

        The retriever is configured with ``score_threshold=0.1`` and ``k=1``.
        """
        db = self.embeddingAndVectorDB()
        # Alternative search strategy: db.as_retriever(search_type="mmr")
        retriever = db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"score_threshold": 0.1, "k": 1},
        )
        # invoke() replaces get_relevant_documents(), which is deprecated in
        # current LangChain releases.
        return retriever.invoke(question)

和文件聊天


#导入必须的包
from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader
from langchain.text_splitter import  CharacterTextSplitter
from langchain.embeddings import  OpenAIEmbeddings
from langchain.vectorstores import  Chroma
#导入聊天所需的模块
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate


#定义chatdoc
class ChatDoc():
    """Load a document, index its chunks, and answer questions about it with a chat model.

    Typical use: assign a file path to ``doc``, call ``splitSentences()``,
    then call ``chatWithDoc(question)``.
    """

    def __init__(self):
        self.doc = None  # path of the document to load; assigned by the caller
        self.splitText = []  # chunks produced by splitSentences()
        self._db = None  # cached vector store built from splitText
        self._db_chunks = None  # the splitText list object that _db was built from
        # Chat prompt: a system message carrying the retrieved context, a
        # short greeting exchange, then the user's question.
        self.template = [
            ("system","你是一个处理文档的秘书,你从不说自己是一个大模型或者AI助手,你会根据下面提供的上下文内容来继续回答问题.\n 上下文内容\n {context} \n"),
            ("human","你好!"),
            ("ai","你好"),
            ("human","{question}"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(self.template)

    def getFile(self):
        """Load ``self.doc`` with the loader that matches its file extension.

        Returns:
            The value returned by the loader's ``load()``, or ``None`` when
            no path is set, the extension is unsupported, or loading fails.
            A message is printed in each of the ``None`` cases.
        """
        doc = self.doc
        if not doc:
            # Without this guard an unset path fails with AttributeError on .split().
            print("No document path set")
            return None

        loaders = {
            "docx": Docx2txtLoader,
            "pdf": PyPDFLoader,
            "xlsx": UnstructuredExcelLoader,
        }
        # Lower-case the extension so "REPORT.PDF" is treated like "report.pdf".
        file_extension = doc.split(".")[-1].lower()
        loader_class = loaders.get(file_extension)
        if loader_class is None:
            print(f"Unsupported file extension: {file_extension}")
            return None

        try:
            return loader_class(doc).load()
        except Exception as e:
            # Loader errors are reported rather than raised so the caller
            # only has to check for None.
            print(f"Error loading {file_extension} files:{e}")
            return None

    def splitSentences(self):
        """Load the document and store its chunks in ``self.splitText``.

        ``self.splitText`` is left unchanged when the document cannot be loaded.
        """
        full_text = self.getFile()
        if full_text is None:
            return
        text_split = CharacterTextSplitter(
            chunk_size=150,
            chunk_overlap=20,
        )
        self.splitText = text_split.split_documents(full_text)

    def embeddingAndVectorDB(self):
        """Embed ``self.splitText`` and return a Chroma vector store.

        The store is cached and reused until ``self.splitText`` is rebound to
        a different list, so repeated questions do not re-embed every chunk.
        In-place mutation of the list is not detected.
        """
        if self._db is None or self._db_chunks is not self.splitText:
            embeddings = OpenAIEmbeddings()
            self._db = Chroma.from_documents(
                documents=self.splitText,
                embedding=embeddings,
            )
            self._db_chunks = self.splitText
        return self._db

    def askAndFindFiles(self, question):
        """Return the chunks retrieved for ``question``.

        The retriever uses a similarity-score threshold
        (``score_threshold=0.5``, ``k=1``). Returns an empty list, without
        building a vector store, when no chunks have been loaded.
        """
        if not self.splitText:
            return []
        db = self.embeddingAndVectorDB()
        # Alternative search strategy: db.as_retriever(search_type="mmr")
        retriever = db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"score_threshold": 0.5, "k": 1},
        )
        # invoke() replaces get_relevant_documents(), which is deprecated in
        # current LangChain releases.
        return retriever.invoke(question)

    def chatWithDoc(self, question):
        """Answer ``question`` with a chat model, using retrieved chunks as context.

        Returns:
            The value returned by the chat model's ``invoke()``.
        """
        context = self.askAndFindFiles(question)
        # Chunks are concatenated with no separator between them.
        _content = "".join(chunk.page_content for chunk in context)

        messages = self.prompt.format_messages(context=_content, question=question)
        chat = ChatOpenAI(
            model="gpt-4",
            temperature=0,
        )
        return chat.invoke(messages)

# Demo: point ChatDoc at a sample .docx file, split it into chunks, then ask
# the chat model a question about the document.
chat_doc = ChatDoc()
chat_doc.doc = "example/fake.docx"
chat_doc.splitSentences()
# The result is neither stored nor printed here.
# NOTE(review): presumably this ran as a notebook cell that displays the
# value of the last expression - confirm.
chat_doc.chatWithDoc("公司注册地址是哪里?")


Prev post

LangChain 010

Next post

LangChain 012