ChatDoc:功能逻辑
- 读取pdf、excel、doc三种常见的文档格式
- 根据文档内容,智能抽取内容并输出相应格式
处理流程:1. 将文档向量化并索引入库;2. 使用自然语言找出相关文本块
#导入必须的包
from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
#定义chatdoc
class ChatDoc():
    """Index a document (docx/pdf/xlsx) into a Chroma vector store and
    retrieve the text chunks relevant to a natural-language question."""

    def __init__(self):
        self.doc = None        # path to the source document, set by the caller
        self.splitText = []    # chunks produced by splitSentences()

    def getFile(self):
        """Load ``self.doc`` with the loader matching its file extension.

        Returns:
            The list of loaded Documents, or None when the extension is
            unsupported or loading fails (an error message is printed).
        """
        doc = self.doc
        loaders = {
            "docx": Docx2txtLoader,
            "pdf": PyPDFLoader,
            "xlsx": UnstructuredExcelLoader,
        }
        # Lowercase so "FILE.PDF" is handled the same as "file.pdf".
        file_extension = doc.split(".")[-1].lower()
        loader_class = loaders.get(file_extension)
        if loader_class is None:
            print(f"Unsupported file extension: {file_extension}")
            return None
        try:
            loader = loader_class(doc)
            return loader.load()
        except Exception as e:
            print(f"Error loading {file_extension} files:{e}")
            return None

    def splitSentences(self):
        """Split the loaded document into overlapping chunks.

        Stores the chunks in ``self.splitText``; does nothing when the
        document could not be loaded.
        """
        full_text = self.getFile()
        if full_text is not None:
            text_split = CharacterTextSplitter(
                chunk_size=150,
                chunk_overlap=20,
            )
            self.splitText = text_split.split_documents(full_text)

    def embeddingAndVectorDB(self):
        """Embed ``self.splitText`` and return an in-memory Chroma store.

        NOTE(review): this re-embeds the chunks on every call; callers that
        query repeatedly may want to cache the returned store.
        """
        embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small"
        )
        db = Chroma.from_documents(
            documents=self.splitText,
            embedding=embeddings,
        )
        return db

    def askAndFindFiles(self, question):
        """Return the document chunks most relevant to ``question``."""
        db = self.embeddingAndVectorDB()
        retriever = db.as_retriever()
        results = retriever.invoke(question)
        return results
# Demo: index the sample docx, then run a retrieval query against it.
chat_doc = ChatDoc()
chat_doc.doc = "example/fake.docx"  # document to index
chat_doc.splitSentences()  # load + chunk; result kept in chat_doc.splitText
chat_doc.askAndFindFiles("这家公司叫什么名字?")  # returns matching chunks
提高文档检索精度
使用多重查询提高文档检索精确度
# Retrieval with LLM-generated query variations (MultiQueryRetriever).
def askAndFindFiles(self, question):
    """Return relevant chunks, broadening recall by letting an LLM
    rephrase ``question`` from multiple angles before retrieval."""
    db = self.embeddingAndVectorDB()
    # Deterministic rephrasing (temperature=0) of the user's question.
    llm = ChatOpenAI(temperature=0)
    retriever_from_llm = MultiQueryRetriever.from_llm(
        retriever=db.as_retriever(),
        llm=llm,
    )
    # invoke() is the current retriever API; get_relevant_documents is deprecated.
    return retriever_from_llm.invoke(question)
使用上下文压缩检索降低冗余信息
# Retrieval with contextual compression to drop redundant text.
def askAndFindFiles(self, question):
    """Return relevant chunks, post-filtered by an LLM extractor that
    keeps only the passages actually related to ``question``."""
    db = self.embeddingAndVectorDB()
    retriever = db.as_retriever()
    llm = OpenAI(temperature=0)
    compressor = LLMChainExtractor.from_llm(
        llm=llm,
    )
    compressor_retriever = ContextualCompressionRetriever(
        base_retriever=retriever,
        base_compressor=compressor,
    )
    # invoke() is the current retriever API; get_relevant_documents is deprecated.
    return compressor_retriever.invoke(question)
在向量存储里使用最大边际相似性(MMR)和相似性打分
# Retrieval tuned in the vector store itself: MMR or score-thresholded similarity.
def askAndFindFiles(self, question):
    """Return at most one chunk whose similarity score exceeds 0.1."""
    db = self.embeddingAndVectorDB()
    # Alternative: maximal-marginal-relevance search for diverse results.
    # retriever = db.as_retriever(search_type="mmr")
    retriever = db.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": .1, "k": 1},
    )
    # invoke() is the current retriever API; get_relevant_documents is deprecated.
    return retriever.invoke(question)
和文件聊天
#导入必须的包
from langchain.document_loaders import UnstructuredExcelLoader,Docx2txtLoader,PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
#导入聊天所需的模块
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
#定义chatdoc
class ChatDoc():
    """Chat with a document: load it, chunk it, embed it into Chroma,
    retrieve the relevant chunks, and answer questions with an LLM."""

    def __init__(self):
        self.doc = None        # path to the source document, set by the caller
        self.splitText = []    # chunks produced by splitSentences()
        # Chat prompt: retrieved text is injected as {context}, the
        # user's question as {question}.
        self.template = [
            ("system","你是一个处理文档的秘书,你从不说自己是一个大模型或者AI助手,你会根据下面提供的上下文内容来继续回答问题.\n 上下文内容\n {context} \n"),
            ("human","你好!"),
            ("ai","你好"),
            ("human","{question}"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(self.template)

    def getFile(self):
        """Load ``self.doc`` with the loader matching its file extension.

        Returns:
            The list of loaded Documents, or None when the extension is
            unsupported or loading fails (an error message is printed).
        """
        doc = self.doc
        loaders = {
            "docx": Docx2txtLoader,
            "pdf": PyPDFLoader,
            "xlsx": UnstructuredExcelLoader,
        }
        file_extension = doc.split(".")[-1]
        loader_class = loaders.get(file_extension)
        if loader_class is None:
            print(f"Unsupported file extension: {file_extension}")
            return None
        try:
            loader = loader_class(doc)
            return loader.load()
        except Exception as e:
            print(f"Error loading {file_extension} files:{e}")
            return None

    def splitSentences(self):
        """Split the loaded document into overlapping chunks.

        Stores the chunks in ``self.splitText``; does nothing when the
        document could not be loaded.
        """
        full_text = self.getFile()
        if full_text is not None:
            text_split = CharacterTextSplitter(
                chunk_size=150,
                chunk_overlap=20,
            )
            self.splitText = text_split.split_documents(full_text)

    def embeddingAndVectorDB(self):
        """Embed ``self.splitText`` and return an in-memory Chroma store.

        NOTE(review): re-embeds on every call; cache the store if you
        query the same document repeatedly.
        """
        embeddings = OpenAIEmbeddings()
        db = Chroma.from_documents(
            documents=self.splitText,
            embedding=embeddings,
        )
        return db

    def askAndFindFiles(self, question):
        """Return at most one chunk whose similarity score exceeds 0.5."""
        db = self.embeddingAndVectorDB()
        # Alternative: maximal-marginal-relevance search for diverse results.
        # retriever = db.as_retriever(search_type="mmr")
        retriever = db.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"score_threshold": .5, "k": 1},
        )
        # invoke() is the current retriever API; get_relevant_documents is deprecated.
        return retriever.invoke(question)

    def chatWithDoc(self, question):
        """Answer ``question`` in natural language, grounded on the
        retrieved document chunks; returns the chat model's response."""
        context = self.askAndFindFiles(question)
        # join() instead of += in a loop: linear, not quadratic.
        _content = "".join(item.page_content for item in context)
        messages = self.prompt.format_messages(context=_content, question=question)
        chat = ChatOpenAI(
            model="gpt-4",
            temperature=0,
        )
        return chat.invoke(messages)
# Demo: index the sample docx, then ask a question grounded on it.
chat_doc = ChatDoc()
chat_doc.doc = "example/fake.docx"  # document to index
chat_doc.splitSentences()  # load + chunk; result kept in chat_doc.splitText
chat_doc.chatWithDoc("公司注册地址是哪里?")  # retrieve context + answer via LLM