Document Splitting
How it works
- Split the document into small, semantically meaningful chunks (sentences).
- Merge small chunks into a larger chunk until a target size is reached.
- Once the target size is reached, start the next chunk with a portion that overlaps the previous one (see the sketch after this list).
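To make the overlap idea concrete, here is a minimal pure-Python sketch. The fixed-window approach and the name chunk_with_overlap are illustrative assumptions, not LangChain's actual algorithm (which splits on separators first):

def chunk_with_overlap(text, chunk_size, chunk_overlap):
    # Slide a fixed-size window forward by (chunk_size - chunk_overlap),
    # so each chunk re-uses the tail of the previous one
    step = chunk_size - chunk_overlap
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]

print(chunk_with_overlap("abcdefghij", chunk_size=4, chunk_overlap=2))
# ['abcd', 'cdef', 'efgh', 'ghij', 'ij']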
Examples
- A first document split
- Splitting by character
- Splitting code documents
- Splitting by token
Document splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the document to be split
with open("test.txt") as f:
    zuizhonghuanxiang = f.read()

# Initialize the splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,         # target chunk size, measured by the length function
    chunk_overlap=20,      # overlap between adjacent chunks, also measured by the length function
    length_function=len,   # length function; a tokenizer-based function can be passed instead
    add_start_index=True,  # record each chunk's start offset in its metadata
)
text = text_splitter.create_documents([zuizhonghuanxiang])
print(text[0])
print(text[1])
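Because add_start_index=True is set, each chunk records its character offset within the source text. A small check of that metadata; start_index is, to the best of my knowledge, the key langchain uses for this:

for doc in text[:2]:
    print(doc.metadata)  # e.g. {'start_index': 0}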
Splitting by character
from langchain.text_splitter import CharacterTextSplitter

# Load the document to be split
with open("test.txt") as f:
    zuizhonghuanxiang = f.read()

# Initialize the splitter
text_splitter = CharacterTextSplitter(
    separator="。",            # separator to split on; the default is "\n\n"
    chunk_size=50,             # target chunk size, measured by the length function
    chunk_overlap=20,          # overlap between adjacent chunks
    length_function=len,       # length function; a tokenizer-based function can be passed instead
    add_start_index=True,      # record each chunk's start offset in its metadata
    is_separator_regex=False,  # whether the separator is a regular expression
)
text = text_splitter.create_documents([zuizhonghuanxiang])
print(text[0])
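Since is_separator_regex is exposed, the separator can also be a pattern, which lets you split on several sentence-ending marks at once. A hedged variant; the pattern itself is an illustrative assumption:

regex_splitter = CharacterTextSplitter(
    separator="。|!|?",       # any of these sentence-ending marks
    chunk_size=50,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=True,  # treat the separator as a regex
)
regex_docs = regex_splitter.create_documents([zuizhonghuanxiang])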
Code splitting
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language,
)

# List the programming languages with built-in support
# [e.value for e in Language]

# The code document to split
PYTHON_CODE = """
def hello_world():
    print("Hello, World!")

# Call the function
hello_world()
"""

py_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=10,
)
python_docs = py_splitter.create_documents([PYTHON_CODE])
print(python_docs)
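To see why the language argument matters, you can inspect the separator list the recursive splitter uses for a given language; get_separators_for_language is part of the same splitter class. The printed list below is indicative:

# Separators used for Python: class/def boundaries first, then blank lines, etc.
print(RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON))
# e.g. ['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']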
Splitting documents by token
from langchain.text_splitter import CharacterTextSplitter

# Load the document to be split
with open("test.txt") as f:
    zuizhonghuanxiang = f.read()

# Initialize a splitter that measures length in tokens via tiktoken
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000,   # target chunk size, in tokens
    chunk_overlap=30,  # overlap between adjacent chunks, in tokens
)
text = text_splitter.create_documents([zuizhonghuanxiang])
print(text[0])
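A quick way to double-check that chunks stay within the token budget, assuming tiktoken is installed; the encoding name cl100k_base is an assumption that matches gpt-3.5-turbo-era models:

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")    # assumed encoding; pick the one for your model
print(len(enc.encode(text[0].page_content)))  # token count of the first chunk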
Summarizing, refining, and translating documents
# Load the document
with open("letter.txt") as f:
    content = f.read()

import os
from dotenv import load_dotenv

load_dotenv("openai.env")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_BASE = os.environ.get("OPENAI_API_BASE")
OPENAI_MODEL = "gpt-3.5-turbo-16k"
OPENAI_TOKEN_LIMIT = 8000

from doctran import Doctran

doctrans = Doctran(
    openai_api_key=OPENAI_API_KEY,
    openai_model=OPENAI_MODEL,
    openai_token_limit=OPENAI_TOKEN_LIMIT,
)
documents = doctrans.parse(content=content)

# Summarize the document
summary = documents.summarize(token_limit=100).execute()
print(summary.transformed_content)

# Translate the document
translation = documents.translate(language="chinese").execute()
print(translation.transformed_content)

# Refine the document: drop everything except the given topics/keywords,
# keeping only topic-related content
refined = documents.refine(topics=["marketing", "Development"]).execute()
print(refined.transformed_content)
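Note that in some Doctran releases execute() is a coroutine; if the synchronous calls above produce a "coroutine was never awaited" warning, the same transformation needs to be awaited. A hedged async variant, a version-dependent assumption rather than the canonical usage:

import asyncio

async def summarize_doc():
    # Same call as above, but awaited (needed on async Doctran builds)
    summary = await documents.summarize(token_limit=100).execute()
    print(summary.transformed_content)

asyncio.run(summarize_doc())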