import os
import pickle
from typing import Dict, List, Optional

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_community.vectorstores import FAISS


def process_text_with_splitter(text: str, page_numbers: List[int], save_path: Optional[str] = None) -> FAISS:
"""
处理文本并创建向量存储
:param
text:提取的文本内容
page_numbers:每行文本对应的页码列表
save_path: 可选,保存向量数据苦的路径
:return:
knowledgeBase:基于FAISS的向量存储对象
"""
    # Create a text splitter that breaks the long text into small chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " ", ""],
        chunk_size=512,
        chunk_overlap=128,
        length_function=len,
    )
    # Split the text into chunks.
    chunks = text_splitter.split_text(text)
    print(f"Text split into {len(chunks)} chunks.")
    # Embedding model. To use the OpenAI embedding model instead, set the
    # OPENAI_API_KEY environment variable and uncomment the next line:
    # embeddings = OpenAIEmbeddings()
    # Here we call the Alibaba Bailian (DashScope) text-embedding model;
    # set the DASHSCOPE_API_KEY environment variable.
    embeddings = DashScopeEmbeddings(
        # dashscope_api_key="sk-...",  # prefer the environment variable over a hard-coded key
        model="text-embedding-v3",
    )
    # Build the knowledge base from the text chunks.
    knowledgeBase = FAISS.from_texts(chunks, embeddings)
    print("Knowledge base created from the text chunks...")
    # Store the page number associated with each text chunk.
    # NOTE: indexing page_numbers by chunk index is incorrect; see the fix below.
    page_info = {chunk: page_numbers[i] for i, chunk in enumerate(chunks)}
    knowledgeBase.page_info = page_info
    print(f"page_info is: {page_info}")
    # If a save path was provided, persist the vector store and the page info.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
        knowledgeBase.save_local(save_path)
        print(f"Vector store saved to: {save_path}")
        # save_local does not persist the dynamic page_info attribute,
        # so pickle it separately alongside the index.
        with open(os.path.join(save_path, "page_info.pkl"), "wb") as f:
            pickle.dump(page_info, f)
        print(f"Page info saved to: {os.path.join(save_path, 'page_info.pkl')}")
    return knowledgeBase
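
Since save_local persists only the FAISS index and docstore, reloading the store must also restore page_info from the pickle. A minimal usage sketch, reusing the imports above and assuming a recent langchain_community version (whose FAISS.load_local requires allow_dangerous_deserialization because it unpickles the stored docstore):

def load_knowledge_base(save_path: str) -> FAISS:
    # Recreate the same embedding model that was used at build time.
    embeddings = DashScopeEmbeddings(model="text-embedding-v3")
    # Required by recent versions because load_local unpickles the docstore.
    kb = FAISS.load_local(save_path, embeddings, allow_dangerous_deserialization=True)
    with open(os.path.join(save_path, "page_info.pkl"), "rb") as f:
        kb.page_info = pickle.load(f)
    return kb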
The page-number handling in the program above is wrong; the pages do not match up: page_info = {chunk: page_numbers[i] for i, chunk in enumerate(chunks)}. The splitter produces chunks, not lines, so chunk index i bears no relation to line index i in page_numbers; each chunk must instead be located in the original text and mapped back to the line, and hence the page, where it starts.
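
A minimal sketch of one possible fix, under two assumptions not confirmed by the snippet: that text is the per-line extracted text joined with "\n", and that page_numbers[i] is the page of line i. build_page_info is a hypothetical helper; it relies on each chunk being a contiguous substring of text (RecursiveCharacterTextSplitter keeps separators and only strips surrounding whitespace by default), finds each chunk's start offset, and maps that offset to the line, and hence the page, where the chunk begins.

import bisect

def build_page_info(text: str, page_numbers: List[int], chunks: List[str]) -> Dict[str, int]:
    # Character offset at which each line of `text` starts.
    line_starts = []
    offset = 0
    for line in text.split("\n"):
        line_starts.append(offset)
        offset += len(line) + 1  # +1 for the "\n" consumed by split()

    page_info = {}
    search_from = 0
    for chunk in chunks:
        # Overlapping chunks start at non-decreasing offsets, so resume
        # the search just past the previous chunk's start.
        start = text.find(chunk, search_from)
        if start == -1:
            start = max(text.find(chunk), 0)  # fallback if the forward search missed
        # The last line whose start offset is <= the chunk's start offset.
        line_idx = bisect.bisect_right(line_starts, start) - 1
        page_info[chunk] = page_numbers[min(line_idx, len(page_numbers) - 1)]
        search_from = start + 1
    return page_info

With this helper, the dictionary comprehension in the function becomes page_info = build_page_info(text, page_numbers, chunks). Note that a dict keyed by chunk text still collapses duplicate chunks; keying by chunk index would be more robust if duplicates can occur.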