增加标题增强文档功能

This commit is contained in:
weiweiw 2025-01-13 16:11:37 +08:00
parent 74f4f8174d
commit 700a7c7298
1 changed file with 4 additions and 1 deletion

View File

@@ -377,6 +377,7 @@ class KnowledgeFile:
else: else:
print("文件不存在") print("文件不存在")
logger.info(f"********docs2texts")
docs = docs or self.file2docs(refresh=refresh) docs = docs or self.file2docs(refresh=refresh)
#remove the redundant line break after loading, by weiweiwang 2025/1/13 #remove the redundant line break after loading, by weiweiwang 2025/1/13
for doc in docs: for doc in docs:
@@ -392,6 +393,8 @@ class KnowledgeFile:
chunk_size=chunk_size, chunk_size=chunk_size,
chunk_overlap=chunk_overlap, chunk_overlap=chunk_overlap,
) )
else:
logger.error(f"text_splitter is Not None, text_splitter_name: {self.text_splitter_name}")
if self.text_splitter_name == "MarkdownHeaderTextSplitter": if self.text_splitter_name == "MarkdownHeaderTextSplitter":
docs = text_splitter.split_text(docs[0].page_content) docs = text_splitter.split_text(docs[0].page_content)
else: else:
@@ -400,7 +403,7 @@ class KnowledgeFile:
if not docs: if not docs:
return [] return []
print(f"文档切分示例:{docs[0]}") print(f"文档切分{len(docs)}")
if zh_title_enhance: if zh_title_enhance:
# docs = func_zh_title_enhance(docs) # docs = func_zh_title_enhance(docs)
docs = zh_third_title_enhance(docs) docs = zh_third_title_enhance(docs)