Merge single-line chunks into the next chunk

This commit is contained in:
wvivi2023 2023-12-13 18:06:49 +08:00
parent 8c367eb460
commit bf21b8f116
2 changed files with 25 additions and 2 deletions

View File

@ -25,7 +25,7 @@ from server.utils import run_in_thread_pool, embedding_device, get_model_worker_
import io
from typing import List, Union, Callable, Dict, Optional, Tuple, Generator
import chardet
import re
def validate_kb_name(knowledge_base_id: str) -> bool:
# 检查是否包含预期外的字符或路径攻击关键字
@ -314,6 +314,10 @@ class KnowledgeFile:
print("文件不存在")
docs = docs or self.file2docs(refresh=refresh)
# After loading, trim each non-blank document and collapse runs of consecutive line breaks into a single one
for doc in docs:
if doc.page_content.strip()!="":
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
if not docs:

View File

@ -97,7 +97,26 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
if _good_splits:
merged_text = self._merge_splits(_good_splits, _separator)
final_chunks.extend(merged_text)
return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
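# Merge each single-line chunk with the chunk that follows it (the original note also mentions two-line chunks, but the check below only matches chunks of at most one line)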
return_chunks = []
temp_sencond = ""
for chunk in final_chunks:
if temp_sencond =="":
if len(chunk.splitlines()) <= 1:
temp_sencond = chunk
else:
return_chunks.append(chunk)
else:
return_chunks.append(temp_sencond + chunk)
temp_sencond = ""
if temp_sencond !="":
return_chunks.append(temp_sencond)
return return_chunks
#return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
if __name__ == "__main__":