Merge single-line chunks into the following chunk

This commit is contained in:
wvivi2023 2023-12-13 18:06:49 +08:00
parent 8c367eb460
commit bf21b8f116
2 changed files with 25 additions and 2 deletions

View File

@ -25,7 +25,7 @@ from server.utils import run_in_thread_pool, embedding_device, get_model_worker_
import io import io
from typing import List, Union, Callable, Dict, Optional, Tuple, Generator from typing import List, Union, Callable, Dict, Optional, Tuple, Generator
import chardet import chardet
import re
def validate_kb_name(knowledge_base_id: str) -> bool: def validate_kb_name(knowledge_base_id: str) -> bool:
# 检查是否包含预期外的字符或路径攻击关键字 # 检查是否包含预期外的字符或路径攻击关键字
@ -314,6 +314,10 @@ class KnowledgeFile:
print("文件不存在") print("文件不存在")
docs = docs or self.file2docs(refresh=refresh) docs = docs or self.file2docs(refresh=refresh)
# After loading, strip each non-blank document and collapse runs of consecutive line breaks into a single one
for doc in docs:
if doc.page_content.strip()!="":
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
file_name_without_extension, file_extension = os.path.splitext(self.filepath) file_name_without_extension, file_extension = os.path.splitext(self.filepath)
print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}") print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
if not docs: if not docs:

View File

@ -97,7 +97,26 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
if _good_splits: if _good_splits:
merged_text = self._merge_splits(_good_splits, _separator) merged_text = self._merge_splits(_good_splits, _separator)
final_chunks.extend(merged_text) final_chunks.extend(merged_text)
return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
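# Merge short chunks into the chunk that follows them (the original note says one- and two-line chunks, but the check below only holds back chunks of at most one line)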
return_chunks = []
temp_sencond = ""
for chunk in final_chunks:
if temp_sencond =="":
if len(chunk.splitlines()) <= 1:
temp_sencond = chunk
else:
return_chunks.append(chunk)
else:
return_chunks.append(temp_sencond + chunk)
temp_sencond = ""
if temp_sencond !="":
return_chunks.append(temp_sencond)
return return_chunks
#return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
if __name__ == "__main__": if __name__ == "__main__":