merge single line to the next content
parent 8c367eb460
commit bf21b8f116
@@ -25,7 +25,7 @@ from server.utils import run_in_thread_pool, embedding_device, get_model_worker_
 import io
 from typing import List, Union, Callable, Dict, Optional, Tuple, Generator
 import chardet
 
+import re
 
 def validate_kb_name(knowledge_base_id: str) -> bool:
     # check whether the name contains unexpected characters or path-traversal keywords
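The body of validate_kb_name lies outside this hunk; only its signature and comment are shown as context. As a rough sketch of the kind of check that comment describes (the patterns below are an assumption for illustration, not the project's actual implementation):

    def validate_kb_name_sketch(knowledge_base_id: str) -> bool:
        # Hypothetical check: reject names that look like path-traversal attempts.
        # The real validate_kb_name body is not part of this diff.
        if "../" in knowledge_base_id or "..\\" in knowledge_base_id:
            return False
        return True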
@@ -314,6 +314,10 @@ class KnowledgeFile:
             print("文件不存在")
 
         docs = docs or self.file2docs(refresh=refresh)
+        # after loading, remove the redundant line breaks
+        for doc in docs:
+            if doc.page_content.strip() != "":
+                doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
         file_name_without_extension, file_extension = os.path.splitext(self.filepath)
         print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
         if not docs:
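The four added lines normalize every loaded document in place: page_content is stripped, then any run of two or more newlines is collapsed to a single newline. A minimal standalone check of that regex, using a made-up page_content value:

    import re

    page_content = "  line one\n\n\nline two\nline three\n\n"
    cleaned = re.sub(r"\n{2,}", "\n", page_content.strip())
    print(repr(cleaned))  # 'line one\nline two\nline three'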
@@ -97,7 +97,26 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
         if _good_splits:
             merged_text = self._merge_splits(_good_splits, _separator)
             final_chunks.extend(merged_text)
-        return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip() != ""]
+
+        final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip() != ""]
+        # merge single-line and two-line chunks into the chunk that follows them
+        return_chunks = []
+        temp_sencond = ""
+        for chunk in final_chunks:
+            if temp_sencond == "":
+                if len(chunk.splitlines()) <= 1:
+                    temp_sencond = chunk
+                else:
+                    return_chunks.append(chunk)
+            else:
+                return_chunks.append(temp_sencond + chunk)
+                temp_sencond = ""
+
+        if temp_sencond != "":
+            return_chunks.append(temp_sencond)
+
+        return return_chunks
+        # return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip() != ""]
 
 
 if __name__ == "__main__":
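This new return path is what implements the commit message: after the existing newline cleanup, a chunk of at most one line is buffered in temp_sencond and concatenated onto the next chunk, with no separator inserted between them; a short chunk left over at the end of the list is emitted on its own. The same loop, lifted into a standalone function for illustration (merge_short_chunks is a name of my choosing, not part of the commit):

    from typing import List

    def merge_short_chunks(final_chunks: List[str]) -> List[str]:
        # Mirrors the loop added to ChineseRecursiveTextSplitter above:
        # a chunk with at most one line is held back and glued onto the next chunk.
        return_chunks = []
        buffered = ""
        for chunk in final_chunks:
            if buffered == "":
                if len(chunk.splitlines()) <= 1:
                    buffered = chunk  # hold the short chunk for the next iteration
                else:
                    return_chunks.append(chunk)
            else:
                return_chunks.append(buffered + chunk)  # merge into the following chunk
                buffered = ""
        if buffered != "":
            return_chunks.append(buffered)  # trailing short chunk is kept as-is
        return return_chunks

    print(merge_short_chunks(["Title", "Para one\nPara two", "Ending"]))
    # ['TitlePara one\nPara two', 'Ending']

Note that once a short chunk is buffered, the next chunk is always consumed to complete the merge, even if it is itself a single line, so two consecutive one-line chunks pair up into one two-line chunk.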