Merge single-line chunks into the following chunk
This commit is contained in:
parent
8c367eb460
commit
bf21b8f116
|
|
@ -25,7 +25,7 @@ from server.utils import run_in_thread_pool, embedding_device, get_model_worker_
|
||||||
import io
|
import io
|
||||||
from typing import List, Union, Callable, Dict, Optional, Tuple, Generator
|
from typing import List, Union, Callable, Dict, Optional, Tuple, Generator
|
||||||
import chardet
|
import chardet
|
||||||
|
import re
|
||||||
|
|
||||||
def validate_kb_name(knowledge_base_id: str) -> bool:
|
def validate_kb_name(knowledge_base_id: str) -> bool:
|
||||||
# 检查是否包含预期外的字符或路径攻击关键字
|
# 检查是否包含预期外的字符或路径攻击关键字
|
||||||
|
|
@ -314,6 +314,10 @@ class KnowledgeFile:
|
||||||
print("文件不存在")
|
print("文件不存在")
|
||||||
|
|
||||||
docs = docs or self.file2docs(refresh=refresh)
|
docs = docs or self.file2docs(refresh=refresh)
|
||||||
|
# After loading, strip each non-empty document and collapse runs of consecutive line breaks into one
|
||||||
|
for doc in docs:
|
||||||
|
if doc.page_content.strip()!="":
|
||||||
|
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
|
||||||
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
|
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
|
||||||
print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
|
print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}")
|
||||||
if not docs:
|
if not docs:
|
||||||
|
|
|
||||||
|
|
@ -97,7 +97,26 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
if _good_splits:
|
if _good_splits:
|
||||||
merged_text = self._merge_splits(_good_splits, _separator)
|
merged_text = self._merge_splits(_good_splits, _separator)
|
||||||
final_chunks.extend(merged_text)
|
final_chunks.extend(merged_text)
|
||||||
return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
|
|
||||||
|
final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
|
||||||
|
# Merge each single-line chunk into the chunk that follows it
|
||||||
|
return_chunks = []
|
||||||
|
temp_sencond = ""
|
||||||
|
for chunk in final_chunks:
|
||||||
|
if temp_sencond =="":
|
||||||
|
if len(chunk.splitlines()) <= 1:
|
||||||
|
temp_sencond = chunk
|
||||||
|
else:
|
||||||
|
return_chunks.append(chunk)
|
||||||
|
else:
|
||||||
|
return_chunks.append(temp_sencond + chunk)
|
||||||
|
temp_sencond = ""
|
||||||
|
|
||||||
|
if temp_sencond !="":
|
||||||
|
return_chunks.append(temp_sencond)
|
||||||
|
|
||||||
|
return return_chunks
|
||||||
|
#return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue