diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index 6fd0ef7..dc0e6ae 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -25,7 +25,7 @@ from server.utils import run_in_thread_pool, embedding_device, get_model_worker_ import io from typing import List, Union, Callable, Dict, Optional, Tuple, Generator import chardet - +import re def validate_kb_name(knowledge_base_id: str) -> bool: # 检查是否包含预期外的字符或路径攻击关键字 @@ -314,6 +314,10 @@ class KnowledgeFile: print("文件不存在") docs = docs or self.file2docs(refresh=refresh) + # After loading, strip each non-blank document and collapse runs of consecutive line breaks into one + for doc in docs: + if doc.page_content.strip()!="": + doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip()) file_name_without_extension, file_extension = os.path.splitext(self.filepath) print(f"filepath:{self.filepath},文件名拆分后:{file_name_without_extension},{file_extension}") if not docs: diff --git a/text_splitter/chinese_recursive_text_splitter.py b/text_splitter/chinese_recursive_text_splitter.py index a4ac4fc..ac2448b 100644 --- a/text_splitter/chinese_recursive_text_splitter.py +++ b/text_splitter/chinese_recursive_text_splitter.py @@ -97,7 +97,26 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): if _good_splits: merged_text = self._merge_splits(_good_splits, _separator) final_chunks.extend(merged_text) - return [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""] + + final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""] + # Merge each chunk of at most one line into the chunk that follows it, with no separator (NOTE(review): the original note also mentioned two-line chunks, but the check below is `<= 1` - confirm intent) + return_chunks = [] + temp_sencond = "" + for chunk in final_chunks: + if temp_sencond =="": + if len(chunk.splitlines()) <= 1: + temp_sencond = chunk + else: + return_chunks.append(chunk) + else: + return_chunks.append(temp_sencond + chunk) + temp_sencond = "" + + if temp_sencond !="": + return_chunks.append(temp_sencond) + + return return_chunks + #return [re.sub(r"\n{2,}", "\n", 
chunk.strip()) for chunk in final_chunks if chunk.strip()!=""] if __name__ == "__main__":