From d096443b038fca4b1e500c3450633e1c65c2775f Mon Sep 17 00:00:00 2001 From: weiweiw <14335254+weiweiw22@user.noreply.gitee.com> Date: Mon, 13 Jan 2025 10:56:24 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=A0=87=E9=A2=98=E5=A2=9E?= =?UTF-8?q?=E5=BC=BA=E6=96=87=E6=A1=A3=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../chatchat/server/knowledge_base/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/libs/chatchat-server/chatchat/server/knowledge_base/utils.py b/libs/chatchat-server/chatchat/server/knowledge_base/utils.py index 521d4e5..b76fc8d 100644 --- a/libs/chatchat-server/chatchat/server/knowledge_base/utils.py +++ b/libs/chatchat-server/chatchat/server/knowledge_base/utils.py @@ -21,7 +21,7 @@ from chatchat.server.file_rag.text_splitter import ( ) from chatchat.server.utils import run_in_process_pool, run_in_thread_pool from chatchat.utils import build_logger - +import re logger = build_logger() @@ -333,6 +333,7 @@ class KnowledgeFile: self.kb_name = knowledge_base_name self.filename = str(Path(filename).as_posix()) self.ext = os.path.splitext(filename)[-1].lower() + self.doc_title_name, file_extension = os.path.splitext(filename) if self.ext not in SUPPORTED_EXTS: raise ValueError(f"暂未支持的文件格式 {self.filename}") self.loader_kwargs = loader_kwargs @@ -341,6 +342,7 @@ class KnowledgeFile: self.splited_docs = None self.document_loader_name = get_LoaderClass(self.ext) self.text_splitter_name = Settings.kb_settings.TEXT_SPLITTER_NAME + print(f"KnowledgeFile: filepath:{self.filepath}") def file2docs(self, refresh: bool = False): if self.docs is None or refresh: @@ -366,6 +368,7 @@ class KnowledgeFile: chunk_overlap: int = Settings.kb_settings.OVERLAP_SIZE, text_splitter: TextSplitter = None, ): + #add the title name on every paragraph, by weiweiwang 2025/1/13 def customize_zh_title_enhance(docs: Document) -> Document: if len(docs) > 0: for doc in docs: @@ 
-375,6 +378,11 @@ class KnowledgeFile: print("文件不存在") docs = docs or self.file2docs(refresh=refresh) + # strip each loaded document and collapse runs of consecutive line breaks into a single one, by weiweiwang 2025/1/13 + for doc in docs: + if doc.page_content.strip() != "": + doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip()) + if not docs: return [] if self.ext not in [".csv"]: