From 74f4f8174d18aecc22da24d2d7f896bc8bac9bdf Mon Sep 17 00:00:00 2001 From: weiweiw <14335254+weiweiw22@user.noreply.gitee.com> Date: Mon, 13 Jan 2025 11:30:14 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=A0=87=E9=A2=98=E5=A2=9E?= =?UTF-8?q?=E5=BC=BA=E6=96=87=E6=A1=A3=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../chinese_recursive_text_splitter.py | 94 ++++++++++++++----- 1 file changed, 73 insertions(+), 21 deletions(-) diff --git a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py index 9c41b8c..ab66d99 100644 --- a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py +++ b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py @@ -1,4 +1,4 @@ -import logging +#ChineseRecursiveTextSplitter import re from typing import Any, List, Optional @@ -9,9 +9,13 @@ from chatchat.utils import build_logger logger = build_logger() +First_SEPARATOE = "\n\n\n\n\n\n\n\n\n\n" +Second_SEPARATOE = "\n\n\n\n\n\n\n\n" +Third_SEPARATOE = "\n\n\n\n\n\n" +Fourth_SEPARATOE = "\n\n\n\n" -def _split_text_with_regex_from_end( - text: str, separator: str, keep_separator: bool +def _customer_split_text_with_regex_from_end( + text: str, separator: str, keep_separator: bool ) -> List[str]: # Now that we have the separator, split the text if separator: @@ -28,33 +32,60 @@ def _split_text_with_regex_from_end( splits = list(text) return [s for s in splits if s != ""] +def customerLen(text:str)->int: + length = len(re.sub(r'[\s\n]+', '', text)) + return length class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): def __init__( - self, - separators: Optional[List[str]] = None, - keep_separator: bool = True, - is_separator_regex: bool = True, - **kwargs: Any, + self, + separators: Optional[List[str]] = None, + keep_separator: bool = True, + is_separator_regex: bool = True, + **kwargs: Any, ) -> None: """Create a new TextSplitter.""" super().__init__(keep_separator=keep_separator, **kwargs) self._separators = separators or [ - "\n\n", - "\n", - "。|!|?", - "\.\s|\!\s|\?\s", - ";|;\s", - ",|,\s", + First_SEPARATOE, + Second_SEPARATOE, + Third_SEPARATOE, + Fourth_SEPARATOE ] self._is_separator_regex = is_separator_regex + self.is_recursive = False + self._length_function = customerLen def _split_text(self, text: str, separators: List[str]) -> List[str]: """Split incoming text and return chunks.""" + #print(f"***********************************ChineseRecursiveTextSplitter***********************************") final_chunks = [] # Get appropriate separator to use separator = separators[-1] new_separators = [] + if self.is_recursive == False: + #一级目录 + text = re.sub(r'(\n+前\s+言\n+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块 + text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的 + text = re.sub(r'(手工分段\*\*\s*)', r"\n\n\n\n\n\n\n\n\n\n", text) # 将“手工分段**”替换 + text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 章 + + #二级目录 + text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\1", text) # 通过表 A.2 + text = re.sub(r'(\n+(?