"""Text splitter that chunks documents on long runs of newline characters."""

import logging
import re
from typing import Any, List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter

logger = logging.getLogger(__name__)

# Separators are plain runs of newlines, from the coarsest level (10) to the
# finest (4).  The heading-detection substitutions in
# ChineseRecursiveTextSplitter._split_text insert these runs into the text so
# that it can later be split on them.
First_SEPARATOE = "\n" * 10
Second_SEPARATOE = "\n" * 8
Third_SEPARATOE = "\n" * 6
Fourth_SEPARATOE = "\n" * 4


def _split_text_with_regex_from_end(
    text: str, separator: str, keep_separator: bool
) -> List[str]:
    """Split *text* on *separator*, optionally keeping each delimiter.

    Args:
        text: The text to split.
        separator: Pattern handed to ``re.split`` unescaped, i.e. it is
            interpreted as a regular expression.  An empty separator splits
            the text into individual characters.
        keep_separator: When true, every matched delimiter is kept and
            attached to the END of the piece that precedes it.

    Returns:
        The resulting pieces in order, with empty strings removed.
    """
    if not separator:
        pieces = list(text)
    elif keep_separator:
        # Wrapping the pattern in a capture group makes re.split return the
        # delimiters as well: [text0, sep0, text1, sep1, ..., textN].
        parts = re.split(f"({separator})", text)
        # Glue each delimiter onto the text that comes before it.
        pieces = [body + delim for body, delim in zip(parts[0::2], parts[1::2])]
        # An odd number of parts means a trailing text without a delimiter.
        if len(parts) % 2 == 1:
            pieces += parts[-1:]
    else:
        pieces = re.split(separator, text)
    return [piece for piece in pieces if piece != ""]


def customerLen(text: str) -> int:
    """Return the number of non-whitespace characters in *text*."""
    return len(re.sub(r"\s+", "", text))


class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        is_separator_regex: bool = True,
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter.

        Args:
            separators: Separator patterns to use.  When omitted or empty, the
                four module-level newline-run separators are used.
            keep_separator: Forwarded to the base class initializer.
            is_separator_regex: Stored on the instance as
                ``_is_separator_regex``.
            **kwargs: Forwarded unchanged to the base class initializer.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        # Finer-grained alternatives (blank line, single newline, sentence
        # and clause punctuation) were commented out in the original and
        # remain disabled.
        self._separators = separators or [
            First_SEPARATOE,
            Second_SEPARATOE,
            Third_SEPARATOE,
            Fourth_SEPARATOE,
        ]
        self._is_separator_regex = is_separator_regex
        # Read by _split_text, which runs its heading-marking substitutions
        # only while this flag is False.
        self.is_recursive = False
        # Assigned after the base initializer, so lengths are always measured
        # in non-whitespace characters.
        # NOTE(review): this presumably overrides any `length_function`
        # passed via kwargs -- confirm that is intended.
        self._length_function = customerLen

    # NOTE(review): the method below is still in its collapsed single-line form; it continues on the following line and must be re-indented together with it.
    def _split_text(self, text: str, separators: List[str]) -> List[str]: """Split incoming text and return chunks.""" #print(f"***********************************ChineseRecursiveTextSplitter***********************************") final_chunks = [] # Get appropriate separator to use separator = separators[-1] new_separators = [] if self.is_recursive == False: #一级目录 text = re.sub(r'(\n+前\s+言\n+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块 text = 
re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的 text = re.sub(r'(手工分段\*\*\s*)', r"\n\n\n\n\n\n\n\n\n\n", text) # 将“手工分段**”替换 text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 章 #二级目录 text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\1", text) # 通过表 A.2 text = re.sub(r'(\n+(?