diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index f1dbb47..2bcb353 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -11,6 +11,7 @@ from configs import ( TEXT_SPLITTER_NAME, ) import importlib +from text_splitter import zh_third_title_enhance from text_splitter import zh_second_title_enhance from text_splitter import zh_first_title_enhance import langchain.document_loaders @@ -371,6 +372,7 @@ class KnowledgeFile: return [] #先给二级下 被分开的三级目录分块 增加二级标题,再给分开的二级目录增加一级标题,然后给整个文档的所有分块增加文档标题分块 if zh_title_enhance: + docs = zh_third_title_enhance(docs) docs = zh_second_title_enhance(docs) docs = zh_first_title_enhance(docs) docs = customize_zh_title_enhance(docs) diff --git a/text_splitter/__init__.py b/text_splitter/__init__.py index 44baf2d..49821d5 100644 --- a/text_splitter/__init__.py +++ b/text_splitter/__init__.py @@ -2,5 +2,6 @@ from .chinese_text_splitter import ChineseTextSplitter from .ali_text_splitter import AliTextSplitter from .zh_title_enhance import zh_title_enhance from .chinese_recursive_text_splitter import ChineseRecursiveTextSplitter +from .zh_second_title_enhance import zh_third_title_enhance from .zh_second_title_enhance import zh_second_title_enhance -from .zh_second_title_enhance import zh_first_title_enhance \ No newline at end of file +from .zh_second_title_enhance import zh_first_title_enhance diff --git a/text_splitter/chinese_recursive_text_splitter.py b/text_splitter/chinese_recursive_text_splitter.py index 5712d15..78f8e55 100644 --- a/text_splitter/chinese_recursive_text_splitter.py +++ b/text_splitter/chinese_recursive_text_splitter.py @@ -9,6 +9,7 @@ logger = logging.getLogger(__name__) First_SEPARATOE = "\n\n\n\n\n\n\n\n\n\n" Second_SEPARATOE = "\n\n\n\n\n\n\n\n" Third_SEPARATOE = "\n\n\n\n\n\n" +Fourth_SEPARATOE = "\n\n\n\n" def _split_text_with_regex_from_end( text: str, separator: str, keep_separator: bool ) -> List[str]: @@ -41,7 +42,8 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): self._separators = separators or [ First_SEPARATOE, Second_SEPARATOE, - Third_SEPARATOE + Third_SEPARATOE, + Fourth_SEPARATOE #"\n\n", #"\n", # "。|!|?", @@ -60,21 +62,26 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): separator = separators[-1] new_separators = [] if self.is_recursive == False: + #一级目录 text = re.sub(r'(\n+前\s+言\n+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块 text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的 text = re.sub(r'(手工分段\*\*\s*)', r"\n\n\n\n\n\n\n\n\n\n", text) # 将“手工分段**”替换 text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过第 章 - #text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) # 通过表 A.2 + #二级目录 text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\s*\.\s*[A-Za-z0-9]+)*\s+)', r"\n\n\n\n\n\n\n\n\1", text) # 通过表 A.2 text = re.sub(r'(\n+(? str: + # 文本长度为0的话,肯定不是title + if len(text) == 0 and len (text)>= 25: + print("Not a title. Text is empty or longer than 25.") + return "" + splitlines = text.splitlines() + first_line = splitlines[0] + # 文本中有标点符号,就不是title + ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z" + ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN) + if ENDS_IN_PUNCT_RE.search(first_line) is not None: + return "" + + Third_TITLE = r'((?1: + Second_line = splitlines[1] + if TITLE_PUNCT_RE.search(Second_line) is not None: + return Second_line + return "" + +#judge if it is 4th level content +def is_fourth_level_content( + text: str, +) -> bool: + # 文本长度为0的话,肯定不是title + if len(text) == 0: + print("Not a title. Text is empty.") + return False + + splitlines = text.splitlines() + first_line = splitlines[0] + + Third_TITLE = r'((? Document: + title = None + print(f"zh_third_title_enhance ....") + if len(docs) > 0: + for doc in docs: + print(f"zh_third_title_enhance: {doc}") + third_title = get_third_level_title(doc.page_content) + if third_title: + title = third_title + print(f"title: {title}") + elif title: + print(f"title is not none") + temp_fourth_content = is_fourth_level_content(doc.page_content) + if temp_fourth_content: + print(f"is_fourth_level_content : {temp_fourth_content}") + doc.page_content = f"{title} {doc.page_content}" + else: + title = None + print(f"final title: {title}") + return docs + else: + print("zh_third_title_enhance 文件不存在") + #给三级被分开的内容 增加二级标题 def zh_second_title_enhance(docs: Document) -> Document: title = None