manually split

2023-12-15 08:59:13 +08:00 · 2023-12-15 08:59:13 +08:00 · 77bc5891c8
parent bf21b8f116
commit 77bc5891c8
3 changed files with 5 additions and 4 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/server/chat/knowledge_base_chat.py
+++ b/server/chat/knowledge_base_chat.py
@ -39,7 +39,7 @@ async def knowledge_base_chat(query: str = Body(..., description="用户输入",
        return BaseResponse(code=404, msg=f"未找到知识库 {knowledge_base_name}")
    history = [History.from_data(h) for h in history]
-    print(f"server/chat/knowledge_base_chat function, history:{history}")
+    print(f"******server/chat/knowledge_base_chat function, history:{history}")
    async def knowledge_base_chat_iterator(query: str,
                                           top_k: int,
                                           history: Optional[List[History]],
--- a/text_splitter/chinese_recursive_text_splitter.py
+++ b/text_splitter/chinese_recursive_text_splitter.py
@ -57,12 +57,13 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
        new_separators = []
        text = re.sub(r'(\n+前\s+言\n+)',  r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块
        text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的
-        text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过1.2 这样的章和节来分块
+        text = re.sub(r'(\n+[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s+(?!\.|[a-zA-Z0-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过\n1.2 这样的章和节来分块
        text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\.[A-Za-z0-9]+)+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过表  A.4.a 
        text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过第 条
        text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过第 条
        text = re.sub(r'(\n+(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过第 条
-        text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 再通过 1.2 来分块
+        text = re.sub(r'(手工分段\*\*\s+)', r"\n\n\n\n\n\n\n\n\n\n", text)  # 通过“手工分段**”
        #text = re.sub(r'(\n+[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 再通过 1.2 来分块
        text = text.rstrip()  # 段尾如果有多余的\n就去掉它
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
@ -91,7 +92,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
                if not new_separators:
                    final_chunks.append(s)
                else:
-                    s = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s)  # 再通过 1.2.3 来分块
+                    s = re.sub(r'(\n+[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s*\.\s*[a-zA-Z0-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s)  # 再通过 1.2.3 来分块
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits: