优化第一级目录分块

2023-12-05 16:51:45 +08:00 · 2023-12-05 16:51:45 +08:00 · c936f040e4
parent dce1d16e29
commit c936f040e4
5 changed files with 200 additions and 6 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/server/.DS_Store
+++ b/server/.DS_Store
--- a/server/knowledge_base/kb_doc_api.py
+++ b/server/knowledge_base/kb_doc_api.py
@ -75,12 +75,20 @@ def search_docs(query: str = Body(..., description="用户输入", examples=["
    print(f"search_docs, query:{query}")  
    docs = kb.search_docs(query, top_k, score_threshold)
    print(f"search_docs, docs:{docs}")
+
+    bFind = False
    if len(pre_doc) > 0:
        if docs is not None:
-            docs.append(pre_doc[0])
+            for tempDoc in docs:
+                if tempDoc[0].page_content == pre_doc[0][0].page_content:
+                    bFind = True
+                    break
        else:
-            docs = pre_doc[0]
-    
+             docs = pre_doc[0]     
+
+        if not bFind:
+            docs.append(pre_doc[0])
+
    data = [DocumentWithScore(**x[0].dict(), score=x[1]) for x in docs]

    return data
--- a/text_splitter/chinese_recursive_text_splitter.py
+++ b/text_splitter/chinese_recursive_text_splitter.py
@ -54,13 +54,15 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
        final_chunks = []
        # Get appropriate separator to use
        separator = separators[-1]
-        new_separators = [SPLIT_SEPARATOE]
+        new_separators = []
+        text = re.sub(r'(\n+前\s+言\n+)',  r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过前言分块
+        text = re.sub(r'(\n+\d+[^\S\n]+[^\s\.]+)', r"\n\n\n\n\n\n\n\n\n\n\1", text) #通过1 这样的
        text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+(?!\.|[a-zA-Z1-9]))', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过1.2 这样的章和节来分块
        text = re.sub(r'(\n+表\s*[A-Za-z0-9]+(\.[A-Za-z0-9]+)+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过表  A.4.a 
        text = re.sub(r'(\n+第\s*\S+\s*条\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过第 条
        text = re.sub(r'(\n+第\s*\S+\s*章\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过第 条
        text = re.sub(r'(\n+(一、|二、|三、|四、|五、|六、|七、|八、|九、|十、|十一、|十二、|十三、|十四、|十五、|十六、|十七、|十八、|十九、|二十、))', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 通过第 条
-        text = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 再通过 1.2 来分块
+        text = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", text)  # 再通过 1.2 来分块
        text = text.rstrip()  # 段尾如果有多余的\n就去掉它
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
@ -89,7 +91,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
                if not new_separators:
                    final_chunks.append(s)
                else:
-                    s = re.sub(r'(\s+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s)  # 再通过 1.2.3 来分块
+                    s = re.sub(r'(\n+[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s*\.\s*[a-zA-Z1-9]+\s+)', r"\n\n\n\n\n\n\n\n\n\n\1", s)  # 再通过 1.2.3 来分块
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits:
--- a/text_splitter/chinese_recursive_text_splitter_new.py
+++ b/text_splitter/chinese_recursive_text_splitter_new.py