0.2.6 enhance

2023-11-13 09:20:19 +08:00 · 2023-11-13 09:20:19 +08:00 · 60a12c05f6
parent 4cdd2a5e79
commit 60a12c05f6
12 changed files with 88 additions and 30 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/img/siji.jpg
+++ b/img/siji.jpg
--- a/server/.DS_Store
+++ b/server/.DS_Store
--- a/server/knowledge_base/kb_doc_api.py
+++ b/server/knowledge_base/kb_doc_api.py
@@ -41,11 +41,13 @@ def search_docs(query: str = Body(..., description="用户输入", examples=["
    
    print(f"search_docs, query:{query}")
    docs = kb.search_docs(query, top_k, score_threshold)
+    if len(pre_doc) > 0:
+        if docs is not None:
+            docs.append(pre_doc[0])
+        else:
+            docs = pre_doc[0]
    data = [DocumentWithScore(**x[0].dict(), score=x[1]) for x in docs]
-    # i = 1
-    # for x in docs:
-    #     print(f"相似文档 {i}: {x}")
-    #     i = i+1
+

    return data

--- a/server/knowledge_base/kb_service/faiss_kb_service.py
+++ b/server/knowledge_base/kb_service/faiss_kb_service.py
@@ -63,7 +63,7 @@ class FaissKBService(KBService):
        print(f"do_search,top_k:{top_k},score_threshold:{score_threshold}")
        with self.load_vector_store().acquire() as vs:
            docs = vs.similarity_search_with_score(query, k=top_k, score_threshold=score_threshold)
-        print(f"do_search,docs:{docs}")
+        #print(f"do_search,docs:{docs}")
        return docs

    def do_add_doc(self,
--- a/server/knowledge_base/utils.py
+++ b/server/knowledge_base/utils.py
@@ -68,7 +68,7 @@ def load_embeddings(model: str = EMBEDDING_MODEL, device: str = embedding_device
    from server.knowledge_base.kb_cache.base import embeddings_pool
    return embeddings_pool.load_embeddings(model=model, device=device)

-
+#PDFPlumberLoader
 LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
               "UnstructuredMarkdownLoader": ['.md'],
               "CustomJSONLoader": [".json"],
@@ -302,6 +302,8 @@ class KnowledgeFile:
        text_splitter: TextSplitter = None,
    ):
        docs = docs or self.file2docs(refresh=refresh)
+        file_name_without_extension, file_extension = os.path.splitext(self.filepath)
+        print(f"filepath:{self.filepath},文件名拆分后：{file_name_without_extension},{file_extension}")
        if not docs:
            return []
        if self.ext not in [".csv"]:
@@ -314,13 +316,27 @@ class KnowledgeFile:
                    if doc.metadata:
                        doc.metadata["source"] = os.path.basename(self.filepath)
            else:
+                print(f"**********************docs2texts: text_splitter.split_documents(docs)")
+                outputfile = file_name_without_extension + "_source.txt"
+                with open(outputfile, 'w') as file:
+                    for doc in docs:
+                        file.write(doc.page_content)
                docs = text_splitter.split_documents(docs)
                
        #print(f"文档切分示例：{docs[0]}")
-        i = 0
-        for doc in docs:
-            print(f"**********切分段{i}：{doc}")
-            i = i+1
+        # print(f"KnowledgeFile: filepath:{self.filepath}")
+        # file_name_without_extension, file_extension = os.path.splitext(self.filepath)
+        # print("filepath:{self.filepath},文件名拆分后：{file_name_without_extension},{file_extension}")
+
+        i = 1
+        outputfile = file_name_without_extension + "_split.txt"
+        # 打开文件以写入模式
+        with open(outputfile, 'w') as file:
+            for doc in docs:
+                print(f"**********切分段{i}：{doc}")
+                file.write(f"分段{i}")
+                file.write(doc.page_content)
+                i = i+1
           
        if zh_title_enhance:
            docs = func_zh_title_enhance(docs)
@@ -407,7 +423,8 @@ if __name__ == "__main__":
    kb_file = KnowledgeFile(filename="test.txt", knowledge_base_name="samples")
    # kb_file.text_splitter_name = "RecursiveCharacterTextSplitter"
    docs = kb_file.file2docs()
-    pprint(docs[-1])
+    #pprint(docs[-1])

-    docs = kb_file.file2text()
-    pprint(docs[-1])
+    docs  = kb_file.docs2texts()
+    #docs = kb_file.file2text()
+    #pprint(docs[-1])
--- a/test.py
+++ b/test.py
@@ -13,9 +13,9 @@ if __name__ == '__main__':
    # pprint(docs[-1])

    faissService = FaissKBService("test")
-    faissService.add_doc(KnowledgeFile("国网安徽信通公司安全准入实施要求_修订.docx", "test"))
+    faissService.add_doc(KnowledgeFile("电力电缆故障测寻车技术规范.docx", "test"))
    # faissService.delete_doc(KnowledgeFile("README.md", "test"))
    # faissService.do_drop_kb()
-    print(faissService.search_docs("准入手续的内容是什么？"))
+    #print(faissService.search_docs("准入手续的内容是什么？"))


--- a/text_splitter/chinese_recursive_text_splitter.py
+++ b/text_splitter/chinese_recursive_text_splitter.py
--- a/text_splitter/柔性直流系统直流断路器验收规范.pdf
+++ b/text_splitter/柔性直流系统直流断路器验收规范.pdf
--- a/text_splitter/电力电缆故障测寻车技术规范.pdf
+++ b/text_splitter/电力电缆故障测寻车技术规范.pdf
--- a/webui.py
+++ b/webui.py
@@ -17,7 +17,7 @@ if __name__ == "__main__":
        menu_items={
            'Get Help': 'https://github.com/chatchat-space/Langchain-Chatchat',
            'Report a bug': "https://github.com/chatchat-space/Langchain-Chatchat/issues",
-            'About': f"""欢迎使用 Langchain-Chatchat WebUI {VERSION}！"""
+            'About': f"""欢迎使用 思极大模型 WebUI {VERSION}！"""
        }
    )

@@ -36,7 +36,8 @@ if __name__ == "__main__":
        st.image(
            os.path.join(
                "img",
-                "logo-long-chatchat-trans-v2.png"
+                "siji.jpg"
+                #"logo-long-chatchat-trans-v2.png"
            ),
            use_column_width=True
        )
--- a/电力电缆故障测寻车技术规范.pdf
+++ b/电力电缆故障测寻车技术规范.pdf