From 7f8855afe0f44b1b7c056b9686b45d8ca0284675 Mon Sep 17 00:00:00 2001
From: wvivi2023
Date: Tue, 27 Feb 2024 11:05:55 +0800
Subject: [PATCH] add a new api to search content by keyword for es

---
Review notes (this section is ignored by `git am`):
- search_content always returns a list, matching the declared
  response_model=List[Document]; debug print() calls were removed.
- KBService.searchbyContent is a concrete hook that returns [] instead of an
  abstract method, so backends that do not override it stay instantiable.
- DefaultKBService.searchbyContent accepts the (query, top_k) arguments the
  base class passes to it.
- ESKBService.searchbyContent passes the raw query to the `match` query
  (which does not interpret `*` wildcards) and returns [] when the index
  does not exist.
- Context-line indentation was reconstructed from a whitespace-collapsed copy
  of this patch and the post-image blob ids on the `index` lines were not
  recomputed; apply with `--ignore-whitespace` if the context does not match.

 server/api.py                                 |  8 ++++++-
 server/knowledge_base/kb_doc_api.py           | 16 ++++++++++++++
 server/knowledge_base/kb_service/base.py      | 18 +++++++++++++++
 .../kb_service/default_kb_service.py          |  4 ++++
 .../kb_service/es_kb_service.py               | 22 +++++++++++++++++++
 5 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/server/api.py b/server/api.py
index 4e18e47..69d9c76 100644
--- a/server/api.py
+++ b/server/api.py
@@ -144,7 +144,7 @@ def mount_knowledge_routes(app: FastAPI):
     from server.knowledge_base.kb_doc_api import (list_files, upload_docs, delete_docs,
                                                 update_docs, download_doc, recreate_vector_store,
                                                 search_docs, DocumentWithVSId, update_info,
-                                                update_docs_by_id,)
+                                                update_docs_by_id, search_content, Document)
 
     app.post("/chat/knowledge_base_chat",
              tags=["Chat"],
@@ -189,6 +189,12 @@ def mount_knowledge_routes(app: FastAPI):
              summary="搜索知识库"
              )(search_docs)
 
+    app.post("/knowledge_base/search_content",
+             tags=["Knowledge Base Management"],
+             response_model=List[Document],
+             summary="搜索文档库"
+             )(search_content)
+
     app.post("/knowledge_base/update_docs_by_id",
              tags=["Knowledge Base Management"],
              response_model=BaseResponse,
diff --git a/server/knowledge_base/kb_doc_api.py b/server/knowledge_base/kb_doc_api.py
index 6eae1b0..381ec74 100644
--- a/server/knowledge_base/kb_doc_api.py
+++ b/server/knowledge_base/kb_doc_api.py
@@ -80,6 +80,22 @@ def search_docs(
             data = kb.list_docs(file_name=file_name, metadata=metadata)
     return data
 
+
+def search_content(
+        query: str = Body("", description="用户输入", examples=["国网安徽信通准入手续"]),
+        knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
+        top_k: int = Body(2, description="匹配文档数"),
+) -> List[Document]:
+    """Search a knowledge base by keyword and return the matching documents.
+
+    Always returns a list (empty when the knowledge base is unknown or the
+    query is empty) so the response matches the declared response model.
+    """
+    kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
+    if kb is None or not query:
+        return []
+    return kb.search_content(query, top_k) or []
+
 
 def update_docs_by_id(
         knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
diff --git a/server/knowledge_base/kb_service/base.py b/server/knowledge_base/kb_service/base.py
index bd5a54e..344fff5 100644
--- a/server/knowledge_base/kb_service/base.py
+++ b/server/knowledge_base/kb_service/base.py
@@ -181,6 +181,13 @@ class KBService(ABC):
         docs = self.do_search(query, top_k, score_threshold)
         return docs
 
+    def search_content(self,
+                       query: str,
+                       top_k: int,
+                       ) -> List[Document]:
+        """Keyword search; delegates to the backend's ``searchbyContent``."""
+        return self.searchbyContent(query, top_k)
+
     def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
         return []
 
@@ -266,6 +273,17 @@ class KBService(ABC):
         """
         pass
 
+    def searchbyContent(self,
+                        query: str,
+                        top_k: int,
+                        ) -> List[Document]:
+        """
+        Keyword-search hook. Backends that support full-text search override
+        this; the default returns no results so that backends which do not
+        override it can still be instantiated.
+        """
+        return []
+
     @abstractmethod
     def do_add_doc(self,
                    docs: List[Document],
diff --git a/server/knowledge_base/kb_service/default_kb_service.py b/server/knowledge_base/kb_service/default_kb_service.py
index a68d59c..f5f978f 100644
--- a/server/knowledge_base/kb_service/default_kb_service.py
+++ b/server/knowledge_base/kb_service/default_kb_service.py
@@ -36,3 +36,7 @@ class DefaultKBService(KBService):
 
     def do_delete_doc(self):
         pass
+
+    def searchbyContent(self, query: str, top_k: int = 2):
+        """No keyword search in the default service; always returns []."""
+        return []
diff --git a/server/knowledge_base/kb_service/es_kb_service.py b/server/knowledge_base/kb_service/es_kb_service.py
index 54c3c12..91bce60 100644
--- a/server/knowledge_base/kb_service/es_kb_service.py
+++ b/server/knowledge_base/kb_service/es_kb_service.py
@@ -156,6 +156,28 @@ class ESKBService(KBService):
                                          k=top_k)
         return docs
 
+    def searchbyContent(self, query: str, top_k: int = 2):
+        """
+        Full-text search on the ``context`` field of this service's index.
+
+        Returns up to ``top_k`` documents, or an empty list when the index
+        does not exist.
+        """
+        if not self.es_client_python.indices.exists(index=self.index_name):
+            return []
+        # ``match`` analyzes the query text and does not interpret wildcards,
+        # so the raw query is passed as-is.
+        body = {"query": {"match": {"context": query}}}
+        search_results = self.es_client_python.search(index=self.index_name,
+                                                      body=body, size=top_k)
+        return [
+            Document(
+                page_content=hit["_source"]["context"],
+                metadata=hit["_source"]["metadata"],
+            )
+            for hit in search_results["hits"]["hits"]
+        ]
+
     def del_doc_by_ids(self, ids: List[str]) -> bool:
         for doc_id in ids:
             try: