将语义检索和词汇检索相结合提高检索准确性
This commit is contained in:
parent
3b49f2da54
commit
ca9b09549f
|
|
@ -21,8 +21,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from configs import USE_RANKING
|
from configs import USE_RANKING
|
||||||
import jieba
|
import jieba
|
||||||
|
from typing import List, Dict,Tuple
|
||||||
|
|
||||||
|
|
||||||
def search_docs(
|
def search_docs(
|
||||||
query: str = Body("", description="用户输入", examples=["你好"]),
|
query: str = Body("", description="用户输入", examples=["你好"]),
|
||||||
|
|
@ -42,8 +41,12 @@ def search_docs(
|
||||||
if query:
|
if query:
|
||||||
print(f"search_docs, query:{query}")
|
print(f"search_docs, query:{query}")
|
||||||
docs = kb.search_docs(query, FIRST_VECTOR_SEARCH_TOP_K, score_threshold)
|
docs = kb.search_docs(query, FIRST_VECTOR_SEARCH_TOP_K, score_threshold)
|
||||||
print(f"search_docs, docs:{docs}")
|
print(f"search_docs,len of docs {len(docs)}, docs:{docs}")
|
||||||
|
|
||||||
|
docs_key = kb.search_content_internal(query,2)
|
||||||
|
print(f"search_content_internal, len of docs {len(docs)}, docs:{docs_key}")
|
||||||
|
docs = merge_and_deduplicate(docs, docs_key)
|
||||||
|
print(f"after merge_and_deduplicate, len of docs:{docs}")
|
||||||
if USE_RANKING:
|
if USE_RANKING:
|
||||||
queryList = []
|
queryList = []
|
||||||
queryList.append(query)
|
queryList.append(query)
|
||||||
|
|
@ -80,6 +83,30 @@ def search_docs(
|
||||||
data = kb.list_docs(file_name=file_name, metadata=metadata)
|
data = kb.list_docs(file_name=file_name, metadata=metadata)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def merge_and_deduplicate(list1: List[Tuple[Document, float]], list2: List[Tuple[Document, float]]) -> List[Tuple[Document, float]]:
    """Merge two (Document, score) result lists, dropping duplicate documents.

    Two entries count as duplicates when their ``page_content`` strings are
    identical. The first occurrence wins — items from ``list1`` take
    precedence over ``list2`` — and insertion order is preserved.

    Args:
        list1: Primary result list (e.g. from vector/semantic search).
        list2: Secondary result list (e.g. from keyword/lexical search).

    Returns:
        Deduplicated list of (Document, score) tuples.
    """
    merged_dict: Dict[str, Tuple[Document, float]] = {}
    # One pass over both lists replaces the original pair of duplicated
    # loops; setdefault keeps the first tuple seen per page_content.
    for document, score in list1 + list2:
        merged_dict.setdefault(document.page_content, (document, score))
    return list(merged_dict.values())
|
||||||
|
|
||||||
|
|
||||||
def search_content(
|
def search_content(
|
||||||
query: str = Body("", description="用户输入", examples=["国网安徽信通准入手续"]),
|
query: str = Body("", description="用户输入", examples=["国网安徽信通准入手续"]),
|
||||||
knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
|
knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
|
||||||
|
|
|
||||||
|
|
@ -196,6 +196,13 @@ class KBService(ABC):
|
||||||
docs = self.searchbyContent(query,top_k)
|
docs = self.searchbyContent(query,top_k)
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
def search_content_internal(self,
|
||||||
|
query: str,
|
||||||
|
top_k: int,
|
||||||
|
)->List[Tuple[Document, float]]:
|
||||||
|
docs = self.searchbyContentInternal(query,top_k)
|
||||||
|
return docs
|
||||||
|
|
||||||
def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
|
def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
@ -290,6 +297,16 @@ class KBService(ABC):
|
||||||
搜索知识库子类实自己逻辑
|
搜索知识库子类实自己逻辑
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
    @abstractmethod
    def searchbyContentInternal(self,
                                query: str,
                                top_k: int,
                                )->List[Tuple[Document, float]]:
        """
        Lexical (keyword) search hook; each knowledge-base subclass
        implements its own backend-specific logic.

        Args:
            query: User query text.
            top_k: Maximum number of hits to return.

        Returns:
            List of (Document, score) tuples.
        """
        pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def do_add_doc(self,
|
def do_add_doc(self,
|
||||||
|
|
|
||||||
|
|
@ -197,6 +197,28 @@ class ESKBService(KBService):
|
||||||
))
|
))
|
||||||
return docs_and_scores
|
return docs_and_scores
|
||||||
|
|
||||||
|
def searchbyContentInternal(self, query:str, top_k: int = 2):
|
||||||
|
if self.es_client_python.indices.exists(index=self.index_name):
|
||||||
|
print(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
|
||||||
|
tem_query = {
|
||||||
|
"query": {"match": {
|
||||||
|
"context": "*" + query + "*"
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
search_results = self.es_client_python.search(index=self.index_name, body=tem_query, size=top_k)
|
||||||
|
hits = [hit for hit in search_results["hits"]["hits"]]
|
||||||
|
docs_and_scores = [
|
||||||
|
(
|
||||||
|
Document(
|
||||||
|
page_content=hit["_source"]["context"],
|
||||||
|
metadata=hit["_source"]["metadata"],
|
||||||
|
),
|
||||||
|
1.3,
|
||||||
|
)
|
||||||
|
for hit in hits
|
||||||
|
]
|
||||||
|
return docs_and_scores
|
||||||
|
|
||||||
def del_doc_by_ids(self, ids: List[str]) -> bool:
|
def del_doc_by_ids(self, ids: List[str]) -> bool:
|
||||||
for doc_id in ids:
|
for doc_id in ids:
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -117,6 +117,10 @@ class FaissKBService(KBService):
|
||||||
def searchbyContent(self):
|
def searchbyContent(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def searchbyContentInternal(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Manual smoke test: build a FAISS-backed service for the "test"
    # knowledge base and index README.md into it.
    faissService = FaissKBService("test")
    faissService.add_doc(KnowledgeFile("README.md", "test"))
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue