将语义检索和词汇检索相结合提高检索准确性

This commit is contained in:
wvivi2023 2024-03-12 18:24:10 +08:00
parent 3b49f2da54
commit ca9b09549f
4 changed files with 74 additions and 4 deletions

View File

@ -21,8 +21,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from configs import USE_RANKING
import jieba
from typing import List, Dict,Tuple
def search_docs(
query: str = Body("", description="用户输入", examples=["你好"]),
@ -42,8 +41,12 @@ def search_docs(
if query:
print(f"search_docs, query:{query}")
docs = kb.search_docs(query, FIRST_VECTOR_SEARCH_TOP_K, score_threshold)
print(f"search_docs, docs:{docs}")
print(f"search_docs,len of docs {len(docs)}, docs:{docs}")
docs_key = kb.search_content_internal(query,2)
print(f"search_content_internal, len of docs {len(docs)}, docs:{docs_key}")
docs = merge_and_deduplicate(docs, docs_key)
print(f"after merge_and_deduplicate, len of docs:{docs}")
if USE_RANKING:
queryList = []
queryList.append(query)
@ -80,6 +83,30 @@ def search_docs(
data = kb.list_docs(file_name=file_name, metadata=metadata)
return data
def merge_and_deduplicate(list1: List[Tuple[Document, float]], list2: List[Tuple[Document, float]]) -> List[Tuple[Document, float]]:
    """
    Merge two scored result lists, dropping entries whose text is already present.

    Entries are keyed by ``document.page_content``. The first entry seen for a
    given text wins, so on a clash the tuple from ``list1`` (with its score) is
    kept and the one from ``list2`` is discarded. Output order follows first
    appearance: all surviving ``list1`` entries, then the new ``list2`` entries.

    Args:
        list1: Primary ``(document, score)`` results; ``None`` is treated as empty.
        list2: Secondary ``(document, score)`` results; ``None`` is treated as empty.

    Returns:
        A new list of the unique ``(document, score)`` tuples.
    """
    # Dicts preserve insertion order, which gives the "first occurrence wins,
    # original order kept" behaviour without a separate seen-set.
    merged = {}
    for results in (list1, list2):
        # `or ()` lets a search that produced no result object (None) be
        # merged like an empty result instead of raising TypeError.
        for item in results or ():
            document = item[0]
            merged.setdefault(document.page_content, item)
    return list(merged.values())
def search_content(
query: str = Body("", description="用户输入", examples=["国网安徽信通准入手续"]),
knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),

View File

@ -196,6 +196,13 @@ class KBService(ABC):
docs = self.searchbyContent(query,top_k)
return docs
def search_content_internal(self, query: str, top_k: int) -> List[Tuple[Document, float]]:
    """
    Content-search entry point that defers to the backend hook.

    Args:
        query: Text to search for.
        top_k: Number of results to request from the backend.

    Returns:
        Whatever ``self.searchbyContentInternal(query, top_k)`` returns,
        passed through unchanged.
    """
    return self.searchbyContentInternal(query, top_k)
def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
    """
    Return the documents for the given IDs.

    This implementation ignores ``ids`` and always returns an empty list.
    NOTE(review): presumably concrete backends override this -- confirm.
    """
    no_documents: List[Document] = []
    return no_documents
@ -290,6 +297,16 @@ class KBService(ABC):
搜索知识库子类实自己逻辑
"""
pass
@abstractmethod
def searchbyContentInternal(self, query: str, top_k: int) -> List[Tuple[Document, float]]:
    """
    Search the knowledge base by content; each subclass supplies its own logic.

    Args:
        query: Text to search for.
        top_k: Number of results to return.
    """
@abstractmethod
def do_add_doc(self,

View File

@ -197,6 +197,28 @@ class ESKBService(KBService):
))
return docs_and_scores
def searchbyContentInternal(self, query: str, top_k: int = 2) -> list:
    """
    Run a keyword lookup against this knowledge base's Elasticsearch index.

    Args:
        query: Text matched against the ``context`` field of indexed documents.
        top_k: Maximum number of hits requested from Elasticsearch.

    Returns:
        A list of ``(Document, score)`` tuples, one per hit. Every hit gets the
        same fixed score of 1.3; Elasticsearch's own ``_score`` is not used.
        The list is empty when the index does not exist.
    """
    if not self.es_client_python.indices.exists(index=self.index_name):
        # Without this guard a missing index left the method with no defined
        # result; an empty list keeps the return value iterable for callers.
        return []
    print(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
    # NOTE(review): a `match` query analyzes its input and does not interpret
    # "*" as a wildcard, so the surrounding asterisks probably have no effect.
    # Left unchanged here -- confirm before relying on substring matching.
    tem_query = {
        "query": {"match": {
            "context": "*" + query + "*"
        }}
    }
    search_results = self.es_client_python.search(index=self.index_name, body=tem_query, size=top_k)
    return [
        (
            Document(
                page_content=hit["_source"]["context"],
                metadata=hit["_source"]["metadata"],
            ),
            # Constant score assigned to every keyword hit.
            # NOTE(review): the meaning/scale of 1.3 is not visible here -- confirm.
            1.3,
        )
        for hit in search_results["hits"]["hits"]
    ]
def del_doc_by_ids(self, ids: List[str]) -> bool:
for doc_id in ids:
try:

View File

@ -117,6 +117,10 @@ class FaissKBService(KBService):
def searchbyContent(self, query: str = "", top_k: int = 2) -> list:
    """
    Content-search hook; not implemented for the FAISS backend.

    The base-class wrapper calls this hook as ``searchbyContent(query, top_k)``,
    so the parameters must be accepted here or the call raises ``TypeError``.
    Both have defaults so an argument-less call keeps working.

    Args:
        query: Text to search for (ignored).
        top_k: Number of results requested (ignored).

    Returns:
        Always an empty list.
        NOTE(review): assumes callers expect a list here -- confirm.
    """
    return []
def searchbyContentInternal(self, query: str = "", top_k: int = 2) -> list:
    """
    Keyword (content) search hook; not implemented for the FAISS backend.

    The abstract hook on ``KBService`` takes ``query`` and ``top_k``, and the
    base-class wrapper passes both, so they must be accepted here or the call
    raises ``TypeError``. Both have defaults so an argument-less call keeps
    working.

    Args:
        query: Text to search for (ignored).
        top_k: Number of results requested (ignored).

    Returns:
        Always an empty list, so the result can be iterated or merged with
        other search results.
    """
    return []
if __name__ == '__main__':
faissService = FaissKBService("test")
faissService.add_doc(KnowledgeFile("README.md", "test"))