将语义检索和词汇检索相结合提高检索准确性
This commit is contained in:
parent
3b49f2da54
commit
ca9b09549f
|
|
@ -21,8 +21,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from configs import USE_RANKING
|
from configs import USE_RANKING
|
||||||
import jieba
|
import jieba
|
||||||
|
from typing import List, Dict,Tuple
|
||||||
|
|
||||||
|
|
||||||
def search_docs(
|
def search_docs(
|
||||||
query: str = Body("", description="用户输入", examples=["你好"]),
|
query: str = Body("", description="用户输入", examples=["你好"]),
|
||||||
|
|
@ -42,8 +41,12 @@ def search_docs(
|
||||||
if query:
|
if query:
|
||||||
print(f"search_docs, query:{query}")
|
print(f"search_docs, query:{query}")
|
||||||
docs = kb.search_docs(query, FIRST_VECTOR_SEARCH_TOP_K, score_threshold)
|
docs = kb.search_docs(query, FIRST_VECTOR_SEARCH_TOP_K, score_threshold)
|
||||||
print(f"search_docs, docs:{docs}")
|
print(f"search_docs,len of docs {len(docs)}, docs:{docs}")
|
||||||
|
|
||||||
|
docs_key = kb.search_content_internal(query,2)
|
||||||
|
print(f"search_content_internal, len of docs {len(docs)}, docs:{docs_key}")
|
||||||
|
docs = merge_and_deduplicate(docs, docs_key)
|
||||||
|
print(f"after merge_and_deduplicate, len of docs:{docs}")
|
||||||
if USE_RANKING:
|
if USE_RANKING:
|
||||||
queryList = []
|
queryList = []
|
||||||
queryList.append(query)
|
queryList.append(query)
|
||||||
|
|
@ -80,6 +83,30 @@ def search_docs(
|
||||||
data = kb.list_docs(file_name=file_name, metadata=metadata)
|
data = kb.list_docs(file_name=file_name, metadata=metadata)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def merge_and_deduplicate(list1: List[Tuple[Document, float]], list2: List[Tuple[Document, float]]) -> List[Tuple[Document, float]]:
    """Merge two (Document, score) result lists, dropping duplicate documents.

    Two entries count as duplicates when their ``page_content`` strings are
    identical. The first occurrence wins — items from ``list1`` take
    precedence over ``list2`` — and insertion order is preserved.

    Args:
        list1: Primary result list (e.g. from vector/semantic search).
        list2: Secondary result list (e.g. from keyword/lexical search).

    Returns:
        Deduplicated list of (Document, score) tuples.
    """
    merged_dict: Dict[str, Tuple[Document, float]] = {}
    # One pass over both lists replaces the original pair of duplicated
    # loops; setdefault keeps the first tuple seen per page_content.
    for document, score in list1 + list2:
        merged_dict.setdefault(document.page_content, (document, score))
    return list(merged_dict.values())
|
||||||
|
|
||||||
|
|
||||||
def search_content(
|
def search_content(
|
||||||
query: str = Body("", description="用户输入", examples=["国网安徽信通准入手续"]),
|
query: str = Body("", description="用户输入", examples=["国网安徽信通准入手续"]),
|
||||||
knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
|
knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
|
||||||
|
|
|
||||||
|
|
@ -196,6 +196,13 @@ class KBService(ABC):
|
||||||
docs = self.searchbyContent(query,top_k)
|
docs = self.searchbyContent(query,top_k)
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
def search_content_internal(self,
|
||||||
|
query: str,
|
||||||
|
top_k: int,
|
||||||
|
)->List[Tuple[Document, float]]:
|
||||||
|
docs = self.searchbyContentInternal(query,top_k)
|
||||||
|
return docs
|
||||||
|
|
||||||
def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
|
def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
@ -290,6 +297,16 @@ class KBService(ABC):
|
||||||
搜索知识库子类实自己逻辑
|
搜索知识库子类实自己逻辑
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
    @abstractmethod
    def searchbyContentInternal(self,
                                query: str,
                                top_k: int,
                                )->List[Tuple[Document, float]]:
        """
        Lexical (keyword) search hook; each knowledge-base subclass
        implements its own backend-specific logic.

        Args:
            query: User query text.
            top_k: Maximum number of hits to return.

        Returns:
            List of (Document, score) tuples.
        """
        pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def do_add_doc(self,
|
def do_add_doc(self,
|
||||||
|
|
|
||||||
|
|
@ -197,6 +197,28 @@ class ESKBService(KBService):
|
||||||
))
|
))
|
||||||
return docs_and_scores
|
return docs_and_scores
|
||||||
|
|
||||||
|
def searchbyContentInternal(self, query:str, top_k: int = 2):
|
||||||
|
if self.es_client_python.indices.exists(index=self.index_name):
|
||||||
|
print(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
|
||||||
|
tem_query = {
|
||||||
|
"query": {"match": {
|
||||||
|
"context": "*" + query + "*"
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
search_results = self.es_client_python.search(index=self.index_name, body=tem_query, size=top_k)
|
||||||
|
hits = [hit for hit in search_results["hits"]["hits"]]
|
||||||
|
docs_and_scores = [
|
||||||
|
(
|
||||||
|
Document(
|
||||||
|
page_content=hit["_source"]["context"],
|
||||||
|
metadata=hit["_source"]["metadata"],
|
||||||
|
),
|
||||||
|
1.3,
|
||||||
|
)
|
||||||
|
for hit in hits
|
||||||
|
]
|
||||||
|
return docs_and_scores
|
||||||
|
|
||||||
def del_doc_by_ids(self, ids: List[str]) -> bool:
|
def del_doc_by_ids(self, ids: List[str]) -> bool:
|
||||||
for doc_id in ids:
|
for doc_id in ids:
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -117,6 +117,10 @@ class FaissKBService(KBService):
|
||||||
def searchbyContent(self):
|
def searchbyContent(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def searchbyContentInternal(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Manual smoke test: build a FAISS-backed service for the "test"
    # knowledge base and index README.md into it.
    faissService = FaissKBService("test")
    faissService.add_doc(KnowledgeFile("README.md", "test"))
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue