将语义检索和词汇检索相结合提高检索准确性
This commit is contained in:
parent
3b49f2da54
commit
ca9b09549f
|
|
@ -21,8 +21,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from configs import USE_RANKING
|
||||
import jieba
|
||||
|
||||
|
||||
from typing import List, Dict,Tuple
|
||||
|
||||
def search_docs(
|
||||
query: str = Body("", description="用户输入", examples=["你好"]),
|
||||
|
|
@ -42,8 +41,12 @@ def search_docs(
|
|||
if query:
|
||||
print(f"search_docs, query:{query}")
|
||||
docs = kb.search_docs(query, FIRST_VECTOR_SEARCH_TOP_K, score_threshold)
|
||||
print(f"search_docs, docs:{docs}")
|
||||
|
||||
print(f"search_docs,len of docs {len(docs)}, docs:{docs}")
|
||||
|
||||
docs_key = kb.search_content_internal(query,2)
|
||||
print(f"search_content_internal, len of docs {len(docs)}, docs:{docs_key}")
|
||||
docs = merge_and_deduplicate(docs, docs_key)
|
||||
print(f"after merge_and_deduplicate, len of docs:{docs}")
|
||||
if USE_RANKING:
|
||||
queryList = []
|
||||
queryList.append(query)
|
||||
|
|
@ -80,6 +83,30 @@ def search_docs(
|
|||
data = kb.list_docs(file_name=file_name, metadata=metadata)
|
||||
return data
|
||||
|
||||
|
||||
def merge_and_deduplicate(list1: List[Tuple[Document, float]], list2: List[Tuple[Document, float]]) -> List[Tuple[Document, float]]:
    """Merge two scored result lists, dropping entries with duplicate text.

    Entries are keyed by ``document.page_content``. The first pair seen for a
    given text wins, so pairs from ``list1`` take precedence over pairs from
    ``list2`` and first-seen order is preserved.

    Args:
        list1: ``(document, score)`` pairs; ``None`` or empty means no results.
        list2: ``(document, score)`` pairs; ``None`` or empty means no results.

    Returns:
        A new list holding one ``(document, score)`` pair per distinct
        ``page_content``.
    """
    merged_dict = {}
    for results in (list1, list2):
        # A search backend that found nothing may hand back None rather than
        # an empty list; treat that as "no results" instead of failing here.
        for item in results or ():
            document, _score = item
            # setdefault keeps the pair already stored for this text, if any.
            merged_dict.setdefault(document.page_content, item)
    return list(merged_dict.values())
|
||||
|
||||
|
||||
def search_content(
|
||||
query: str = Body("", description="用户输入", examples=["国网安徽信通准入手续"]),
|
||||
knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
|
||||
|
|
|
|||
|
|
@ -196,6 +196,13 @@ class KBService(ABC):
|
|||
docs = self.searchbyContent(query,top_k)
|
||||
return docs
|
||||
|
||||
def search_content_internal(self, query: str, top_k: int) -> List[Tuple[Document, float]]:
    """Forward a content search to ``searchbyContentInternal``.

    Args:
        query: Search text, passed through unchanged.
        top_k: Result limit, passed through unchanged.

    Returns:
        Whatever ``self.searchbyContentInternal(query, top_k)`` returns.
    """
    return self.searchbyContentInternal(query, top_k)
|
||||
|
||||
def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
    """Return an empty list; this implementation ignores ``ids``."""
    no_documents: List[Document] = []
    return no_documents
|
||||
|
||||
|
|
@ -290,6 +297,16 @@ class KBService(ABC):
|
|||
搜索知识库子类实自己逻辑
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
def searchbyContentInternal(self, query: str, top_k: int) -> List[Tuple[Document, float]]:
    """
    Knowledge-base content search; each subclass implements its own logic.
    """
|
||||
|
||||
@abstractmethod
|
||||
def do_add_doc(self,
|
||||
|
|
|
|||
|
|
@ -197,6 +197,28 @@ class ESKBService(KBService):
|
|||
))
|
||||
return docs_and_scores
|
||||
|
||||
def searchbyContentInternal(self, query:str, top_k: int = 2):
    """Run a ``match`` search against the ``context`` field of this index.

    Args:
        query: Text to search for.
        top_k: Maximum number of hits requested (sent as ``size``).

    Returns:
        A list of ``(Document, score)`` tuples, one per hit, built from each
        hit's ``_source["context"]`` and ``_source["metadata"]``. Every tuple
        carries the fixed score ``1.3``; the hit's own score is not read.
        An empty list is returned when the index does not exist.
    """
    if not self.es_client_python.indices.exists(index=self.index_name):
        # Always hand back a list so callers can iterate or merge the result
        # without special-casing a missing index.
        return []

    print(f"******ESKBService searchbyContentInternal {self.index_name},query:{query}")
    # NOTE(review): a `match` query analyzes its text and has no wildcard
    # syntax, so the surrounding "*" are presumably not acting as wildcards.
    # Left unchanged here; confirm against the index analyzer before removing.
    tem_query = {
        "query": {"match": {
            "context": "*" + query + "*"
        }}
    }
    search_results = self.es_client_python.search(index=self.index_name, body=tem_query, size=top_k)
    return [
        (
            Document(
                page_content=hit["_source"]["context"],
                metadata=hit["_source"]["metadata"],
            ),
            1.3,
        )
        for hit in search_results["hits"]["hits"]
    ]
|
||||
|
||||
def del_doc_by_ids(self, ids: List[str]) -> bool:
|
||||
for doc_id in ids:
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -117,6 +117,10 @@ class FaissKBService(KBService):
|
|||
def searchbyContent(self, query: str = "", top_k: int = 0):
    """Content-search placeholder; this service does not implement it.

    The parameters exist so that a call of the form
    ``searchbyContent(query, top_k)`` no longer fails with ``TypeError``.
    Both are ignored, and both have defaults so a zero-argument call still
    works.

    Args:
        query: Search text (unused).
        top_k: Result limit (unused).

    Returns:
        None.
    """
    return None
|
||||
|
||||
def searchbyContentInternal(self, query: str = "", top_k: int = 2):
    """Keyword-search placeholder; this service has no keyword search.

    The parameters exist so that a call of the form
    ``searchbyContentInternal(query, top_k)`` no longer fails with
    ``TypeError``. Both are ignored, and both have defaults so a
    zero-argument call still works.

    Args:
        query: Search text (unused).
        top_k: Result limit (unused).

    Returns:
        An empty list, so callers can iterate or merge the result directly.
    """
    return []
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
faissService = FaissKBService("test")
|
||||
faissService.add_doc(KnowledgeFile("README.md", "test"))
|
||||
|
|
|
|||
Loading…
Reference in New Issue