es向量入库优化

This commit is contained in:
weiweiw 2025-02-25 12:50:40 +08:00
parent 92202e0f98
commit d9acc07c59
1 changed file with 20 additions and 7 deletions

View File

@ -242,7 +242,8 @@ class ESKBService(KBService):
}, },
"track_total_hits": True, "track_total_hits": True,
} }
print(f"***do_delete_doc: kb_file.filepath:{kb_file.filepath}") print(f"***do_delete_doc: kb_file.filepath:{kb_file.filepath}, kb.filename:{kb_file.filename}")
print(f"***do_delete_doc: kb.filename:{kb_file.filename}")
# 注意设置size默认返回10个。 # 注意设置size默认返回10个。
search_results = self.es_client_python.search(index=self.index_name, body=query,size=200) search_results = self.es_client_python.search(index=self.index_name, body=query,size=200)
delete_list = [hit["_id"] for hit in search_results['hits']['hits']] delete_list = [hit["_id"] for hit in search_results['hits']['hits']]
@ -277,22 +278,34 @@ class ESKBService(KBService):
if self.es_client_python.indices.exists(index=self.index_name): if self.es_client_python.indices.exists(index=self.index_name):
file_path = docs[0].metadata.get("source") file_path = docs[0].metadata.get("source")
print(f"****************do_add_doc, file_path:{file_path}")
# enhanced by weiweiwang 2025/2/24 to specify the index name
# query = {
# "query": {
# "term": {"metadata.source.keyword": file_path},
# # "term": {"_index": self.index_name},
# }
# }
query = { query = {
"query": { "query": {
"term": {"metadata.source.keyword": file_path}, "bool": {
"term": {"_index": self.index_name}, "must": [
{ "term": { "metadata.source.keyword": file_path } },
{ "term": { "_index": self.index_name } }
]
}
} }
} }
# 注意设置size默认返回10个。 # 注意设置size默认返回10个。
search_results = self.es_client_python.search(body=query, size=50) search_results = self.es_client_python.search(body=query, size=200)
if len(search_results["hits"]["hits"]) == 0: if len(search_results["hits"]["hits"]) == 0:
raise ValueError("召回元素个数为0") raise ValueError("召回元素个数为0")
info_docs = [ info_docs = [
{"id": hit["_id"], "metadata": hit["_source"]["metadata"]} {"id": hit["_id"], "metadata": hit["_source"]["metadata"]}
for hit in search_results["hits"]["hits"] for hit in search_results["hits"]["hits"]
] ]
#size = len(info_docs) # size = len(info_docs)
#print(f"do_add_doc 召回元素个数:{size}") # print(f"do_add_doc 召回元素个数:{size}")
return info_docs return info_docs
def do_clear_vs(self): def do_clear_vs(self):