Langchain-Chatchat/server/knowledge_base/kb_service/zilliz_kb_service.py

101 lines
3.4 KiB
Python
Raw Normal View History

from typing import List, Dict, Optional
from langchain.embeddings.base import Embeddings
from langchain.schema import Document
from langchain.vectorstores import Zilliz
from configs import kbs_config
from server.knowledge_base.kb_service.base import KBService, SupportedVSType, EmbeddingsFunAdapter, \
score_threshold_process
from server.knowledge_base.utils import KnowledgeFile
class ZillizKBService(KBService):
    """Knowledge-base service that stores its vectors in a Zilliz collection.

    The collection is named after ``self.kb_name`` and is accessed through the
    LangChain ``Zilliz`` vector store held in ``self.zilliz``.
    """

    # LangChain vector-store wrapper; (re)assigned by ``_load_zilliz``.
    zilliz: Zilliz

    @staticmethod
    def get_collection(zilliz_name):
        """Return the ``pymilvus`` ``Collection`` named *zilliz_name*."""
        # Function-scope import: pymilvus is only imported when this helper runs.
        from pymilvus import Collection

        return Collection(zilliz_name)

    # def save_vector_store(self):
    #     if self.zilliz.col:
    #         self.zilliz.col.flush()

    def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
        """Fetch stored documents by primary key.

        Args:
            ids: Primary-key values to look up.

        Returns:
            One ``Document`` per row returned by the query, with the row's
            ``text`` field as ``page_content`` and the remaining fields as
            metadata. Empty when *ids* is empty or there is no collection.
        """
        # An empty id list would yield the expression ``pk in []``; there is
        # nothing to look up, so skip the round trip entirely.
        if not ids or not self.zilliz.col:
            return []

        # NOTE(review): ids are interpolated exactly as given. If the
        # collection's ``pk`` field is an integer type, string ids may need
        # converting before querying -- confirm against the collection schema.
        rows = self.zilliz.col.query(expr=f'pk in {ids}', output_fields=["*"])

        documents = []
        for row in rows:
            # ``text`` becomes the page content; everything left is metadata.
            text = row.pop("text")
            documents.append(Document(page_content=text, metadata=row))
        return documents

    @staticmethod
    def search(zilliz_name, content, limit=3):
        """Run a raw vector search on a collection using the ``"IP"`` metric.

        Args:
            zilliz_name: Name of the collection to search.
            content: Query data passed unchanged to ``Collection.search``.
            limit: Maximum number of hits to request.

        Returns:
            Whatever ``Collection.search`` returns.
        """
        search_params = {
            "metric_type": "IP",
            "params": {},
        }
        collection = ZillizKBService.get_collection(zilliz_name)
        return collection.search(content, "embeddings", search_params, limit=limit, output_fields=["content"])

    def do_create_kb(self):
        """No-op: this service performs no work when a knowledge base is created."""
        pass

    def vs_type(self) -> str:
        """Return the vector-store type identifier for this service."""
        return SupportedVSType.ZILLIZ

    def _load_zilliz(self):
        """(Re)build ``self.zilliz`` from the ``"zilliz"`` entry of ``kbs_config``."""
        zilliz_args = kbs_config.get("zilliz")
        self.zilliz = Zilliz(
            embedding_function=EmbeddingsFunAdapter(self.embed_model),
            collection_name=self.kb_name,
            connection_args=zilliz_args,
        )

    def do_init(self):
        """Initialise the service by building the vector-store wrapper."""
        self._load_zilliz()

    def do_drop_kb(self):
        """Release and drop the backing collection, if one exists."""
        if self.zilliz.col:
            self.zilliz.col.release()
            self.zilliz.col.drop()

    def do_search(self, query: str, top_k: int, score_threshold: float):
        """Embed *query* and return the scored matches that pass the threshold.

        Args:
            query: Text to search for.
            top_k: Number of nearest neighbours to request.
            score_threshold: Threshold handed to ``score_threshold_process``.

        Returns:
            The result of ``score_threshold_process`` applied to the
            ``(document, score)`` pairs returned by the vector store.
        """
        # The store wrapper is rebuilt before every search.
        self._load_zilliz()
        embed_func = EmbeddingsFunAdapter(self.embed_model)
        embeddings = embed_func.embed_query(query)
        docs = self.zilliz.similarity_search_with_score_by_vector(embeddings, top_k)
        return score_threshold_process(score_threshold, top_k, docs)

    def do_add_doc(self, docs: List[Document], **kwargs) -> List[Dict]:
        """Normalise document metadata and insert the documents.

        The metadata of each document in *docs* is modified in place.

        Args:
            docs: Documents to insert.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            One ``{"id": ..., "metadata": ...}`` dict per inserted document,
            pairing the ids returned by the store with the documents in order.
        """
        for doc in docs:
            # Every metadata value is stored as a string. Only existing keys
            # are reassigned, so mutating during iteration is safe.
            for key, value in doc.metadata.items():
                doc.metadata[key] = str(value)
            # Give every field known to the store a value ...
            for field in self.zilliz.fields:
                doc.metadata.setdefault(field, "")
            # ... except the text and vector fields, which are removed from
            # the metadata.
            doc.metadata.pop(self.zilliz._text_field, None)
            doc.metadata.pop(self.zilliz._vector_field, None)

        ids = self.zilliz.add_documents(docs)
        return [{"id": doc_id, "metadata": doc.metadata} for doc_id, doc in zip(ids, docs)]

    def do_delete_doc(self, kb_file: KnowledgeFile, **kwargs):
        """Delete every stored row whose ``source`` equals the file's path.

        Args:
            kb_file: File whose rows should be removed.
            **kwargs: Accepted for interface compatibility; unused here.
        """
        if not self.zilliz.col:
            return

        # Backslashes are doubled so Windows-style paths survive inside the
        # quoted string literal of the filter expression.
        filepath = kb_file.filepath.replace('\\', '\\\\')
        rows = self.zilliz.col.query(expr=f'source == "{filepath}"', output_fields=["pk"])
        delete_list = [row.get("pk") for row in rows]

        # Nothing matched: avoid sending a delete with the expression ``pk in []``.
        if not delete_list:
            return
        self.zilliz.col.delete(expr=f'pk in {delete_list}')

    def do_clear_vs(self):
        """Empty the vector store by dropping the collection and re-initialising."""
        if self.zilliz.col:
            self.do_drop_kb()
            self.do_init()
if __name__ == '__main__':
    # Manual run: create the tables registered on ``Base.metadata``, then
    # build a service instance for the knowledge base named "test".
    from server.db.base import Base, engine

    Base.metadata.create_all(bind=engine)

    zillizService = ZillizKBService("test")