Langchain-Chatchat/libs/chatchat-server/chatchat/server/knowledge_base/kb_service/base.py

535 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import operator
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from langchain.docstore.document import Document
from chatchat.settings import Settings
from chatchat.utils import build_logger
from chatchat.server.db.models.knowledge_base_model import KnowledgeBaseSchema
from chatchat.server.db.repository.knowledge_base_repository import (
add_kb_to_db,
delete_kb_from_db,
get_kb_detail,
kb_exists,
list_kbs_from_db,
load_kb_from_db,
)
from chatchat.server.db.repository.knowledge_file_repository import (
add_file_to_db,
count_files_from_db,
delete_file_from_db,
delete_files_from_db,
file_exists_in_db,
get_file_detail,
list_docs_from_db,
list_files_from_db,
)
from chatchat.server.knowledge_base.model.kb_document_model import DocumentWithVSId
from chatchat.server.knowledge_base.utils import (
KnowledgeFile,
get_doc_path,
get_kb_path,
list_files_from_folder,
list_kbs_from_folder,
)
from chatchat.server.utils import (
check_embed_model as _check_embed_model,
get_default_embedding,
)
# Module-level logger, created once at import time via chatchat.utils.build_logger.
logger = build_logger()
class SupportedVSType:
    """String identifiers for the vector-store backends known to this module.

    The attributes are plain ``str`` constants; ``KBServiceFactory.get_service``
    resolves a user-supplied name to one of them via ``getattr`` on the
    upper-cased name.
    """

    # Listed alphabetically; each value is the lower-case form of its name.
    CHROMADB = "chromadb"
    DEFAULT = "default"
    ES = "es"
    FAISS = "faiss"
    MILVUS = "milvus"
    PG = "pg"
    RELYT = "relyt"
    ZILLIZ = "zilliz"
class KBService(ABC):
    """Abstract base class for a knowledge base backed by a vector store.

    The public methods (``create_kb``, ``add_doc``, ``delete_doc`` ...) update
    the database records through the repository helpers and delegate the
    store-specific work to the abstract ``do_*`` hooks implemented by
    subclasses.
    """

    def __init__(
        self,
        knowledge_base_name: str,
        kb_info: Optional[str] = None,
        # NOTE: this default is evaluated once, when the class body is executed,
        # not on every instantiation.
        embed_model: str = get_default_embedding(),
    ):
        """Store the KB identity, resolve its paths and run ``do_init``.

        Args:
            knowledge_base_name: Name of the knowledge base.
            kb_info: Description of the knowledge base. When falsy, the value
                configured in ``Settings.kb_settings.KB_INFO`` is used, falling
                back to a generated description.
            embed_model: Name of the embedding model used by this KB.
        """
        self.kb_name = knowledge_base_name
        self.kb_info = kb_info or Settings.kb_settings.KB_INFO.get(
            knowledge_base_name, f"关于{knowledge_base_name}的知识库"
        )
        self.embed_model = embed_model
        self.kb_path = get_kb_path(self.kb_name)
        self.doc_path = get_doc_path(self.kb_name)
        self.do_init()

    def __repr__(self) -> str:
        return f"{self.kb_name} @ {self.embed_model}"

    def save_vector_store(self):
        """Persist the vector store.

        FAISS saves to disk, Milvus saves to its database; PGVector is not
        supported yet. The base implementation does nothing.
        """
        pass

    def check_embed_model(self) -> Tuple[bool, str]:
        """Check this KB's embedding model; returns the ``(ok, message)`` pair
        produced by ``chatchat.server.utils.check_embed_model``."""
        return _check_embed_model(self.embed_model)

    def create_kb(self):
        """Create the knowledge base.

        Ensures the document folder exists, registers the KB in the database
        and, if that succeeded, lets the subclass create its store.

        Returns:
            The status returned by ``add_kb_to_db``.
        """
        if not os.path.exists(self.doc_path):
            os.makedirs(self.doc_path)

        status = add_kb_to_db(
            self.kb_name, self.kb_info, self.vs_type(), self.embed_model
        )
        if status:
            self.do_create_kb()
        return status

    def clear_vs(self):
        """Delete everything in the vector store and the file records in the DB.

        Returns:
            The status returned by ``delete_files_from_db``.
        """
        self.do_clear_vs()
        return delete_files_from_db(self.kb_name)

    def drop_kb(self):
        """Delete the knowledge base (store first, then the DB record).

        Returns:
            The status returned by ``delete_kb_from_db``.
        """
        self.do_drop_kb()
        return delete_kb_from_db(self.kb_name)

    def add_doc(
        self, kb_file: KnowledgeFile, docs: Optional[List[Document]] = None, **kwargs
    ):
        """Add a file to the knowledge base.

        Args:
            kb_file: The file to add.
            docs: Pre-built documents. When given (non-empty), the file is not
                parsed with ``kb_file.file2text()`` and the database entry is
                flagged ``custom_docs=True``.
            **kwargs: Forwarded to ``do_add_doc``.

        Returns:
            ``False`` when the embedding-model check fails or there are no
            documents to add; otherwise the status of ``add_file_to_db``.
        """
        if not self.check_embed_model()[0]:
            return False

        custom_docs = bool(docs)
        if not custom_docs:
            docs = kb_file.file2text()
        if not docs:
            return False

        # Rewrite metadata["source"] as a path relative to the doc folder.
        for doc in docs:
            # Bound up-front so the error message below can never hit an
            # unbound local if the metadata access itself fails.
            source = ""
            try:
                doc.metadata.setdefault("source", kb_file.filename)
                source = doc.metadata.get("source", "")
                if os.path.isabs(source):
                    rel_path = Path(source).relative_to(self.doc_path)
                    doc.metadata["source"] = str(rel_path.as_posix().strip("/"))
            except Exception as e:
                # Best effort: the document is kept with its source unchanged.
                print(
                    f"cannot convert absolute path ({source}) to relative path. error is : {e}"
                )

        # Remove any previous version of the file before inserting the new one.
        self.delete_doc(kb_file)
        doc_infos = self.do_add_doc(docs, **kwargs)
        return add_file_to_db(
            kb_file,
            custom_docs=custom_docs,
            docs_count=len(docs),
            doc_infos=doc_infos,
        )

    def delete_doc(
        self, kb_file: KnowledgeFile, delete_content: bool = False, **kwargs
    ):
        """Delete a file from the knowledge base.

        Args:
            kb_file: The file to remove.
            delete_content: Also remove the file at ``kb_file.filepath`` from
                disk when it exists.
            **kwargs: Forwarded to ``do_delete_doc``.

        Returns:
            The status returned by ``delete_file_from_db``.
        """
        self.do_delete_doc(kb_file, **kwargs)
        status = delete_file_from_db(kb_file)
        if delete_content and os.path.exists(kb_file.filepath):
            os.remove(kb_file.filepath)
        return status

    def update_info(self, kb_info: str):
        """Update the knowledge base description and store it in the DB.

        Returns:
            The status returned by ``add_kb_to_db``.
        """
        self.kb_info = kb_info
        return add_kb_to_db(
            self.kb_name, self.kb_info, self.vs_type(), self.embed_model
        )

    def update_doc(
        self, kb_file: KnowledgeFile, docs: Optional[List[Document]] = None, **kwargs
    ):
        """Update the vector store from the file in the content folder.

        If ``docs`` is given, those custom documents are used and the database
        entry is flagged ``custom_docs=True`` (see ``add_doc``).

        Returns:
            ``False`` when the embedding-model check fails, otherwise the
            result of ``add_doc``.
        """
        if not self.check_embed_model()[0]:
            return False

        if os.path.exists(kb_file.filepath):
            self.delete_doc(kb_file, **kwargs)
        return self.add_doc(kb_file, docs=docs, **kwargs)

    def exist_doc(self, file_name: str):
        """Return whether ``file_name`` is recorded in the DB for this KB."""
        return file_exists_in_db(
            KnowledgeFile(knowledge_base_name=self.kb_name, filename=file_name)
        )

    def list_files(self):
        """Return the files recorded in the DB for this KB."""
        return list_files_from_db(self.kb_name)

    def count_files(self):
        """Return the number of files recorded in the DB for this KB."""
        return count_files_from_db(self.kb_name)

    def search_docs(
        self,
        query: str,
        top_k: int = Settings.kb_settings.VECTOR_SEARCH_TOP_K,
        score_threshold: float = Settings.kb_settings.SCORE_THRESHOLD,
    ) -> List[Document]:
        """Search the knowledge base.

        Returns an empty list when the embedding-model check fails, otherwise
        whatever ``do_search`` returns (declared there as
        ``List[Tuple[Document, float]]``).
        """
        if not self.check_embed_model()[0]:
            return []
        return self.do_search(query, top_k, score_threshold)

    def search_content_internal(
        self,
        query: str,
        top_k: int,
    ) -> List[Document]:
        """Thin wrapper around the subclass hook ``searchbyContentInternal``."""
        return self.searchbyContentInternal(query, top_k)

    def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
        """Return the documents with the given ids; the base class returns ``[]``."""
        return []

    def del_doc_by_ids(self, ids: List[str]) -> bool:
        """Delete documents by id; must be provided by subclasses."""
        raise NotImplementedError

    def update_doc_by_ids(self, docs: Dict[str, Document]) -> bool:
        """Replace documents by id.

        Args:
            docs: Mapping ``{doc_id: Document, ...}``. If the value for a
                ``doc_id`` is ``None`` or its ``page_content`` is blank, the
                document is only deleted.

        Returns:
            ``False`` when the embedding-model check fails, else ``True``.
        """
        if not self.check_embed_model()[0]:
            return False

        self.del_doc_by_ids(list(docs))
        pending_docs = []
        ids = []
        for _id, doc in docs.items():
            if not doc or not doc.page_content.strip():
                continue
            ids.append(_id)
            pending_docs.append(doc)
        self.do_add_doc(docs=pending_docs, ids=ids)
        return True

    def list_docs(
        self, file_name: Optional[str] = None, metadata: Optional[Dict] = None
    ) -> List[DocumentWithVSId]:
        """Look up Documents by ``file_name`` and/or ``metadata``.

        Ids known to the database for which the vector store returns nothing
        (an empty result or ``None``) are skipped.
        """
        doc_infos = list_docs_from_db(
            kb_name=self.kb_name, file_name=file_name, metadata=metadata or {}
        )
        docs = []
        for x in doc_infos:
            # get_doc_by_ids may return an empty list (the base implementation
            # does), so never index into it blindly.
            found = self.get_doc_by_ids([x["id"]])
            doc_info = found[0] if found else None
            if doc_info is None:
                continue
            docs.append(DocumentWithVSId(**{**doc_info.dict(), "id": x["id"]}))
        return docs

    def get_relative_source_path(self, filepath: str):
        """Convert a file path to one relative to the doc folder.

        Keeps the value consistent with the ``source`` written by ``add_doc``:
        convertible paths are returned in POSIX form without leading/trailing
        slashes; an absolute path that is not under ``self.doc_path`` is
        reported and returned unchanged.
        """
        if os.path.isabs(filepath):
            try:
                relative = Path(filepath).relative_to(self.doc_path)
            except Exception as e:
                print(
                    f"cannot convert absolute path ({filepath}) to relative path. error is : {e}"
                )
                return str(filepath)
        else:
            # Wrap in Path so that plain strings also support as_posix().
            relative = Path(filepath)
        return str(relative.as_posix().strip("/"))

    @abstractmethod
    def do_create_kb(self):
        """Create the knowledge base; subclasses implement their own logic."""
        pass

    @staticmethod
    def list_kbs_type():
        """Return the keys of ``Settings.kb_settings.kbs_config``."""
        return list(Settings.kb_settings.kbs_config.keys())

    @classmethod
    def list_kbs(cls):
        """Return the knowledge bases recorded in the database."""
        return list_kbs_from_db()

    def exists(self, kb_name: Optional[str] = None):
        """Return whether ``kb_name`` (default: this KB) exists in the DB."""
        kb_name = kb_name or self.kb_name
        return kb_exists(kb_name)

    @abstractmethod
    def vs_type(self) -> str:
        """Return the vector-store type identifier of this service."""
        pass

    @abstractmethod
    def do_init(self):
        """Subclass initialisation hook, called at the end of ``__init__``."""
        pass

    @abstractmethod
    def do_drop_kb(self):
        """Drop the knowledge base; subclasses implement their own logic."""
        pass

    @abstractmethod
    def do_search(
        self,
        query: str,
        top_k: int,
        score_threshold: float,
    ) -> List[Tuple[Document, float]]:
        """Search the knowledge base; subclasses implement their own logic."""
        pass

    @abstractmethod
    def searchbyContentInternal(
        self,
        query: str,
        top_k: int,
    ) -> List[Tuple[Document, float]]:
        """Search the knowledge base; subclasses implement their own logic."""
        pass

    @abstractmethod
    def do_add_doc(
        self,
        docs: List[Document],
        **kwargs,
    ) -> List[Dict]:
        """Add documents to the knowledge base; subclasses implement their own logic."""
        pass

    @abstractmethod
    def do_delete_doc(self, kb_file: KnowledgeFile, **kwargs):
        """Delete a file's documents from the knowledge base; subclasses
        implement their own logic.

        ``**kwargs`` is accepted because ``delete_doc`` forwards its extra
        keyword arguments to this hook.
        """
        pass

    @abstractmethod
    def do_clear_vs(self):
        """Delete all vectors from the knowledge base; subclasses implement
        their own logic."""
        pass
class KBServiceFactory:
    """Factory that builds the ``KBService`` subclass for a vector-store type.

    Backend modules are imported lazily inside each branch, so only the
    selected backend is imported.
    """

    @staticmethod
    def get_service(
        kb_name: str,
        vector_store_type: Union[str, SupportedVSType],
        # NOTE: evaluated once, when the class body is executed.
        embed_model: str = get_default_embedding(),
        kb_info: str = None,
    ) -> Optional[KBService]:
        """Instantiate the KB service matching ``vector_store_type``.

        Args:
            kb_name: Name of the knowledge base.
            vector_store_type: A ``SupportedVSType`` value, or its name in any
                letter case (resolved with ``getattr`` on the upper-cased name).
            embed_model: Embedding model name passed to the service.
            kb_info: Optional description passed to the service.

        Returns:
            The service instance, or ``None`` when no branch matches.

        Raises:
            AttributeError: If a string is given that does not name an
                attribute of ``SupportedVSType``.
        """
        if isinstance(vector_store_type, str):
            vector_store_type = getattr(SupportedVSType, vector_store_type.upper())

        params = {
            "knowledge_base_name": kb_name,
            "embed_model": embed_model,
            "kb_info": kb_info,
        }

        if vector_store_type == SupportedVSType.FAISS:
            from chatchat.server.knowledge_base.kb_service.faiss_kb_service import (
                FaissKBService,
            )

            return FaissKBService(**params)

        if vector_store_type == SupportedVSType.PG:
            from chatchat.server.knowledge_base.kb_service.pg_kb_service import (
                PGKBService,
            )

            return PGKBService(**params)

        if vector_store_type == SupportedVSType.RELYT:
            from chatchat.server.knowledge_base.kb_service.relyt_kb_service import (
                RelytKBService,
            )

            return RelytKBService(**params)

        if vector_store_type == SupportedVSType.MILVUS:
            from chatchat.server.knowledge_base.kb_service.milvus_kb_service import (
                MilvusKBService,
            )

            return MilvusKBService(**params)

        if vector_store_type == SupportedVSType.ZILLIZ:
            from chatchat.server.knowledge_base.kb_service.zilliz_kb_service import (
                ZillizKBService,
            )

            return ZillizKBService(**params)

        if vector_store_type == SupportedVSType.DEFAULT:
            # NOTE(review): the original chain had a second, unreachable
            # DEFAULT branch returning DefaultKBService(kb_name) from
            # kb_service.default_kb_service. It could never run because this
            # branch matches first, so it was removed; the effective behaviour
            # (Milvus for "default") is unchanged. Confirm which one is intended.
            from chatchat.server.knowledge_base.kb_service.milvus_kb_service import (
                MilvusKBService,
            )

            # other milvus parameters are set in
            # model_config.Settings.kb_settings.kbs_config
            return MilvusKBService(**params)

        if vector_store_type == SupportedVSType.ES:
            from chatchat.server.knowledge_base.kb_service.es_kb_service import (
                ESKBService,
            )

            return ESKBService(**params)

        if vector_store_type == SupportedVSType.CHROMADB:
            from chatchat.server.knowledge_base.kb_service.chromadb_kb_service import (
                ChromaKBService,
            )

            return ChromaKBService(**params)

        # No backend matched (previously an implicit fall-through).
        return None

    @staticmethod
    def get_service_by_name(kb_name: str) -> Optional[KBService]:
        """Build the service for a KB from its database record.

        Returns ``None`` when the first field returned by ``load_kb_from_db``
        is ``None``, i.e. the KB is not in the database.
        """
        found, vs_type, embed_model = load_kb_from_db(kb_name)
        if found is None:  # kb not in db, just return None
            return None
        return KBServiceFactory.get_service(kb_name, vs_type, embed_model)

    @staticmethod
    def get_default():
        """Return the service for the KB named "default" with the DEFAULT type."""
        return KBServiceFactory.get_service("default", SupportedVSType.DEFAULT)
def get_kb_details() -> List[Dict]:
    """Merge the knowledge bases found on disk with those in the database.

    Every KB folder gets a placeholder record (``in_folder=True``,
    ``in_db=False``). Each database record then either updates the matching
    placeholder or is added with ``in_folder=False``. The resulting rows are
    numbered from 1 in the ``"No"`` field.
    """
    merged = {
        folder_kb: {
            "kb_name": folder_kb,
            "vs_type": "",
            "kb_info": "",
            "embed_model": "",
            "file_count": 0,
            "create_time": None,
            "in_folder": True,
            "in_db": False,
        }
        for folder_kb in list_kbs_from_folder()
    }

    db_records: List[KnowledgeBaseSchema] = KBService.list_kbs()
    for record in db_records:
        detail = record.model_dump()
        name = detail["kb_name"]
        detail["in_db"] = True
        if name not in merged:
            # Known to the database only.
            detail["in_folder"] = False
            merged[name] = detail
        else:
            merged[name].update(detail)

    rows = []
    for number, row in enumerate(merged.values(), start=1):
        row["No"] = number
        rows.append(row)
    return rows
def get_kb_file_details(kb_name: str) -> List[Dict]:
    """Merge the files of a knowledge base found on disk with those in the DB.

    Returns an empty list when no service exists for ``kb_name``. Files in the
    folder get a placeholder record; database details are merged into the
    placeholder whose name matches case-insensitively, or added with
    ``in_folder=False``. Rows are numbered from 1 in the ``"No"`` field.
    """
    service = KBServiceFactory.get_service_by_name(kb_name)
    if service is None:
        return []

    folder_files = list_files_from_folder(kb_name)
    db_files = service.list_files()

    merged = {}
    for folder_file in folder_files:
        merged[folder_file] = {
            "kb_name": kb_name,
            "file_name": folder_file,
            "file_ext": os.path.splitext(folder_file)[-1],
            "file_version": 0,
            "document_loader": "",
            "docs_count": 0,
            "text_splitter": "",
            "create_time": None,
            "in_folder": True,
            "in_db": False,
        }

    # Lower-cased folder name -> actual folder name, for case-insensitive matching.
    by_lower = {name.lower(): name for name in merged}

    for db_file in db_files:
        detail = get_file_detail(kb_name, db_file)
        if not detail:
            continue
        detail["in_db"] = True
        folder_name = by_lower.get(db_file.lower())
        if folder_name is None:
            # Known to the database only.
            detail["in_folder"] = False
            merged[db_file] = detail
        else:
            merged[folder_name].update(detail)

    rows = []
    for number, row in enumerate(merged.values(), start=1):
        row["No"] = number
        rows.append(row)
    return rows
def score_threshold_process(score_threshold, k, docs):
    """Filter ``(doc, similarity)`` pairs by threshold and keep the first ``k``.

    When ``score_threshold`` is ``None`` no filtering happens and ``docs[:k]``
    is returned. Otherwise only pairs whose similarity is less than or equal
    to the threshold are kept, rebuilt as tuples in their original order.
    """
    if score_threshold is None:
        return docs[:k]

    kept = []
    for doc, similarity in docs:
        if similarity <= score_threshold:
            kept.append((doc, similarity))
    return kept[:k]