New feature: the knowledge base management page supports viewing, editing, and deleting documents in the vector store (#2471)
* New features:
  - The knowledge base management page supports viewing, editing, and deleting documents in the vector store. Adding new documents is not supported yet (inserting new rows into aggrid is cumbersome and needs a separate implementation).
  - Removed the "rebuild knowledge base" and "delete knowledge base" buttons from the management page; users are advised to perform these operations from the terminal instead.

  Fixes:
  - All database operations involving knowledge base names or file names are now case-insensitive, and all paths are normalized to POSIX style, so inconsistent path text no longer causes duplicated records or failed operations (close #2232).

  Developer-facing changes:
  - Added the update_docs_by_id function and its API endpoint. Only FAISS is supported for now and it is not used yet; it prepares for finer-grained knowledge base editing in the future.
  - Unified DocumentWithScore and DocumentWithVSId.
  - Document.metadata returned by FAISS now includes the document ID, making later lookups and comparisons easier.
  - The /knowledge_base/search_docs endpoint accepts file_name and metadata parameters, so documents can be retrieved by them.

* fix bug
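The case-insensitivity fix above replaces SQLAlchemy's exact filter_by(col=value) lookups with filter(Model.col.ilike(value)) throughout the repository layer. A minimal, self-contained sketch of the difference, using a simplified stand-in model and an in-memory SQLite database (not code from this commit):

    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()

    class KnowledgeFileModel(Base):  # simplified stand-in for the real model
        __tablename__ = "knowledge_file"
        id = Column(Integer, primary_key=True)
        kb_name = Column(String)
        file_name = Column(String)

    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(KnowledgeFileModel(kb_name="Samples", file_name="README.md"))
        session.commit()

        # old behaviour: exact, case-sensitive equality -> misses "Samples"
        exact = session.query(KnowledgeFileModel).filter_by(kb_name="samples").first()
        # new behaviour: case-insensitive match -> finds "Samples"
        fuzzy = session.query(KnowledgeFileModel).filter(
            KnowledgeFileModel.kb_name.ilike("samples")).first()
        print(exact, fuzzy.file_name)  # None README.md

Without wildcard characters, ilike behaves as case-insensitive equality; with % patterns it also provides the SQL wildcards advertised by the new file_name search parameter.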
parent 2e1442a5c1
commit 9ff7bef2c2
@@ -149,7 +149,8 @@ def mount_knowledge_routes(app: FastAPI):
     from server.knowledge_base.kb_api import list_kbs, create_kb, delete_kb
     from server.knowledge_base.kb_doc_api import (list_files, upload_docs, delete_docs,
                                                   update_docs, download_doc, recreate_vector_store,
-                                                  search_docs, DocumentWithScore, update_info)
+                                                  search_docs, DocumentWithVSId, update_info,
+                                                  update_docs_by_id,)

     app.post("/chat/knowledge_base_chat",
              tags=["Chat"],
@@ -190,10 +191,17 @@ def mount_knowledge_routes(app: FastAPI):

     app.post("/knowledge_base/search_docs",
              tags=["Knowledge Base Management"],
-             response_model=List[DocumentWithScore],
+             response_model=List[DocumentWithVSId],
              summary="搜索知识库"
              )(search_docs)

+    app.post("/knowledge_base/update_docs_by_id",
+             tags=["Knowledge Base Management"],
+             response_model=BaseResponse,
+             summary="直接更新知识库文档"
+             )(update_docs_by_id)
+
+
     app.post("/knowledge_base/upload_docs",
              tags=["Knowledge Base Management"],
              response_model=BaseResponse,
@@ -29,7 +29,7 @@ async def chat(query: str = Body(..., description="用户输入", examples=["恼
                ),
                stream: bool = Body(False, description="流式输出"),
                model_name: str = Body(LLM_MODELS[0], description="LLM 模型名称。"),
-               temperature: float = Body(TEMPERATURE, description="LLM 采样温度", ge=0.0, le=1.0),
+               temperature: float = Body(TEMPERATURE, description="LLM 采样温度", ge=0.0, le=2.0),
                max_tokens: Optional[int] = Body(None, description="限制LLM生成Token数量,默认None代表模型最大值"),
                # top_p: float = Body(TOP_P, description="LLM 核采样。勿与temperature同时设置", gt=0.0, lt=1.0),
                prompt_name: str = Body("default", description="使用的prompt模板名称(在configs/prompt_config.py中配置)"),
@@ -5,7 +5,7 @@ from server.db.session import with_session
 @with_session
 def add_kb_to_db(session, kb_name, kb_info, vs_type, embed_model):
     # 创建知识库实例
-    kb = session.query(KnowledgeBaseModel).filter_by(kb_name=kb_name).first()
+    kb = session.query(KnowledgeBaseModel).filter(KnowledgeBaseModel.kb_name.ilike(kb_name)).first()
     if not kb:
         kb = KnowledgeBaseModel(kb_name=kb_name, kb_info=kb_info, vs_type=vs_type, embed_model=embed_model)
         session.add(kb)
@@ -25,14 +25,14 @@ def list_kbs_from_db(session, min_file_count: int = -1):

 @with_session
 def kb_exists(session, kb_name):
-    kb = session.query(KnowledgeBaseModel).filter_by(kb_name=kb_name).first()
+    kb = session.query(KnowledgeBaseModel).filter(KnowledgeBaseModel.kb_name.ilike(kb_name)).first()
     status = True if kb else False
     return status


 @with_session
 def load_kb_from_db(session, kb_name):
-    kb = session.query(KnowledgeBaseModel).filter_by(kb_name=kb_name).first()
+    kb = session.query(KnowledgeBaseModel).filter(KnowledgeBaseModel.kb_name.ilike(kb_name)).first()
     if kb:
         kb_name, vs_type, embed_model = kb.kb_name, kb.vs_type, kb.embed_model
     else:
@@ -42,7 +42,7 @@ def load_kb_from_db(session, kb_name):

 @with_session
 def delete_kb_from_db(session, kb_name):
-    kb = session.query(KnowledgeBaseModel).filter_by(kb_name=kb_name).first()
+    kb = session.query(KnowledgeBaseModel).filter(KnowledgeBaseModel.kb_name.ilike(kb_name)).first()
     if kb:
         session.delete(kb)
     return True
@@ -50,7 +50,7 @@ def delete_kb_from_db(session, kb_name):

 @with_session
 def get_kb_detail(session, kb_name: str) -> dict:
-    kb: KnowledgeBaseModel = session.query(KnowledgeBaseModel).filter_by(kb_name=kb_name).first()
+    kb: KnowledgeBaseModel = session.query(KnowledgeBaseModel).filter(KnowledgeBaseModel.kb_name.ilike(kb_name)).first()
     if kb:
         return {
             "kb_name": kb.kb_name,
@@ -15,7 +15,7 @@ def list_docs_from_db(session,
     列出某知识库某文件对应的所有Document。
     返回形式:[{"id": str, "metadata": dict}, ...]
     '''
-    docs = session.query(FileDocModel).filter_by(kb_name=kb_name)
+    docs = session.query(FileDocModel).filter(FileDocModel.kb_name.ilike(kb_name))
     if file_name:
         docs = docs.filter(FileDocModel.file_name.ilike(file_name))
     for k, v in metadata.items():
@@ -34,10 +34,10 @@ def delete_docs_from_db(session,
     返回形式:[{"id": str, "metadata": dict}, ...]
     '''
     docs = list_docs_from_db(kb_name=kb_name, file_name=file_name)
-    query = session.query(FileDocModel).filter_by(kb_name=kb_name)
+    query = session.query(FileDocModel).filter(FileDocModel.kb_name.ilike(kb_name))
     if file_name:
-        query = query.filter_by(file_name=file_name)
-    query.delete()
+        query = query.filter(FileDocModel.file_name.ilike(file_name))
+    query.delete(synchronize_session=False)
     session.commit()
     return docs

@@ -68,12 +68,12 @@ def add_docs_to_db(session,

 @with_session
 def count_files_from_db(session, kb_name: str) -> int:
-    return session.query(KnowledgeFileModel).filter_by(kb_name=kb_name).count()
+    return session.query(KnowledgeFileModel).filter(KnowledgeFileModel.kb_name.ilike(kb_name)).count()


 @with_session
 def list_files_from_db(session, kb_name):
-    files = session.query(KnowledgeFileModel).filter_by(kb_name=kb_name).all()
+    files = session.query(KnowledgeFileModel).filter(KnowledgeFileModel.kb_name.ilike(kb_name)).all()
     docs = [f.file_name for f in files]
     return docs

@@ -89,8 +89,8 @@ def add_file_to_db(session,
     if kb:
         # 如果已经存在该文件,则更新文件信息与版本号
         existing_file: KnowledgeFileModel = (session.query(KnowledgeFileModel)
-                                             .filter_by(file_name=kb_file.filename,
-                                                        kb_name=kb_file.kb_name)
+                                             .filter(KnowledgeFileModel.kb_name.ilike(kb_file.kb_name),
+                                                     KnowledgeFileModel.file_name.ilike(kb_file.filename))
                                              .first())
         mtime = kb_file.get_mtime()
         size = kb_file.get_size()
@@ -122,14 +122,16 @@ def add_file_to_db(session,

 @with_session
 def delete_file_from_db(session, kb_file: KnowledgeFile):
-    existing_file = session.query(KnowledgeFileModel).filter_by(file_name=kb_file.filename,
-                                                                kb_name=kb_file.kb_name).first()
+    existing_file = (session.query(KnowledgeFileModel)
+                     .filter(KnowledgeFileModel.file_name.ilike(kb_file.filename),
+                             KnowledgeFileModel.kb_name.ilike(kb_file.kb_name))
+                     .first())
     if existing_file:
         session.delete(existing_file)
         delete_docs_from_db(kb_name=kb_file.kb_name, file_name=kb_file.filename)
         session.commit()

-        kb = session.query(KnowledgeBaseModel).filter_by(kb_name=kb_file.kb_name).first()
+        kb = session.query(KnowledgeBaseModel).filter(KnowledgeBaseModel.kb_name.ilike(kb_file.kb_name)).first()
         if kb:
             kb.file_count -= 1
             session.commit()
@@ -138,9 +140,9 @@ def delete_file_from_db(session, kb_file: KnowledgeFile):

 @with_session
 def delete_files_from_db(session, knowledge_base_name: str):
-    session.query(KnowledgeFileModel).filter_by(kb_name=knowledge_base_name).delete()
-    session.query(FileDocModel).filter_by(kb_name=knowledge_base_name).delete()
-    kb = session.query(KnowledgeBaseModel).filter_by(kb_name=knowledge_base_name).first()
+    session.query(KnowledgeFileModel).filter(KnowledgeFileModel.kb_name.ilike(knowledge_base_name)).delete(synchronize_session=False)
+    session.query(FileDocModel).filter(FileDocModel.kb_name.ilike(knowledge_base_name)).delete(synchronize_session=False)
+    kb = session.query(KnowledgeBaseModel).filter(KnowledgeBaseModel.kb_name.ilike(knowledge_base_name)).first()
     if kb:
         kb.file_count = 0

@@ -150,16 +152,19 @@ def delete_files_from_db(session, knowledge_base_name: str):

 @with_session
 def file_exists_in_db(session, kb_file: KnowledgeFile):
-    existing_file = session.query(KnowledgeFileModel).filter_by(file_name=kb_file.filename,
-                                                                kb_name=kb_file.kb_name).first()
+    existing_file = (session.query(KnowledgeFileModel)
+                     .filter(KnowledgeFileModel.file_name.ilike(kb_file.filename),
+                             KnowledgeFileModel.kb_name.ilike(kb_file.kb_name))
+                     .first())
     return True if existing_file else False


 @with_session
 def get_file_detail(session, kb_name: str, filename: str) -> dict:
     file: KnowledgeFileModel = (session.query(KnowledgeFileModel)
-                                .filter_by(file_name=filename,
-                                           kb_name=kb_name).first())
+                                .filter(KnowledgeFileModel.file_name.ilike(filename),
+                                        KnowledgeFileModel.kb_name.ilike(kb_name))
+                                .first())
     if file:
         return {
             "kb_name": file.kb_name,
@@ -12,7 +12,7 @@ def list_summary_from_db(session,
     列出某知识库chunk summary。
     返回形式:[{"id": str, "summary_context": str, "doc_ids": str}, ...]
     '''
-    docs = session.query(SummaryChunkModel).filter_by(kb_name=kb_name)
+    docs = session.query(SummaryChunkModel).filter(SummaryChunkModel.kb_name.ilike(kb_name))

     for k, v in metadata.items():
         docs = docs.filter(SummaryChunkModel.meta_data[k].as_string() == str(v))
@@ -33,8 +33,8 @@ def delete_summary_from_db(session,
     返回形式:[{"id": str, "summary_context": str, "doc_ids": str}, ...]
     '''
     docs = list_summary_from_db(kb_name=kb_name)
-    query = session.query(SummaryChunkModel).filter_by(kb_name=kb_name)
-    query.delete()
+    query = session.query(SummaryChunkModel).filter(SummaryChunkModel.kb_name.ilike(kb_name))
+    query.delete(synchronize_session=False)
     session.commit()
     return docs

@@ -63,4 +63,4 @@ def add_summary_to_db(session,

 @with_session
 def count_summary_from_db(session, kb_name: str) -> int:
-    return session.query(SummaryChunkModel).filter_by(kb_name=kb_name).count()
+    return session.query(SummaryChunkModel).filter(SummaryChunkModel.kb_name.ilike(kb_name)).count()
@@ -4,10 +4,24 @@ from server.knowledge_base.kb_service.base import EmbeddingsFunAdapter
 from server.utils import load_local_embeddings
 from server.knowledge_base.utils import get_vs_path
 from langchain.vectorstores.faiss import FAISS
+from langchain.docstore.in_memory import InMemoryDocstore
 from langchain.schema import Document
 import os
 from langchain.schema import Document


+# patch FAISS to include doc id in Document.metadata
+def _new_ds_search(self, search: str) -> Union[str, Document]:
+    if search not in self._dict:
+        return f"ID {search} not found."
+    else:
+        doc = self._dict[search]
+        if isinstance(doc, Document):
+            doc.metadata["id"] = search
+        return doc
+InMemoryDocstore.search = _new_ds_search
+
+
 class ThreadSafeFaiss(ThreadSafeObject):
     def __repr__(self) -> str:
         cls = type(self).__name__
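The patch above replaces InMemoryDocstore.search so that every Document fetched from the FAISS docstore carries its own key in metadata["id"], which is what lets search results be traced back to a concrete vector-store entry. A small illustration of the effect, assuming the patched module has been imported first (the document id is a placeholder):

    from langchain.docstore.in_memory import InMemoryDocstore
    from langchain.schema import Document

    ds = InMemoryDocstore({"doc-1": Document(page_content="hello")})
    doc = ds.search("doc-1")
    # with the patch applied: {'id': 'doc-1'}; without it the metadata stays empty
    print(doc.metadata)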
@@ -15,15 +15,12 @@ import json
 from server.knowledge_base.kb_service.base import KBServiceFactory
 from server.db.repository.knowledge_file_repository import get_file_detail
 from langchain.docstore.document import Document
-from typing import List
-
-
-class DocumentWithScore(Document):
-    score: float = None
+from server.knowledge_base.model.kb_document_model import DocumentWithVSId
+from typing import List, Dict


 def search_docs(
-        query: str = Body(..., description="用户输入", examples=["你好"]),
+        query: str = Body("", description="用户输入", examples=["你好"]),
         knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
         top_k: int = Body(VECTOR_SEARCH_TOP_K, description="匹配向量数"),
         score_threshold: float = Body(SCORE_THRESHOLD,
@@ -31,13 +28,34 @@ def search_docs(
                                       "SCORE越小,相关度越高,"
                                       "取到1相当于不筛选,建议设置在0.5左右",
                                       ge=0, le=1),
-) -> List[DocumentWithScore]:
+        file_name: str = Body("", description="文件名称,支持 sql 通配符"),
+        metadata: dict = Body({}, description="根据 metadata 进行过滤,仅支持一级键"),
+) -> List[DocumentWithVSId]:
+    kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
+    data = []
+    if kb is not None:
+        if query:
+            docs = kb.search_docs(query, top_k, score_threshold)
+            data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id")) for x in docs]
+        elif file_name or metadata:
+            data = kb.list_docs(file_name=file_name, metadata=metadata)
+    return data
+
+
+def update_docs_by_id(
+        knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
+        docs: Dict[str, Document] = Body(..., description="要更新的文档内容,形如:{id: Document, ...}")
+) -> BaseResponse:
+    '''
+    按照文档 ID 更新文档内容
+    '''
     kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
     if kb is None:
-        return []
-    docs = kb.search_docs(query, top_k, score_threshold)
-    data = [DocumentWithScore(**x[0].dict(), score=x[1]) for x in docs]
-    return data
+        return BaseResponse(code=500, msg=f"指定的知识库 {knowledge_base_name} 不存在")
+    if kb.update_doc_by_ids(docs=docs):
+        return BaseResponse(msg=f"文档更新成功")
+    else:
+        return BaseResponse(msg=f"文档更新失败")


 def list_files(
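A hypothetical client-side sketch (not part of this commit) showing how the two endpoints above combine: list a file's chunks via /knowledge_base/search_docs with the new file_name filter, edit one chunk, then write it back via /knowledge_base/update_docs_by_id. The base URL, knowledge base name, and file name are placeholders:

    import requests

    base_url = "http://127.0.0.1:7861"  # assumed address of the api.py server

    # retrieve all chunks of one file by name instead of by similarity search
    docs = requests.post(f"{base_url}/knowledge_base/search_docs", json={
        "knowledge_base_name": "samples",
        "query": "",
        "file_name": "README.md",  # SQL wildcards such as % are allowed
        "metadata": {},
    }).json()

    # rewrite the first chunk in place, keyed by its vector-store id
    if docs:
        doc = docs[0]
        resp = requests.post(f"{base_url}/knowledge_base/update_docs_by_id", json={
            "knowledge_base_name": "samples",
            "docs": {doc["id"]: {"page_content": doc["page_content"].strip(),
                                 "metadata": doc["metadata"]}},
        })
        print(resp.json())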
@@ -121,6 +121,7 @@ class KBService(ABC):
         for doc in docs:
             try:
                 source = doc.metadata.get("source", "")
-                rel_path = Path(source).relative_to(self.doc_path)
-                doc.metadata["source"] = str(rel_path.as_posix().strip("/"))
+                if os.path.isabs(source):
+                    rel_path = Path(source).relative_to(self.doc_path)
+                    doc.metadata["source"] = str(rel_path.as_posix().strip("/"))
             except Exception as e:
@@ -176,13 +177,33 @@ class KBService(ABC):
                     query: str,
                     top_k: int = VECTOR_SEARCH_TOP_K,
                     score_threshold: float = SCORE_THRESHOLD,
-                    ):
+                    ) -> List[Document]:
         docs = self.do_search(query, top_k, score_threshold)
         return docs

     def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
         return []

+    def del_doc_by_ids(self, ids: List[str]) -> bool:
+        raise NotImplementedError
+
+    def update_doc_by_ids(self, docs: Dict[str, Document]) -> bool:
+        '''
+        传入参数为: {doc_id: Document, ...}
+        如果对应 doc_id 的值为 None,或其 page_content 为空,则删除该文档
+        TODO:是否要支持新增 docs ?
+        '''
+        self.del_doc_by_ids(list(docs.keys()))
+        # keep only non-empty documents and re-add them under their original ids
+        kept_docs = []
+        ids = []
+        for k, v in docs.items():
+            if not v or not v.page_content.strip():
+                continue
+            ids.append(k)
+            kept_docs.append(v)
+        self.do_add_doc(docs=kept_docs, ids=ids)
+        return True
+
     def list_docs(self, file_name: str = None, metadata: Dict = {}) -> List[DocumentWithVSId]:
         '''
         通过file_name或metadata检索Document
@@ -190,10 +211,10 @@ class KBService(ABC):
         doc_infos = list_docs_from_db(kb_name=self.kb_name, file_name=file_name, metadata=metadata)
         docs = []
         for x in doc_infos:
-            doc_info_s = self.get_doc_by_ids([x["id"]])
-            if doc_info_s is not None and doc_info_s != []:
+            doc_info = self.get_doc_by_ids([x["id"]])[0]
+            if doc_info is not None:
                 # 处理非空的情况
-                doc_with_id = DocumentWithVSId(**doc_info_s[0].dict(), id=x["id"])
+                doc_with_id = DocumentWithVSId(**doc_info.dict(), id=x["id"])
                 docs.append(doc_with_id)
             else:
                 # 处理空的情况
@@ -249,6 +270,7 @@ class KBService(ABC):
     @abstractmethod
     def do_add_doc(self,
                    docs: List[Document],
+                   **kwargs,
                    ) -> List[Dict]:
         """
         向知识库添加文档子类实自己逻辑
@@ -371,12 +393,13 @@ def get_kb_file_details(kb_name: str) -> List[Dict]:
             "in_folder": True,
             "in_db": False,
         }
+    lower_names = {x.lower(): x for x in result}
     for doc in files_in_db:
         doc_detail = get_file_detail(kb_name, doc)
         if doc_detail:
             doc_detail["in_db"] = True
-            if doc in result:
-                result[doc].update(doc_detail)
+            if doc.lower() in lower_names:
+                result[lower_names[doc.lower()]].update(doc_detail)
             else:
                 doc_detail["in_folder"] = False
                 result[doc] = doc_detail
@@ -145,6 +145,14 @@ class ESKBService(KBService):
                                                      k=top_k)
         return docs

+    def del_doc_by_ids(self, ids: List[str]) -> bool:
+        for doc_id in ids:
+            try:
+                self.es_client_python.delete(index=self.index_name,
+                                             id=doc_id,
+                                             refresh=True)
+            except Exception as e:
+                logger.error(f"ES Docs Delete Error! {e}")
+
     def do_delete_doc(self, kb_file, **kwargs):
         if self.es_client_python.indices.exists(index=self.index_name):
@@ -168,7 +176,7 @@ class ESKBService(KBService):
                                                  id=doc_id,
                                                  refresh=True)
                 except Exception as e:
-                    logger.error("ES Docs Delete Error!")
+                    logger.error(f"ES Docs Delete Error! {e}")

         # self.db_init.delete(ids=delete_list)
         #self.es_client_python.indices.refresh(index=self.index_name)
@@ -36,6 +36,10 @@ class FaissKBService(KBService):
         with self.load_vector_store().acquire() as vs:
             return [vs.docstore._dict.get(id) for id in ids]

+    def del_doc_by_ids(self, ids: List[str]) -> bool:
+        with self.load_vector_store().acquire() as vs:
+            vs.delete(ids)
+
     def do_init(self):
         self.vector_name = self.vector_name or self.embed_model
         self.kb_path = self.get_kb_path()
@@ -72,7 +76,8 @@ class FaissKBService(KBService):

         with self.load_vector_store().acquire() as vs:
             ids = vs.add_embeddings(text_embeddings=zip(data["texts"], data["embeddings"]),
-                                    metadatas=data["metadatas"])
+                                    metadatas=data["metadatas"],
+                                    ids=kwargs.get("ids"))
             if not kwargs.get("not_refresh_vs_cache"):
                 vs.save_local(self.vs_path)
         doc_infos = [{"id": id, "metadata": doc.metadata} for id, doc in zip(ids, docs)]
@@ -83,7 +88,7 @@ class FaissKBService(KBService):
                       kb_file: KnowledgeFile,
                       **kwargs):
         with self.load_vector_store().acquire() as vs:
-            ids = [k for k, v in vs.docstore._dict.items() if v.metadata.get("source") == kb_file.filename]
+            ids = [k for k, v in vs.docstore._dict.items() if v.metadata.get("source").lower() == kb_file.filename.lower()]
             if len(ids) > 0:
                 vs.delete(ids)
             if not kwargs.get("not_refresh_vs_cache"):
@@ -31,6 +31,9 @@ class MilvusKBService(KBService):
             result.append(Document(page_content=text, metadata=data))
         return result

+    def del_doc_by_ids(self, ids: List[str]) -> bool:
+        self.milvus.col.delete(expr=f'pk in {ids}')
+
     @staticmethod
     def search(milvus_name, content, limit=3):
         search_params = {
@@ -29,6 +29,10 @@ class PGKBService(KBService):
                        connect.execute(stmt, parameters={'ids': ids}).fetchall()]
         return results

+    # TODO:
+    def del_doc_by_ids(self, ids: List[str]) -> bool:
+        return super().del_doc_by_ids(ids)
+
     def do_init(self):
         self._load_pg_vector()

@@ -29,6 +29,9 @@ class ZillizKBService(KBService):
             result.append(Document(page_content=text, metadata=data))
         return result

+    def del_doc_by_ids(self, ids: List[str]) -> bool:
+        self.zilliz.col.delete(expr=f'pk in {ids}')
+
     @staticmethod
     def search(zilliz_name, content, limit=3):
         search_params = {
@@ -7,4 +7,4 @@ class DocumentWithVSId(Document):
     矢量化后的文档
     """
     id: str = None
-
+    score: float = 3.0
@@ -71,7 +71,8 @@ def list_files_from_folder(kb_name: str):
                     for target_entry in target_it:
                         process_entry(target_entry)
             elif entry.is_file():
-                result.append(os.path.relpath(entry.path, doc_path))
+                file_path = (Path(os.path.relpath(entry.path, doc_path)).as_posix())  # 路径统一为 posix 格式
+                result.append(file_path)
             elif entry.is_dir():
                 with os.scandir(entry.path) as it:
                     for sub_entry in it:
@@ -272,7 +273,7 @@ class KnowledgeFile:
         对应知识库目录中的文件,必须是磁盘上存在的才能进行向量化等操作。
         '''
         self.kb_name = knowledge_base_name
-        self.filename = filename
+        self.filename = str(Path(filename).as_posix())
         self.ext = os.path.splitext(filename)[-1].lower()
         if self.ext not in SUPPORTED_EXTS:
            raise ValueError(f"暂未支持的文件格式 {self.filename}")
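Both changes above normalize stored file paths with Path.as_posix(), so a file scanned on Windows and the same file referenced on Linux produce identical keys in the database and the vector store. A tiny illustration (illustrative path only):

    from pathlib import PureWindowsPath

    raw = PureWindowsPath(r"subdir\notes\readme.md")
    print(raw.as_posix())  # subdir/notes/readme.md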
@@ -224,7 +224,7 @@ def dialogue_page(api: ApiRequest, is_lite: bool = False):
                 key="prompt_template_select",
             )
             prompt_template_name = st.session_state.prompt_template_select
-        temperature = st.slider("Temperature:", 0.0, 1.0, TEMPERATURE, 0.05)
+        temperature = st.slider("Temperature:", 0.0, 2.0, TEMPERATURE, 0.05)
         history_len = st.number_input("历史对话轮数:", 0, 20, HISTORY_LEN)

         def on_kb_change():
@@ -190,6 +190,7 @@ def knowledge_base_page(api: ApiRequest, is_lite: bool = None):
        # 知识库详情
        # st.info("请选择文件,点击按钮进行操作。")
        doc_details = pd.DataFrame(get_kb_file_details(kb))
+       selected_rows = []
        if not len(doc_details):
            st.info(f"知识库 `{kb}` 中暂无文件")
        else:
@@ -284,32 +285,80 @@ def knowledge_base_page(api: ApiRequest, is_lite: bool = None):

         st.divider()

-        cols = st.columns(3)
-
-        if cols[0].button(
-                "依据源文件重建向量库",
-                # help="无需上传文件,通过其它方式将文档拷贝到对应知识库content目录下,点击本按钮即可重建知识库。",
-                use_container_width=True,
-                type="primary",
-        ):
-            with st.spinner("向量库重构中,请耐心等待,勿刷新或关闭页面。"):
-                empty = st.empty()
-                empty.progress(0.0, "")
-                for d in api.recreate_vector_store(kb,
-                                                   chunk_size=chunk_size,
-                                                   chunk_overlap=chunk_overlap,
-                                                   zh_title_enhance=zh_title_enhance):
-                    if msg := check_error_msg(d):
-                        st.toast(msg)
-                    else:
-                        empty.progress(d["finished"] / d["total"], d["msg"])
-            st.rerun()
-
-        if cols[2].button(
-                "删除知识库",
-                use_container_width=True,
-        ):
-            ret = api.delete_knowledge_base(kb)
-            st.toast(ret.get("msg", " "))
-            time.sleep(1)
-            st.rerun()
+        # cols = st.columns(3)
+
+        # if cols[0].button(
+        #         "依据源文件重建向量库",
+        #         # help="无需上传文件,通过其它方式将文档拷贝到对应知识库content目录下,点击本按钮即可重建知识库。",
+        #         use_container_width=True,
+        #         type="primary",
+        # ):
+        #     with st.spinner("向量库重构中,请耐心等待,勿刷新或关闭页面。"):
+        #         empty = st.empty()
+        #         empty.progress(0.0, "")
+        #         for d in api.recreate_vector_store(kb,
+        #                                            chunk_size=chunk_size,
+        #                                            chunk_overlap=chunk_overlap,
+        #                                            zh_title_enhance=zh_title_enhance):
+        #             if msg := check_error_msg(d):
+        #                 st.toast(msg)
+        #             else:
+        #                 empty.progress(d["finished"] / d["total"], d["msg"])
+        #     st.rerun()
+
+        # if cols[2].button(
+        #         "删除知识库",
+        #         use_container_width=True,
+        # ):
+        #     ret = api.delete_knowledge_base(kb)
+        #     st.toast(ret.get("msg", " "))
+        #     time.sleep(1)
+        #     st.rerun()
+
+        # with st.sidebar:
+        #     keyword = st.text_input("查询关键字")
+        #     top_k = st.slider("匹配条数", 1, 100, 3)
+
+        st.write("文件内文档列表。双击进行修改,在删除列填入 Y 可删除对应行。")
+        docs = []
+        df = pd.DataFrame([], columns=["seq", "id", "content", "source"])
+        if selected_rows:
+            file_name = selected_rows[0]["file_name"]
+            docs = api.search_kb_docs(knowledge_base_name=selected_kb, file_name=file_name)
+            data = [{"seq": i+1, "id": x["id"], "page_content": x["page_content"], "source": x["metadata"].get("source"),
+                     "type": x["type"],
+                     "metadata": json.dumps(x["metadata"], ensure_ascii=False),
+                     "to_del": "",
+                     } for i, x in enumerate(docs)]
+            df = pd.DataFrame(data)
+
+        gb = GridOptionsBuilder.from_dataframe(df)
+        gb.configure_columns(["id", "source", "type", "metadata"], hide=True)
+        gb.configure_column("seq", "No.", width=50)
+        gb.configure_column("page_content", "内容", editable=True, autoHeight=True, wrapText=True, flex=1,
+                            cellEditor="agLargeTextCellEditor", cellEditorPopup=True)
+        gb.configure_column("to_del", "删除", editable=True, width=50, wrapHeaderText=True,
+                            cellEditor="agCheckboxCellEditor", cellRender="agCheckboxCellRenderer")
+        gb.configure_selection()
+        edit_docs = AgGrid(df, gb.build())
+
+        if st.button("保存更改"):
+            # origin_docs = {x["id"]: {"page_content": x["page_content"], "type": x["type"], "metadata": x["metadata"]} for x in docs}
+            changed_docs = []
+            for index, row in edit_docs.data.iterrows():
+                # origin_doc = origin_docs[row["id"]]
+                # if row["page_content"] != origin_doc["page_content"]:
+                if row["to_del"] not in ["Y", "y", 1]:
+                    changed_docs.append({
+                        "page_content": row["page_content"],
+                        "type": row["type"],
+                        "metadata": json.loads(row["metadata"]),
+                    })
+
+            if changed_docs:
+                if api.update_kb_docs(knowledge_base_name=selected_kb,
+                                      file_names=[file_name],
+                                      docs={file_name: changed_docs}):
+                    st.toast("更新文档成功")
+                else:
+                    st.toast("更新文档失败")
@@ -571,10 +571,12 @@ class ApiRequest:

     def search_kb_docs(
             self,
-            query: str,
             knowledge_base_name: str,
+            query: str = "",
             top_k: int = VECTOR_SEARCH_TOP_K,
             score_threshold: int = SCORE_THRESHOLD,
+            file_name: str = "",
+            metadata: dict = {},
     ) -> List:
         '''
         对应api.py/knowledge_base/search_docs接口
@@ -584,6 +586,8 @@ class ApiRequest:
             "knowledge_base_name": knowledge_base_name,
             "top_k": top_k,
             "score_threshold": score_threshold,
+            "file_name": file_name,
+            "metadata": metadata,
         }

         response = self.post(
@@ -592,6 +596,24 @@ class ApiRequest:
         )
         return self._get_response_value(response, as_json=True)

+    def update_docs_by_id(
+            self,
+            knowledge_base_name: str,
+            docs: Dict[str, Dict],
+    ) -> bool:
+        '''
+        对应api.py/knowledge_base/update_docs_by_id接口
+        '''
+        data = {
+            "knowledge_base_name": knowledge_base_name,
+            "docs": docs,
+        }
+        response = self.post(
+            "/knowledge_base/update_docs_by_id",
+            json=data
+        )
+        return self._get_response_value(response)
+
     def upload_kb_docs(
             self,
             files: List[Union[str, Path, bytes]],