编辑分块功能

This commit is contained in:
wvivi2023 2024-03-21 11:11:34 +08:00
parent 7b9369e625
commit 319475e0f6
8 changed files with 224 additions and 76 deletions

View File

@ -144,7 +144,7 @@ def mount_knowledge_routes(app: FastAPI):
from server.knowledge_base.kb_doc_api import (list_files, upload_docs, delete_docs, from server.knowledge_base.kb_doc_api import (list_files, upload_docs, delete_docs,
update_docs, download_doc, recreate_vector_store, update_docs, download_doc, recreate_vector_store,
search_docs, DocumentWithVSId, update_info, search_docs, DocumentWithVSId, update_info,
update_docs_by_id,search_content) update_docs_by_id,search_content,delete_docs_by_ids)
app.post("/chat/knowledge_base_chat", app.post("/chat/knowledge_base_chat",
tags=["Chat"], tags=["Chat"],
@ -202,6 +202,12 @@ def mount_knowledge_routes(app: FastAPI):
)(update_docs_by_id) )(update_docs_by_id)
app.post("/knowledge_base/delete_docs_by_ids",
tags=["Knowledge Base Management"],
response_model=BaseResponse,
summary="根据ids删除知识库文档"
)(delete_docs_by_ids)
app.post("/knowledge_base/upload_docs", app.post("/knowledge_base/upload_docs",
tags=["Knowledge Base Management"], tags=["Knowledge Base Management"],
response_model=BaseResponse, response_model=BaseResponse,

View File

@ -41,6 +41,45 @@ def delete_docs_from_db(session,
session.commit() session.commit()
return docs return docs
@with_session
def delete_docs_from_db_by_ids(session,
                               ids: List[str]
                               ):
    """
    Delete every FileDocModel row whose ``doc_id`` is one of ``ids``.

    The ids are matched exactly. A pattern match (LIKE/ILIKE) is deliberately
    not used here: ``%`` and ``_`` inside an id would act as wildcards and
    could remove rows belonging to other documents.

    :param session: database session injected by ``with_session``.
    :param ids: document ids to remove; an empty list is a no-op.
    :return: always True (errors surface as exceptions from the session).
    """
    if not ids:
        # Nothing to delete; avoid emitting a DELETE with an empty IN () list.
        return True

    # One bulk DELETE instead of one statement per id.
    (session.query(FileDocModel)
        .filter(FileDocModel.doc_id.in_(ids))
        .delete(synchronize_session=False))
    session.commit()
    return True
@with_session
def count_docs_from_db(session, kb_name: str, file_name: str) -> int:
    """
    Return how many doc entries ``list_docs_from_db`` reports for a file.

    The injected ``session`` is not used by this function itself; the lookup
    is delegated to ``list_docs_from_db`` with ``kb_name`` and ``file_name``.
    """
    return len(list_docs_from_db(kb_name=kb_name, file_name=file_name))
@with_session
def update_file_to_db(session,
                      knowledge_base_name: str,
                      file_name: str):
    """
    Bump the version of an existing knowledge file and refresh its docs_count.

    Looks up the knowledge base by name; if it does not exist a message is
    printed and nothing else happens. If the file record exists, its
    ``file_version`` is incremented and ``docs_count`` is set to the value
    returned by ``count_docs_from_db``. Nothing is returned.

    NOTE(review): the "invalid kb" branch is assumed to belong to the
    knowledge-base lookup (not to the file lookup) — confirm against the
    original indentation.
    """
    kb = session.query(KnowledgeBaseModel).filter_by(kb_name=knowledge_base_name).first()
    if not kb:
        print(f"无效的kb")
        return

    # Only an already-registered file is updated; a missing file is ignored.
    file_query = session.query(KnowledgeFileModel).filter(
        KnowledgeFileModel.kb_name.ilike(knowledge_base_name),
        KnowledgeFileModel.file_name.ilike(file_name),
    )
    existing_file: KnowledgeFileModel = file_query.first()
    if not existing_file:
        return

    existing_file.file_version += 1
    doc_total = count_docs_from_db(knowledge_base_name, file_name)
    print(f"*****update_file_to_db 后count 是{doc_total}")
    existing_file.docs_count = doc_total
    print(f"******knowledge_file_repository 更新knowledge_file***existing_file name :{existing_file.file_name}")
@with_session @with_session
def add_docs_to_db(session, def add_docs_to_db(session,
@ -101,6 +140,7 @@ def add_file_to_db(session,
existing_file.docs_count = docs_count existing_file.docs_count = docs_count
existing_file.custom_docs = custom_docs existing_file.custom_docs = custom_docs
existing_file.file_version += 1 existing_file.file_version += 1
print(f"******knowledge_file_repository 更新knowledge_file***existing_file name :{existing_file.file_name}")
# 否则,添加新文件 # 否则,添加新文件
else: else:
new_file = KnowledgeFileModel( new_file = KnowledgeFileModel(

View File

@ -137,6 +137,22 @@ def update_docs_by_id(
else: else:
return BaseResponse(msg=f"文档更新失败") return BaseResponse(msg=f"文档更新失败")
def delete_docs_by_ids(knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
                       file_name: str = Body(..., description="文件名"),
                       ids: List[str] = Body(..., description="要删除的文档ID列表，形如：[id1,id2,...]")
                       ) -> BaseResponse:
    '''
    Delete documents from a knowledge base by their document ids.

    The ids are removed first through ``kb.del_doc_by_ids`` and then through
    ``kb.del_doc_by_ids_from_db`` for the given file. Both calls are always
    made; the request is reported as successful only when both return a
    truthy value.

    Returns a BaseResponse with code 500 when the knowledge base does not
    exist or when either deletion step reports failure, so that clients can
    tell failure from success by the response code and not only by the text.
    '''
    kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
    if kb is None:
        return BaseResponse(code=500, msg=f"指定的知识库 {knowledge_base_name} 不存在")

    removed_from_store = kb.del_doc_by_ids(ids)
    removed_from_db = kb.del_doc_by_ids_from_db(knowledge_base_name, file_name, ids)
    if removed_from_store and removed_from_db:
        return BaseResponse(msg="文档删除成功")
    # The failure branch must not fall back to the default success code.
    return BaseResponse(code=500, msg="文档删除失败")
def list_files( def list_files(
knowledge_base_name: str knowledge_base_name: str
@ -339,6 +355,7 @@ def update_docs(
failed_files = {} failed_files = {}
kb_files = [] kb_files = []
print(f"111111 kb_doc_api update_docs file_name:{file_names},更新的doc 长度:{len(docs)}")
# 生成需要加载docs的文件列表 # 生成需要加载docs的文件列表
for file_name in file_names: for file_name in file_names:
file_detail = get_file_detail(kb_name=knowledge_base_name, filename=file_name) file_detail = get_file_detail(kb_name=knowledge_base_name, filename=file_name)
@ -346,13 +363,9 @@ def update_docs(
if file_detail.get("custom_docs") and not override_custom_docs: if file_detail.get("custom_docs") and not override_custom_docs:
continue continue
if file_name not in docs: if file_name not in docs:
print(f"****kb_doc_api update_docs file_name not in docs")
try: try:
kb_files.append(KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name)) kb_files.append(KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name))
except Exception as e:
msg = f"加载文档 {file_name} 时出错:{e}"
logger.error(f'{e.__class__.__name__}: {msg}',
exc_info=e if log_verbose else None)
failed_files[file_name] = msg
# 从文件生成docs并进行向量化。 # 从文件生成docs并进行向量化。
# 这里利用了KnowledgeFile的缓存功能在多线程中加载Document然后传给KnowledgeFile # 这里利用了KnowledgeFile的缓存功能在多线程中加载Document然后传给KnowledgeFile
@ -371,8 +384,17 @@ def update_docs(
kb_name, file_name, error = result kb_name, file_name, error = result
failed_files[file_name] = error failed_files[file_name] = error
except Exception as e:
msg = f"加载文档 {file_name} 时出错:{e}"
logger.error(f'{e.__class__.__name__}: {msg}',
exc_info=e if log_verbose else None)
failed_files[file_name] = msg
else:
print(f"****kb_doc_api update_docs file_name in docs")
# 将自定义的docs进行向量化 # 将自定义的docs进行向量化
for file_name, v in docs.items(): for file_name, v in docs.items():
print(f"222222 kb_doc_api update_docs file_name:{file_name},更新的doc 长度:{len(docs)}")
try: try:
print(f"kb_doc_api update_docs 自定义的docs 向量化filename:{file_name}") print(f"kb_doc_api update_docs 自定义的docs 向量化filename:{file_name}")
v = [x if isinstance(x, Document) else Document(**x) for x in v] v = [x if isinstance(x, Document) else Document(**x) for x in v]

View File

@ -14,7 +14,7 @@ from server.db.repository.knowledge_base_repository import (
from server.db.repository.knowledge_file_repository import ( from server.db.repository.knowledge_file_repository import (
add_file_to_db, delete_file_from_db, delete_files_from_db, file_exists_in_db, add_file_to_db, delete_file_from_db, delete_files_from_db, file_exists_in_db,
count_files_from_db, list_files_from_db, get_file_detail, delete_file_from_db, count_files_from_db, list_files_from_db, get_file_detail, delete_file_from_db,
list_docs_from_db, list_docs_from_db,delete_docs_from_db_by_ids,update_file_to_db
) )
from configs import (kbs_config, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, from configs import (kbs_config, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD,
@ -112,9 +112,11 @@ class KBService(ABC):
custom_docs = True custom_docs = True
for doc in docs: for doc in docs:
doc.metadata.setdefault("source", kb_file.filename) doc.metadata.setdefault("source", kb_file.filename)
print(f"kb_doc_api add_doc docs 不为空len(docs){len(docs)}")
else: else:
docs = kb_file.file2text() docs = kb_file.file2text()
custom_docs = False custom_docs = False
print(f"kb_doc_api add_doc docs 为空len(docs){len(docs)}")
if docs: if docs:
# 将 metadata["source"] 改为相对路径 # 将 metadata["source"] 改为相对路径
@ -165,9 +167,9 @@ class KBService(ABC):
使用content中的文件更新向量库 使用content中的文件更新向量库
如果指定了docs则使用自定义docs并将数据库对应条目标为custom_docs=True 如果指定了docs则使用自定义docs并将数据库对应条目标为custom_docs=True
""" """
if os.path.exists(kb_file.filepath): if os.path.exists(kb_file.filepath) and docs is None:
print(f"{kb_file.filename} exists")
self.delete_doc(kb_file, **kwargs) self.delete_doc(kb_file, **kwargs)
return self.add_doc(kb_file, docs=docs, **kwargs) return self.add_doc(kb_file, docs=docs, **kwargs)
def exist_doc(self, file_name: str): def exist_doc(self, file_name: str):
@ -209,6 +211,13 @@ class KBService(ABC):
def del_doc_by_ids(self, ids: List[str]) -> bool: def del_doc_by_ids(self, ids: List[str]) -> bool:
raise NotImplementedError raise NotImplementedError
def del_doc_by_ids_from_db(self, knowledge_base_name: str, file_name: str, ids: List[str]) -> bool:
    """
    Remove the given document ids from the database, then refresh the file record.

    Calls ``delete_docs_from_db_by_ids`` with ``ids`` and afterwards
    ``update_file_to_db`` for (knowledge_base_name, file_name). The results of
    both repository calls are not inspected; the method always returns True.
    """
    delete_docs_from_db_by_ids(ids=ids)
    # Must run after the deletion so the file record reflects the new state.
    update_file_to_db(knowledge_base_name, file_name)
    print(f"*******KBService del_doc_by_ids_from_db")
    return True
def update_doc_by_ids(self, docs: Dict[str, Document]) -> bool: def update_doc_by_ids(self, docs: Dict[str, Document]) -> bool:
''' '''
传入参数为 {doc_id: Document, ...} 传入参数为 {doc_id: Document, ...}
@ -230,16 +239,25 @@ class KBService(ABC):
通过file_name或metadata检索Document 通过file_name或metadata检索Document
''' '''
doc_infos = list_docs_from_db(kb_name=self.kb_name, file_name=file_name, metadata=metadata) doc_infos = list_docs_from_db(kb_name=self.kb_name, file_name=file_name, metadata=metadata)
print(f"kb_doc_api list_docs_from_db: {doc_infos}")
docs = [] docs = []
for x in doc_infos: for x in doc_infos:
doc_info = self.get_doc_by_ids([x["id"]])[0] doc_info = self.get_doc_by_ids([x["id"]])
if doc_info is not None: #print(f"kb_doc_api doc_info: {doc_info}")
#if doc_info is not None:
if doc_info is not None and isinstance(doc_info, list):
if doc_info:
# 处理非空的情况 # 处理非空的情况
doc_with_id = DocumentWithVSId(**doc_info.dict(), id=x["id"]) #data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id")) for x in docs]
doc_with_id = DocumentWithVSId(**doc_info[0].dict(), id=x["id"])
docs.append(doc_with_id) docs.append(doc_with_id)
else: else:
# 处理空的情况 # 处理 doc_info 为空列表的情况
pass
else:
# 处理 doc_info 是 NoneType 或者不是列表的情况
# 可以选择跳过当前循环迭代或执行其他操作 # 可以选择跳过当前循环迭代或执行其他操作
print("base.py list_docs 返回为空")
pass pass
return docs return docs

View File

@ -219,7 +219,24 @@ class ESKBService(KBService):
] ]
return docs_and_scores return docs_and_scores
def del_doc_by_ids(self, ids: List[str]) -> bool: def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
result_list = []
for doc_id in ids:
try:
result = self.es_client_python.get(index=self.index_name,
id=doc_id)
#print(f"es_kb_service:result:{result}")
result_list.append(Document(
page_content=result["_source"]["context"],
metadata=result["_source"]["metadata"],
))
except Exception as e:
logger.error(f"ES Docs Get Error! {e}")
return result_list
def del_doc_by_ids(self,ids: List[str]) -> bool:
print(f"es_kb_service del_doc_by_ids")
for doc_id in ids: for doc_id in ids:
try: try:
self.es_client_python.delete(index=self.index_name, self.es_client_python.delete(index=self.index_name,
@ -228,6 +245,7 @@ class ESKBService(KBService):
except Exception as e: except Exception as e:
logger.error(f"ES Docs Delete Error! {e}") logger.error(f"ES Docs Delete Error! {e}")
def do_delete_doc(self, kb_file, **kwargs): def do_delete_doc(self, kb_file, **kwargs):
base_file_name = os.path.basename(kb_file.filepath) base_file_name = os.path.basename(kb_file.filepath)
if self.es_client_python.indices.exists(index=self.index_name): if self.es_client_python.indices.exists(index=self.index_name):

View File

@ -40,6 +40,7 @@ class FaissKBService(KBService):
with self.load_vector_store().acquire() as vs: with self.load_vector_store().acquire() as vs:
vs.delete(ids) vs.delete(ids)
def do_init(self): def do_init(self):
self.vector_name = self.vector_name or self.embed_model self.vector_name = self.vector_name or self.embed_model
self.kb_path = self.get_kb_path() self.kb_path = self.get_kb_path()

View File

@ -51,6 +51,10 @@ def file_exists(kb: str, selected_rows: List) -> Tuple[str, str]:
return file_name, file_path return file_name, file_path
return "", "" return "", ""
def get_limited_string(data, max_length, key='your_column_name'):
    """
    Return the string stored under ``key`` in ``data``, cut to ``max_length`` characters.

    :param data: mapping to read from (anything with a dict-style ``get``).
    :param max_length: maximum number of characters to keep.
    :param key: entry to read; defaults to the placeholder name the function
        originally hard-coded, so existing two-argument calls behave as before.
    :return: the (possibly truncated) value, or '' when ``key`` is missing.
    """
    # Slicing already returns the whole string when it is shorter than max_length.
    return data.get(key, '')[:max_length]
def knowledge_base_page(api: ApiRequest, is_lite: bool = None): def knowledge_base_page(api: ApiRequest, is_lite: bool = None):
try: try:
@ -315,53 +319,72 @@ def knowledge_base_page(api: ApiRequest, is_lite: bool = None):
time.sleep(1) time.sleep(1)
st.rerun() st.rerun()
# with st.sidebar: with st.sidebar:
# keyword = st.text_input("查询关键字") keyword = st.text_input("查询关键字")
# top_k = st.slider("匹配条数", 1, 100, 3) top_k = st.slider("匹配条数", 1, 100, 3)
# st.write("文件内文档列表。双击进行修改,在删除列填入 Y 可删除对应行。") st.write("文件内文档列表。双击进行修改,在删除列填入 Y 可删除对应行。")
# docs = [] docs = []
# df = pd.DataFrame([], columns=["seq", "id", "content", "source"]) df = pd.DataFrame([], columns=["seq", "id", "content", "source"])
# if selected_rows: if selected_rows:
# file_name = selected_rows[0]["file_name"] file_name = selected_rows[0]["file_name"]
# docs = api.search_kb_docs(knowledge_base_name=selected_kb, file_name=file_name) print(f"选中的file_name:{file_name},kb:{selected_kb}")
# data = [ docs = api.search_kb_docs(knowledge_base_name=selected_kb, file_name=file_name)
# {"seq": i + 1, "id": x["id"], "page_content": x["page_content"], "source": x["metadata"].get("source"), print(f"选中的file_nameapi.search_kb_docs{docs}")
# "type": x["type"], if isinstance(docs, list):
# "metadata": json.dumps(x["metadata"], ensure_ascii=False), data = [
# "to_del": "", {"seq": i + 1, "id": x["id"], "page_content": x["page_content"], "source": x["metadata"].get("source"),
# } for i, x in enumerate(docs)] "type": x["type"],
# df = pd.DataFrame(data) "metadata": json.dumps(x["metadata"], ensure_ascii=False),
"to_del": "",
} for i, x in enumerate(docs)]
df = pd.DataFrame(data)
# # gb = GridOptionsBuilder.from_dataframe(df) gb = GridOptionsBuilder.from_dataframe(df)
# # gb.configure_columns(["id", "source", "type", "metadata"], hide=True) gb.configure_columns(["id", "source", "type", "metadata"], hide=True)
# # gb.configure_column("seq", "No.", width=50) gb.configure_column("seq", "No.", width=50)
# # gb.configure_column("page_content", "内容", editable=True, autoHeight=True, wrapText=True, flex=1, gb.configure_column("page_content", "内容", editable=True, autoHeight=True, wrapText=True, flex=1,
# # cellEditor="agLargeTextCellEditor", cellEditorPopup=True) cellEditor="agLargeTextCellEditor", cellEditorPopup=True, autoWidth=True,cellEditorParams= { "maxLength": 1000})
# # gb.configure_column("to_del", "删除", editable=True, width=50, wrapHeaderText=True, gb.configure_column("to_del", "删除", editable=True, width=50, wrapHeaderText=True,
# # cellEditor="agCheckboxCellEditor", cellRender="agCheckboxCellRenderer") cellEditor="agCheckboxCellEditor", cellRender="agCheckboxCellRenderer")
# # gb.configure_selection() gb.configure_selection()
# # edit_docs = AgGrid(df, gb.build()) edit_docs = AgGrid(df, gb.build())
# if st.button("保存更改"): if st.button("保存更改"):
# origin_docs = { origin_docs = {
# x["id"]: {"page_content": x["page_content"], "type": x["type"], "metadata": x["metadata"]} for x in x["id"]: {"page_content": x["page_content"], "type": x["type"], "metadata": x["metadata"]} for x in
# docs} docs}
# changed_docs = [] changed_docs = []
# for index, row in edit_docs.data.iterrows(): delete_docs_ids = []
# origin_doc = origin_docs[row["id"]] for index, row in edit_docs.data.iterrows():
# if row["page_content"] != origin_doc["page_content"]: id_string = row["id"]
# if row["to_del"] not in ["Y", "y", 1]: print(f"""edit_docs: index:{index},row[id]:{id_string}""")
# changed_docs.append({ origin_doc = origin_docs[row["id"]]
# "page_content": row["page_content"], #if row["page_content"] != origin_doc["page_content"]:
# "type": row["type"], if row["to_del"] not in ["Y", "y", 1]:
# "metadata": json.loads(row["metadata"]), changed_docs.append({
# }) "page_content": row["page_content"],
"type": row["type"],
"metadata": json.loads(row["metadata"]),
})
elif row["to_del"] in ["Y", "y", 1]:
delete_docs_ids.append(id_string)
print(f"""删除的文档id,row[id]:{id_string}, 文档内容:{row["page_content"]}""")
# if changed_docs: if changed_docs:
# if api.update_kb_docs(knowledge_base_name=selected_kb, print(f"更新的文档有 长度:{len(changed_docs)}")
# file_names=[file_name], if api.update_kb_docs(knowledge_base_name=selected_kb,
# docs={file_name: changed_docs}): file_names=[file_name],
# st.toast("更新文档成功") docs={file_name: changed_docs}):
# else: print("更新文档成功")
# st.toast("更新文档失败") st.toast("更新文档成功")
else:
print("更新文档失败")
st.toast("更新文档失败")
if delete_docs_ids:
if api.delete_docs_by_ids(knowledge_base_name=selected_kb,file_name = file_name, ids= delete_docs_ids):
print("删除文档成功")
st.toast("删除文档成功")
else:
print("删除文档失败")
st.toast("删除文档失败")

View File

@ -596,6 +596,26 @@ class ApiRequest:
) )
return self._get_response_value(response) return self._get_response_value(response)
def delete_docs_by_ids(
        self,
        knowledge_base_name: str,
        file_name: str,
        ids: list[str],
) -> bool:
    '''
    Client wrapper for the POST /knowledge_base/delete_docs_by_ids endpoint.

    Sends the knowledge base name, file name and document ids as the JSON
    body and returns whatever ``_get_response_value`` produces for the
    response.
    NOTE(review): the annotation says bool, but the actual value depends on
    ``_get_response_value`` — confirm before relying on its truthiness.
    '''
    payload = dict(
        knowledge_base_name=knowledge_base_name,
        file_name=file_name,
        ids=ids,
    )
    return self._get_response_value(
        self.post("/knowledge_base/delete_docs_by_ids", json=payload)
    )
def upload_kb_docs( def upload_kb_docs(
self, self,
files: List[Union[str, Path, bytes]], files: List[Union[str, Path, bytes]],