编辑分块功能

This commit is contained in:
wvivi2023 2024-03-21 11:11:34 +08:00
parent 7b9369e625
commit 319475e0f6
8 changed files with 224 additions and 76 deletions

View File

@ -144,7 +144,7 @@ def mount_knowledge_routes(app: FastAPI):
from server.knowledge_base.kb_doc_api import (list_files, upload_docs, delete_docs,
update_docs, download_doc, recreate_vector_store,
search_docs, DocumentWithVSId, update_info,
update_docs_by_id,search_content)
update_docs_by_id,search_content,delete_docs_by_ids)
app.post("/chat/knowledge_base_chat",
tags=["Chat"],
@ -202,6 +202,12 @@ def mount_knowledge_routes(app: FastAPI):
)(update_docs_by_id)
app.post("/knowledge_base/delete_docs_by_ids",
tags=["Knowledge Base Management"],
response_model=BaseResponse,
summary="根据ids删除知识库文档"
)(delete_docs_by_ids)
app.post("/knowledge_base/upload_docs",
tags=["Knowledge Base Management"],
response_model=BaseResponse,

View File

@ -41,7 +41,46 @@ def delete_docs_from_db(session,
session.commit()
return docs
@with_session
def delete_docs_from_db_by_ids(session,
                               ids: List[str]
                               ):
    """
    Delete the FileDocModel rows whose ``doc_id`` is one of ``ids``.

    Rows are matched by exact equality. Matching with ``ilike`` (as was done
    before) is a case-insensitive LIKE pattern match, so a ``%`` or ``_``
    inside an id would act as a wildcard and could delete unrelated rows.

    :param session: database session; callers do not pass it themselves,
        it is expected to be injected by ``with_session``.
    :param ids: doc ids to delete. An empty list deletes nothing.
    :return: always True.
    """
    if ids:
        # One DELETE ... WHERE doc_id IN (...) instead of one statement per id.
        (session.query(FileDocModel)
                .filter(FileDocModel.doc_id.in_(list(ids)))
                .delete(synchronize_session=False))
    session.commit()
    return True
@with_session
def count_docs_from_db(session, kb_name: str,file_name:str) -> int:
    """
    Count the doc records stored for one file of one knowledge base.

    The count is the length of the list returned by ``list_docs_from_db``;
    the ``session`` argument is not used by this function itself.
    """
    return len(list_docs_from_db(kb_name=kb_name, file_name=file_name))
@with_session
def update_file_to_db(session,
                      knowledge_base_name: str,
                      file_name:str):
    """
    Refresh the stored record of a knowledge file after its docs changed.

    Looks up the knowledge base by name. When it exists and a matching
    KnowledgeFileModel row is found (case-insensitive ``ilike`` match on both
    the knowledge base name and the file name), the row's ``file_version`` is
    incremented and ``docs_count`` is set to the value returned by
    ``count_docs_from_db``. Nothing is returned.

    :param session: database session used for the lookups.
    :param knowledge_base_name: name of the knowledge base owning the file.
    :param file_name: name of the file whose record is refreshed.
    """
    kb = session.query(KnowledgeBaseModel).filter_by(kb_name=knowledge_base_name).first()
    if kb:
        # If the file already exists, update its info and version number.
        existing_file: KnowledgeFileModel = (session.query(KnowledgeFileModel)
                                             .filter(KnowledgeFileModel.kb_name.ilike(knowledge_base_name),
                                                     KnowledgeFileModel.file_name.ilike(file_name))
                                             .first())
        if existing_file:
            existing_file.file_version += 1
            # Recount the docs of this file and store the fresh total on the row.
            count = count_docs_from_db(knowledge_base_name, file_name)
            print(f"*****update_file_to_db 后count 是{count}")
            existing_file.docs_count= count
            print(f"******knowledge_file_repository 更新knowledge_file***existing_file name :{existing_file.file_name}")
    # NOTE(review): this else is assumed to pair with "if kb" (the message
    # reports an invalid knowledge base) - confirm against the original layout.
    else:
        print(f"无效的kb")
@with_session
def add_docs_to_db(session,
kb_name: str,
@ -101,6 +140,7 @@ def add_file_to_db(session,
existing_file.docs_count = docs_count
existing_file.custom_docs = custom_docs
existing_file.file_version += 1
print(f"******knowledge_file_repository 更新knowledge_file***existing_file name :{existing_file.file_name}")
# 否则,添加新文件
else:
new_file = KnowledgeFileModel(

View File

@ -137,7 +137,23 @@ def update_docs_by_id(
else:
return BaseResponse(msg=f"文档更新失败")
def delete_docs_by_ids(knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
                       file_name: str = Body(..., description="文件名"),
                       ids: List[str] = Body(..., description="要删除的文档ID列表,形如:[id1,id2....]")
                       ) -> BaseResponse:
    '''
    Delete documents (chunks) of a file by their ids.

    The ids are removed through the knowledge base service first
    (``del_doc_by_ids``) and then from the database
    (``del_doc_by_ids_from_db``). Both calls are always made; the request is
    reported as successful only when both return a truthy value.

    :param knowledge_base_name: name of the knowledge base to delete from.
    :param file_name: name of the file the documents belong to.
    :param ids: ids of the documents to delete.
    :return: BaseResponse with the default code on success, or code 500 when
        the knowledge base does not exist or either deletion step fails.
    '''
    kb = KBServiceFactory.get_service_by_name(knowledge_base_name)
    if kb is None:
        return BaseResponse(code=500, msg=f"指定的知识库 {knowledge_base_name} 不存在")

    result1 = kb.del_doc_by_ids(ids)
    result2 = kb.del_doc_by_ids_from_db(knowledge_base_name, file_name, ids)
    if result1 and result2:
        return BaseResponse(msg="文档删除成功")
    # Report failures with an error code, like the missing-knowledge-base case
    # above, so clients can tell them apart from a success response.
    return BaseResponse(code=500, msg="文档删除失败")
def list_files(
knowledge_base_name: str
) -> ListResponse:
@ -339,6 +355,7 @@ def update_docs(
failed_files = {}
kb_files = []
print(f"111111 kb_doc_api update_docs file_name:{file_names},更新的doc 长度:{len(docs)}")
# 生成需要加载docs的文件列表
for file_name in file_names:
file_detail = get_file_detail(kb_name=knowledge_base_name, filename=file_name)
@ -346,33 +363,38 @@ def update_docs(
if file_detail.get("custom_docs") and not override_custom_docs:
continue
if file_name not in docs:
print(f"****kb_doc_api update_docs file_name not in docs")
try:
kb_files.append(KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name))
# 从文件生成docs并进行向量化。
# 这里利用了KnowledgeFile的缓存功能在多线程中加载Document然后传给KnowledgeFile
for status, result in files2docs_in_thread(kb_files,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
zh_title_enhance=zh_title_enhance):
if status:
print(f"kb_doc_api update_docs 文件生成docs并向量化filename:{file_name}")
kb_name, file_name, new_docs = result
kb_file = KnowledgeFile(filename=file_name,
knowledge_base_name=knowledge_base_name)
kb_file.splited_docs = new_docs
kb.update_doc(kb_file, not_refresh_vs_cache=True)
else:
kb_name, file_name, error = result
failed_files[file_name] = error
except Exception as e:
msg = f"加载文档 {file_name} 时出错:{e}"
logger.error(f'{e.__class__.__name__}: {msg}',
exc_info=e if log_verbose else None)
failed_files[file_name] = msg
# 从文件生成docs并进行向量化。
# 这里利用了KnowledgeFile的缓存功能在多线程中加载Document然后传给KnowledgeFile
for status, result in files2docs_in_thread(kb_files,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
zh_title_enhance=zh_title_enhance):
if status:
print(f"kb_doc_api update_docs 文件生成docs并向量化filename:{file_name}")
kb_name, file_name, new_docs = result
kb_file = KnowledgeFile(filename=file_name,
knowledge_base_name=knowledge_base_name)
kb_file.splited_docs = new_docs
kb.update_doc(kb_file, not_refresh_vs_cache=True)
else:
kb_name, file_name, error = result
failed_files[file_name] = error
print(f"****kb_doc_api update_docs file_name in docs")
# 将自定义的docs进行向量化
for file_name, v in docs.items():
print(f"222222 kb_doc_api update_docs file_name:{file_name},更新的doc 长度:{len(docs)}")
try:
print(f"kb_doc_api update_docs 自定义的docs 向量化filename:{file_name}")
v = [x if isinstance(x, Document) else Document(**x) for x in v]

View File

@ -14,7 +14,7 @@ from server.db.repository.knowledge_base_repository import (
from server.db.repository.knowledge_file_repository import (
add_file_to_db, delete_file_from_db, delete_files_from_db, file_exists_in_db,
count_files_from_db, list_files_from_db, get_file_detail, delete_file_from_db,
list_docs_from_db,
list_docs_from_db,delete_docs_from_db_by_ids,update_file_to_db
)
from configs import (kbs_config, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD,
@ -112,9 +112,11 @@ class KBService(ABC):
custom_docs = True
for doc in docs:
doc.metadata.setdefault("source", kb_file.filename)
print(f"kb_doc_api add_doc docs 不为空len(docs){len(docs)}")
else:
docs = kb_file.file2text()
custom_docs = False
print(f"kb_doc_api add_doc docs 为空len(docs){len(docs)}")
if docs:
# 将 metadata["source"] 改为相对路径
@ -165,10 +167,10 @@ class KBService(ABC):
使用content中的文件更新向量库
如果指定了docs则使用自定义docs并将数据库对应条目标为custom_docs=True
"""
if os.path.exists(kb_file.filepath):
print(f"{kb_file.filename} exists")
if os.path.exists(kb_file.filepath) and docs is None:
self.delete_doc(kb_file, **kwargs)
return self.add_doc(kb_file, docs=docs, **kwargs)
return self.add_doc(kb_file, docs=docs, **kwargs)
def exist_doc(self, file_name: str):
return file_exists_in_db(KnowledgeFile(knowledge_base_name=self.kb_name,
@ -209,6 +211,13 @@ class KBService(ABC):
def del_doc_by_ids(self, ids: List[str]) -> bool:
    """
    Delete the documents with the given ids.

    This version only raises NotImplementedError; presumably each concrete
    knowledge base service overrides it - verify against the subclasses.

    :param ids: ids of the documents to delete.
    :raises NotImplementedError: always.
    """
    raise NotImplementedError
def del_doc_by_ids_from_db(self, knowledge_base_name: str , file_name:str, ids: List[str]) -> bool:
    '''
    Delete the doc records with the given ids from the database, then refresh
    the stored record of the file they belonged to.

    The results of the two repository calls are not inspected; the method
    always returns True.
    '''
    delete_docs_from_db_by_ids(ids)
    update_file_to_db(file_name=file_name, knowledge_base_name=knowledge_base_name)
    print("*******KBService del_doc_by_ids_from_db")
    return True
def update_doc_by_ids(self, docs: Dict[str, Document]) -> bool:
'''
传入参数为 {doc_id: Document, ...}
@ -230,16 +239,25 @@ class KBService(ABC):
通过file_name或metadata检索Document
'''
doc_infos = list_docs_from_db(kb_name=self.kb_name, file_name=file_name, metadata=metadata)
print(f"kb_doc_api list_docs_from_db: {doc_infos}")
docs = []
for x in doc_infos:
doc_info = self.get_doc_by_ids([x["id"]])[0]
if doc_info is not None:
doc_info = self.get_doc_by_ids([x["id"]])
#print(f"kb_doc_api doc_info: {doc_info}")
#if doc_info is not None:
if doc_info is not None and isinstance(doc_info, list):
if doc_info:
# 处理非空的情况
doc_with_id = DocumentWithVSId(**doc_info.dict(), id=x["id"])
docs.append(doc_with_id)
#data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id")) for x in docs]
doc_with_id = DocumentWithVSId(**doc_info[0].dict(), id=x["id"])
docs.append(doc_with_id)
else:
# 处理 doc_info 为空列表的情况
pass
else:
# 处理空的情况
# 处理 doc_info 是 NoneType 或者不是列表的情况
# 可以选择跳过当前循环迭代或执行其他操作
print("base.py list_docs 返回为空")
pass
return docs

View File

@ -218,8 +218,25 @@ class ESKBService(KBService):
for hit in hits
]
return docs_and_scores
def del_doc_by_ids(self, ids: List[str]) -> bool:
def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
    """
    Fetch documents from the Elasticsearch index by id.

    Each id is looked up individually. A lookup that raises is logged and
    skipped, so the returned list may be shorter than ``ids``.

    :param ids: ids of the documents to fetch.
    :return: Documents built from the ``context`` and ``metadata`` fields of
        each hit's ``_source``.
    """
    documents = []
    for doc_id in ids:
        try:
            hit = self.es_client_python.get(index=self.index_name, id=doc_id)
            source = hit["_source"]
            document = Document(page_content=source["context"],
                                metadata=source["metadata"])
            documents.append(document)
        except Exception as e:
            logger.error(f"ES Docs Get Error! {e}")
    return documents
def del_doc_by_ids(self,ids: List[str]) -> bool:
print(f"es_kb_service del_doc_by_ids")
for doc_id in ids:
try:
self.es_client_python.delete(index=self.index_name,
@ -228,6 +245,7 @@ class ESKBService(KBService):
except Exception as e:
logger.error(f"ES Docs Delete Error! {e}")
def do_delete_doc(self, kb_file, **kwargs):
base_file_name = os.path.basename(kb_file.filepath)
if self.es_client_python.indices.exists(index=self.index_name):

View File

@ -36,10 +36,11 @@ class FaissKBService(KBService):
with self.load_vector_store().acquire() as vs:
return [vs.docstore._dict.get(id) for id in ids]
def del_doc_by_ids(self, ids: List[str]) -> bool:
def del_doc_by_ids(self, ids: List[str]) -> bool:
    """
    Delete the documents with the given ids from the loaded vector store.

    :param ids: ids of the documents to delete.
    :return: True once the store's ``delete`` call has completed; an
        exception raised by the store propagates to the caller.
    """
    with self.load_vector_store().acquire() as vs:
        vs.delete(ids)
    # The signature promises a bool; without an explicit return the method
    # yields None, which callers testing the result read as a failure.
    return True
def do_init(self):
self.vector_name = self.vector_name or self.embed_model
self.kb_path = self.get_kb_path()

View File

@ -51,6 +51,10 @@ def file_exists(kb: str, selected_rows: List) -> Tuple[str, str]:
return file_name, file_path
return "", ""
def get_limited_string(data, max_length, key='your_column_name'):
    """
    Return the value stored under ``key`` in ``data``, cut to ``max_length``.

    :param data: mapping to read from (anything with a ``get`` method).
    :param max_length: maximum number of characters to keep.
    :param key: name of the entry to read. Defaults to the placeholder
        ``'your_column_name'`` so existing two-argument calls keep working.
    :return: the value, truncated when it is longer than ``max_length``;
        an empty string when the entry is missing or None.
    """
    value = data.get(key) or ''
    # Slicing already leaves shorter values untouched, so no length check is needed.
    return value[:max_length]
def knowledge_base_page(api: ApiRequest, is_lite: bool = None):
try:
@ -315,53 +319,72 @@ def knowledge_base_page(api: ApiRequest, is_lite: bool = None):
time.sleep(1)
st.rerun()
# with st.sidebar:
# keyword = st.text_input("查询关键字")
# top_k = st.slider("匹配条数", 1, 100, 3)
with st.sidebar:
keyword = st.text_input("查询关键字")
top_k = st.slider("匹配条数", 1, 100, 3)
# st.write("文件内文档列表。双击进行修改,在删除列填入 Y 可删除对应行。")
# docs = []
# df = pd.DataFrame([], columns=["seq", "id", "content", "source"])
# if selected_rows:
# file_name = selected_rows[0]["file_name"]
# docs = api.search_kb_docs(knowledge_base_name=selected_kb, file_name=file_name)
# data = [
# {"seq": i + 1, "id": x["id"], "page_content": x["page_content"], "source": x["metadata"].get("source"),
# "type": x["type"],
# "metadata": json.dumps(x["metadata"], ensure_ascii=False),
# "to_del": "",
# } for i, x in enumerate(docs)]
# df = pd.DataFrame(data)
st.write("文件内文档列表。双击进行修改,在删除列填入 Y 可删除对应行。")
docs = []
df = pd.DataFrame([], columns=["seq", "id", "content", "source"])
if selected_rows:
file_name = selected_rows[0]["file_name"]
print(f"选中的file_name:{file_name},kb:{selected_kb}")
docs = api.search_kb_docs(knowledge_base_name=selected_kb, file_name=file_name)
print(f"选中的file_nameapi.search_kb_docs{docs}")
if isinstance(docs, list):
data = [
{"seq": i + 1, "id": x["id"], "page_content": x["page_content"], "source": x["metadata"].get("source"),
"type": x["type"],
"metadata": json.dumps(x["metadata"], ensure_ascii=False),
"to_del": "",
} for i, x in enumerate(docs)]
df = pd.DataFrame(data)
# # gb = GridOptionsBuilder.from_dataframe(df)
# # gb.configure_columns(["id", "source", "type", "metadata"], hide=True)
# # gb.configure_column("seq", "No.", width=50)
# # gb.configure_column("page_content", "内容", editable=True, autoHeight=True, wrapText=True, flex=1,
# # cellEditor="agLargeTextCellEditor", cellEditorPopup=True)
# # gb.configure_column("to_del", "删除", editable=True, width=50, wrapHeaderText=True,
# # cellEditor="agCheckboxCellEditor", cellRender="agCheckboxCellRenderer")
# # gb.configure_selection()
# # edit_docs = AgGrid(df, gb.build())
gb = GridOptionsBuilder.from_dataframe(df)
gb.configure_columns(["id", "source", "type", "metadata"], hide=True)
gb.configure_column("seq", "No.", width=50)
gb.configure_column("page_content", "内容", editable=True, autoHeight=True, wrapText=True, flex=1,
cellEditor="agLargeTextCellEditor", cellEditorPopup=True, autoWidth=True,cellEditorParams= { "maxLength": 1000})
gb.configure_column("to_del", "删除", editable=True, width=50, wrapHeaderText=True,
cellEditor="agCheckboxCellEditor", cellRender="agCheckboxCellRenderer")
gb.configure_selection()
edit_docs = AgGrid(df, gb.build())
# if st.button("保存更改"):
# origin_docs = {
# x["id"]: {"page_content": x["page_content"], "type": x["type"], "metadata": x["metadata"]} for x in
# docs}
# changed_docs = []
# for index, row in edit_docs.data.iterrows():
# origin_doc = origin_docs[row["id"]]
# if row["page_content"] != origin_doc["page_content"]:
# if row["to_del"] not in ["Y", "y", 1]:
# changed_docs.append({
# "page_content": row["page_content"],
# "type": row["type"],
# "metadata": json.loads(row["metadata"]),
# })
if st.button("保存更改"):
origin_docs = {
x["id"]: {"page_content": x["page_content"], "type": x["type"], "metadata": x["metadata"]} for x in
docs}
changed_docs = []
delete_docs_ids = []
for index, row in edit_docs.data.iterrows():
id_string = row["id"]
print(f"""edit_docs: index:{index},row[id]:{id_string}""")
origin_doc = origin_docs[row["id"]]
#if row["page_content"] != origin_doc["page_content"]:
if row["to_del"] not in ["Y", "y", 1]:
changed_docs.append({
"page_content": row["page_content"],
"type": row["type"],
"metadata": json.loads(row["metadata"]),
})
elif row["to_del"] in ["Y", "y", 1]:
delete_docs_ids.append(id_string)
print(f"""删除的文档id,row[id]:{id_string}, 文档内容:{row["page_content"]}""")
# if changed_docs:
# if api.update_kb_docs(knowledge_base_name=selected_kb,
# file_names=[file_name],
# docs={file_name: changed_docs}):
# st.toast("更新文档成功")
# else:
# st.toast("更新文档失败")
if changed_docs:
print(f"更新的文档有 长度:{len(changed_docs)}")
if api.update_kb_docs(knowledge_base_name=selected_kb,
file_names=[file_name],
docs={file_name: changed_docs}):
print("更新文档成功")
st.toast("更新文档成功")
else:
print("更新文档失败")
st.toast("更新文档失败")
if delete_docs_ids:
if api.delete_docs_by_ids(knowledge_base_name=selected_kb,file_name = file_name, ids= delete_docs_ids):
print("删除文档成功")
st.toast("删除文档成功")
else:
print("删除文档失败")
st.toast("删除文档失败")

View File

@ -596,6 +596,26 @@ class ApiRequest:
)
return self._get_response_value(response)
def delete_docs_by_ids(
    self,
    knowledge_base_name: str,
    file_name:str,
    ids: list[str],
) -> bool:
    '''
    Client wrapper for the /knowledge_base/delete_docs_by_ids endpoint.

    Posts the knowledge base name, the file name and the ids as a JSON body
    and returns whatever ``_get_response_value`` yields for the response.
    '''
    payload = dict(
        knowledge_base_name=knowledge_base_name,
        file_name=file_name,
        ids=ids,
    )
    return self._get_response_value(
        self.post("/knowledge_base/delete_docs_by_ids", json=payload)
    )
def upload_kb_docs(
self,
files: List[Union[str, Path, bytes]],