From 319475e0f6bdd487303828d9237c6790c5e24eee Mon Sep 17 00:00:00 2001 From: wvivi2023 Date: Thu, 21 Mar 2024 11:11:34 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BC=96=E8=BE=91=E5=88=86=E5=9D=97=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/api.py | 8 +- .../repository/knowledge_file_repository.py | 40 ++++++ server/knowledge_base/kb_doc_api.py | 56 ++++++--- server/knowledge_base/kb_service/base.py | 36 ++++-- .../kb_service/es_kb_service.py | 22 +++- .../kb_service/faiss_kb_service.py | 3 +- webui_pages/knowledge_base/knowledge_base.py | 115 +++++++++++------- webui_pages/utils.py | 20 +++ 8 files changed, 224 insertions(+), 76 deletions(-) diff --git a/server/api.py b/server/api.py index 669324f..21d07e3 100644 --- a/server/api.py +++ b/server/api.py @@ -144,7 +144,7 @@ def mount_knowledge_routes(app: FastAPI): from server.knowledge_base.kb_doc_api import (list_files, upload_docs, delete_docs, update_docs, download_doc, recreate_vector_store, search_docs, DocumentWithVSId, update_info, - update_docs_by_id,search_content) + update_docs_by_id,search_content,delete_docs_by_ids) app.post("/chat/knowledge_base_chat", tags=["Chat"], @@ -202,6 +202,12 @@ def mount_knowledge_routes(app: FastAPI): )(update_docs_by_id) + app.post("/knowledge_base/delete_docs_by_ids", + tags=["Knowledge Base Management"], + response_model=BaseResponse, + summary="根据ids删除知识库文档" + )(delete_docs_by_ids) + app.post("/knowledge_base/upload_docs", tags=["Knowledge Base Management"], response_model=BaseResponse, diff --git a/server/db/repository/knowledge_file_repository.py b/server/db/repository/knowledge_file_repository.py index 4388e7a..70d04e4 100644 --- a/server/db/repository/knowledge_file_repository.py +++ b/server/db/repository/knowledge_file_repository.py @@ -41,7 +41,46 @@ def delete_docs_from_db(session, session.commit() return docs +@with_session +def delete_docs_from_db_by_ids(session, + ids:List[str] +): + for id in ids: + query = session.query(FileDocModel).filter(FileDocModel.doc_id.ilike(id)) + query.delete(synchronize_session=False) + session.commit() + + return True +@with_session +def count_docs_from_db(session, kb_name: str,file_name:str) -> int: + docs = list_docs_from_db(kb_name=kb_name, file_name=file_name) + return len(docs) + #return session.query(FileDocModel).filter(KnowledgeFileModel.kb_name.ilike(kb_name)).count() + +@with_session +def update_file_to_db(session, + knowledge_base_name: str, + file_name:str): + kb = session.query(KnowledgeBaseModel).filter_by(kb_name=knowledge_base_name).first() + if kb: + # 如果已经存在该文件,则更新文件信息与版本号 + existing_file: KnowledgeFileModel = (session.query(KnowledgeFileModel) + .filter(KnowledgeFileModel.kb_name.ilike(knowledge_base_name), + KnowledgeFileModel.file_name.ilike(file_name)) + .first()) + + + if existing_file: + existing_file.file_version += 1 + count = count_docs_from_db(knowledge_base_name, file_name) + print(f"*****update_file_to_db 后count 是{count}") + existing_file.docs_count= count + print(f"******knowledge_file_repository 更新knowledge_file***existing_file name :{existing_file.file_name}") + else: + print(f"无效的kb") + + @with_session def add_docs_to_db(session, kb_name: str, @@ -101,6 +140,7 @@ def add_file_to_db(session, existing_file.docs_count = docs_count existing_file.custom_docs = custom_docs existing_file.file_version += 1 + print(f"******knowledge_file_repository 更新knowledge_file***existing_file name :{existing_file.file_name}") # 否则,添加新文件 else: new_file = KnowledgeFileModel( diff --git a/server/knowledge_base/kb_doc_api.py b/server/knowledge_base/kb_doc_api.py index dc7ba0f..a38f93a 100644 --- a/server/knowledge_base/kb_doc_api.py +++ b/server/knowledge_base/kb_doc_api.py @@ -137,7 +137,23 @@ def update_docs_by_id( else: return BaseResponse(msg=f"文档更新失败") - +def delete_docs_by_ids(knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]), + file_name:str = Body(..., description="文件名"), + ids: List[str] = Body(..., description="要更新的文档内容,形如:{id1,id2....}") + ) -> BaseResponse: + ''' + 按照文档 ID 删除文档内容 + ''' + kb = KBServiceFactory.get_service_by_name(knowledge_base_name) + if kb is None: + return BaseResponse(code=500, msg=f"指定的知识库 {knowledge_base_name} 不存在") + result1 = kb.del_doc_by_ids(ids) + result2 = kb.del_doc_by_ids_from_db(knowledge_base_name,file_name,ids) + if result1 and result2: + return BaseResponse(msg=f"文档删除成功") + else: + return BaseResponse(msg=f"文档删除失败") + def list_files( knowledge_base_name: str ) -> ListResponse: @@ -339,6 +355,7 @@ def update_docs( failed_files = {} kb_files = [] + print(f"111111 kb_doc_api update_docs file_name:{file_names},更新的doc 长度:{len(docs)}") # 生成需要加载docs的文件列表 for file_name in file_names: file_detail = get_file_detail(kb_name=knowledge_base_name, filename=file_name) @@ -346,33 +363,38 @@ def update_docs( if file_detail.get("custom_docs") and not override_custom_docs: continue if file_name not in docs: + print(f"****kb_doc_api update_docs file_name not in docs") try: kb_files.append(KnowledgeFile(filename=file_name, knowledge_base_name=knowledge_base_name)) + + # 从文件生成docs,并进行向量化。 + # 这里利用了KnowledgeFile的缓存功能,在多线程中加载Document,然后传给KnowledgeFile + for status, result in files2docs_in_thread(kb_files, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance): + if status: + print(f"kb_doc_api update_docs 文件生成docs并向量化,filename:{file_name}") + kb_name, file_name, new_docs = result + kb_file = KnowledgeFile(filename=file_name, + knowledge_base_name=knowledge_base_name) + kb_file.splited_docs = new_docs + kb.update_doc(kb_file, not_refresh_vs_cache=True) + else: + kb_name, file_name, error = result + failed_files[file_name] = error + except Exception as e: msg = f"加载文档 {file_name} 时出错:{e}" logger.error(f'{e.__class__.__name__}: {msg}', exc_info=e if log_verbose else None) failed_files[file_name] = msg - - # 从文件生成docs,并进行向量化。 - # 这里利用了KnowledgeFile的缓存功能,在多线程中加载Document,然后传给KnowledgeFile - for status, result in files2docs_in_thread(kb_files, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - zh_title_enhance=zh_title_enhance): - if status: - print(f"kb_doc_api update_docs 文件生成docs并向量化,filename:{file_name}") - kb_name, file_name, new_docs = result - kb_file = KnowledgeFile(filename=file_name, - knowledge_base_name=knowledge_base_name) - kb_file.splited_docs = new_docs - kb.update_doc(kb_file, not_refresh_vs_cache=True) else: - kb_name, file_name, error = result - failed_files[file_name] = error + print(f"****kb_doc_api update_docs file_name in docs") # 将自定义的docs进行向量化 for file_name, v in docs.items(): + print(f"222222 kb_doc_api update_docs file_name:{file_name},更新的doc 长度:{len(docs)}") try: print(f"kb_doc_api update_docs 自定义的docs 向量化,filename:{file_name}") v = [x if isinstance(x, Document) else Document(**x) for x in v] diff --git a/server/knowledge_base/kb_service/base.py b/server/knowledge_base/kb_service/base.py index b35a313..980422f 100644 --- a/server/knowledge_base/kb_service/base.py +++ b/server/knowledge_base/kb_service/base.py @@ -14,7 +14,7 @@ from server.db.repository.knowledge_base_repository import ( from server.db.repository.knowledge_file_repository import ( add_file_to_db, delete_file_from_db, delete_files_from_db, file_exists_in_db, count_files_from_db, list_files_from_db, get_file_detail, delete_file_from_db, - list_docs_from_db, + list_docs_from_db,delete_docs_from_db_by_ids,update_file_to_db ) from configs import (kbs_config, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, @@ -112,9 +112,11 @@ class KBService(ABC): custom_docs = True for doc in docs: doc.metadata.setdefault("source", kb_file.filename) + print(f"kb_doc_api add_doc docs 不为空,len(docs):{len(docs)}") else: docs = kb_file.file2text() custom_docs = False + print(f"kb_doc_api add_doc docs 为空,len(docs):{len(docs)}") if docs: # 将 metadata["source"] 改为相对路径 @@ -165,10 +167,10 @@ class KBService(ABC): 使用content中的文件更新向量库 如果指定了docs,则使用自定义docs,并将数据库对应条目标为custom_docs=True """ - if os.path.exists(kb_file.filepath): - print(f"{kb_file.filename} exists") + if os.path.exists(kb_file.filepath) and docs is None: self.delete_doc(kb_file, **kwargs) - return self.add_doc(kb_file, docs=docs, **kwargs) + + return self.add_doc(kb_file, docs=docs, **kwargs) def exist_doc(self, file_name: str): return file_exists_in_db(KnowledgeFile(knowledge_base_name=self.kb_name, @@ -209,6 +211,13 @@ class KBService(ABC): def del_doc_by_ids(self, ids: List[str]) -> bool: raise NotImplementedError + def del_doc_by_ids_from_db(self, knowledge_base_name: str , file_name:str, ids: List[str]) -> bool: + delete_docs_from_db_by_ids(ids) + update_file_to_db(knowledge_base_name = knowledge_base_name,file_name = file_name) + print(f"*******KBService del_doc_by_ids_from_db") + return True + + def update_doc_by_ids(self, docs: Dict[str, Document]) -> bool: ''' 传入参数为: {doc_id: Document, ...} @@ -230,16 +239,25 @@ class KBService(ABC): 通过file_name或metadata检索Document ''' doc_infos = list_docs_from_db(kb_name=self.kb_name, file_name=file_name, metadata=metadata) + print(f"kb_doc_api list_docs_from_db: {doc_infos}") docs = [] for x in doc_infos: - doc_info = self.get_doc_by_ids([x["id"]])[0] - if doc_info is not None: + doc_info = self.get_doc_by_ids([x["id"]]) + #print(f"kb_doc_api doc_info: {doc_info}") + #if doc_info is not None: + if doc_info is not None and isinstance(doc_info, list): + if doc_info: # 处理非空的情况 - doc_with_id = DocumentWithVSId(**doc_info.dict(), id=x["id"]) - docs.append(doc_with_id) + #data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id")) for x in docs] + doc_with_id = DocumentWithVSId(**doc_info[0].dict(), id=x["id"]) + docs.append(doc_with_id) + else: + # 处理 doc_info 为空列表的情况 + pass else: - # 处理空的情况 + # 处理 doc_info 是 NoneType 或者不是列表的情况 # 可以选择跳过当前循环迭代或执行其他操作 + print("base.py list_docs 返回为空") pass return docs diff --git a/server/knowledge_base/kb_service/es_kb_service.py b/server/knowledge_base/kb_service/es_kb_service.py index 4e46223..2ef1872 100644 --- a/server/knowledge_base/kb_service/es_kb_service.py +++ b/server/knowledge_base/kb_service/es_kb_service.py @@ -218,8 +218,25 @@ class ESKBService(KBService): for hit in hits ] return docs_and_scores - - def del_doc_by_ids(self, ids: List[str]) -> bool: + + def get_doc_by_ids(self, ids: List[str]) -> List[Document]: + result_list = [] + for doc_id in ids: + try: + result = self.es_client_python.get(index=self.index_name, + id=doc_id) + #print(f"es_kb_service:result:{result}") + result_list.append(Document( + page_content=result["_source"]["context"], + metadata=result["_source"]["metadata"], + )) + except Exception as e: + logger.error(f"ES Docs Get Error! {e}") + return result_list + + + def del_doc_by_ids(self,ids: List[str]) -> bool: + print(f"es_kb_service del_doc_by_ids") for doc_id in ids: try: self.es_client_python.delete(index=self.index_name, @@ -228,6 +245,7 @@ class ESKBService(KBService): except Exception as e: logger.error(f"ES Docs Delete Error! {e}") + def do_delete_doc(self, kb_file, **kwargs): base_file_name = os.path.basename(kb_file.filepath) if self.es_client_python.indices.exists(index=self.index_name): diff --git a/server/knowledge_base/kb_service/faiss_kb_service.py b/server/knowledge_base/kb_service/faiss_kb_service.py index 7519beb..0278b41 100644 --- a/server/knowledge_base/kb_service/faiss_kb_service.py +++ b/server/knowledge_base/kb_service/faiss_kb_service.py @@ -36,10 +36,11 @@ class FaissKBService(KBService): with self.load_vector_store().acquire() as vs: return [vs.docstore._dict.get(id) for id in ids] - def del_doc_by_ids(self, ids: List[str]) -> bool: + def del_doc_by_ids(self, ids: List[str]) -> bool: with self.load_vector_store().acquire() as vs: vs.delete(ids) + def do_init(self): self.vector_name = self.vector_name or self.embed_model self.kb_path = self.get_kb_path() diff --git a/webui_pages/knowledge_base/knowledge_base.py b/webui_pages/knowledge_base/knowledge_base.py index b536cd0..0dc02b2 100644 --- a/webui_pages/knowledge_base/knowledge_base.py +++ b/webui_pages/knowledge_base/knowledge_base.py @@ -51,6 +51,10 @@ def file_exists(kb: str, selected_rows: List) -> Tuple[str, str]: return file_name, file_path return "", "" +def get_limited_string(data, max_length): + value = data.get('your_column_name', '') + return value[:max_length] if len(value) > max_length else value + def knowledge_base_page(api: ApiRequest, is_lite: bool = None): try: @@ -315,53 +319,72 @@ def knowledge_base_page(api: ApiRequest, is_lite: bool = None): time.sleep(1) st.rerun() - # with st.sidebar: - # keyword = st.text_input("查询关键字") - # top_k = st.slider("匹配条数", 1, 100, 3) + with st.sidebar: + keyword = st.text_input("查询关键字") + top_k = st.slider("匹配条数", 1, 100, 3) - # st.write("文件内文档列表。双击进行修改,在删除列填入 Y 可删除对应行。") - # docs = [] - # df = pd.DataFrame([], columns=["seq", "id", "content", "source"]) - # if selected_rows: - # file_name = selected_rows[0]["file_name"] - # docs = api.search_kb_docs(knowledge_base_name=selected_kb, file_name=file_name) - # data = [ - # {"seq": i + 1, "id": x["id"], "page_content": x["page_content"], "source": x["metadata"].get("source"), - # "type": x["type"], - # "metadata": json.dumps(x["metadata"], ensure_ascii=False), - # "to_del": "", - # } for i, x in enumerate(docs)] - # df = pd.DataFrame(data) + st.write("文件内文档列表。双击进行修改,在删除列填入 Y 可删除对应行。") + docs = [] + df = pd.DataFrame([], columns=["seq", "id", "content", "source"]) + if selected_rows: + file_name = selected_rows[0]["file_name"] + print(f"选中的file_name:{file_name},kb:{selected_kb}") + docs = api.search_kb_docs(knowledge_base_name=selected_kb, file_name=file_name) + print(f"选中的file_name:api.search_kb_docs:{docs}") + if isinstance(docs, list): + data = [ + {"seq": i + 1, "id": x["id"], "page_content": x["page_content"], "source": x["metadata"].get("source"), + "type": x["type"], + "metadata": json.dumps(x["metadata"], ensure_ascii=False), + "to_del": "", + } for i, x in enumerate(docs)] + df = pd.DataFrame(data) - # # gb = GridOptionsBuilder.from_dataframe(df) - # # gb.configure_columns(["id", "source", "type", "metadata"], hide=True) - # # gb.configure_column("seq", "No.", width=50) - # # gb.configure_column("page_content", "内容", editable=True, autoHeight=True, wrapText=True, flex=1, - # # cellEditor="agLargeTextCellEditor", cellEditorPopup=True) - # # gb.configure_column("to_del", "删除", editable=True, width=50, wrapHeaderText=True, - # # cellEditor="agCheckboxCellEditor", cellRender="agCheckboxCellRenderer") - # # gb.configure_selection() - # # edit_docs = AgGrid(df, gb.build()) + gb = GridOptionsBuilder.from_dataframe(df) + gb.configure_columns(["id", "source", "type", "metadata"], hide=True) + gb.configure_column("seq", "No.", width=50) + gb.configure_column("page_content", "内容", editable=True, autoHeight=True, wrapText=True, flex=1, + cellEditor="agLargeTextCellEditor", cellEditorPopup=True, autoWidth=True,cellEditorParams= { "maxLength": 1000}) + gb.configure_column("to_del", "删除", editable=True, width=50, wrapHeaderText=True, + cellEditor="agCheckboxCellEditor", cellRender="agCheckboxCellRenderer") + gb.configure_selection() + edit_docs = AgGrid(df, gb.build()) - # if st.button("保存更改"): - # origin_docs = { - # x["id"]: {"page_content": x["page_content"], "type": x["type"], "metadata": x["metadata"]} for x in - # docs} - # changed_docs = [] - # for index, row in edit_docs.data.iterrows(): - # origin_doc = origin_docs[row["id"]] - # if row["page_content"] != origin_doc["page_content"]: - # if row["to_del"] not in ["Y", "y", 1]: - # changed_docs.append({ - # "page_content": row["page_content"], - # "type": row["type"], - # "metadata": json.loads(row["metadata"]), - # }) + if st.button("保存更改"): + origin_docs = { + x["id"]: {"page_content": x["page_content"], "type": x["type"], "metadata": x["metadata"]} for x in + docs} + changed_docs = [] + delete_docs_ids = [] + for index, row in edit_docs.data.iterrows(): + id_string = row["id"] + print(f"""edit_docs: index:{index},row[id]:{id_string}""") + origin_doc = origin_docs[row["id"]] + #if row["page_content"] != origin_doc["page_content"]: + if row["to_del"] not in ["Y", "y", 1]: + changed_docs.append({ + "page_content": row["page_content"], + "type": row["type"], + "metadata": json.loads(row["metadata"]), + }) + elif row["to_del"] in ["Y", "y", 1]: + delete_docs_ids.append(id_string) + print(f"""删除的文档id:,row[id]:{id_string}, 文档内容:{row["page_content"]}""") - # if changed_docs: - # if api.update_kb_docs(knowledge_base_name=selected_kb, - # file_names=[file_name], - # docs={file_name: changed_docs}): - # st.toast("更新文档成功") - # else: - # st.toast("更新文档失败") + if changed_docs: + print(f"更新的文档有 长度:{len(changed_docs)}") + if api.update_kb_docs(knowledge_base_name=selected_kb, + file_names=[file_name], + docs={file_name: changed_docs}): + print("更新文档成功") + st.toast("更新文档成功") + else: + print("更新文档失败") + st.toast("更新文档失败") + if delete_docs_ids: + if api.delete_docs_by_ids(knowledge_base_name=selected_kb,file_name = file_name, ids= delete_docs_ids): + print("删除文档成功") + st.toast("删除文档成功") + else: + print("删除文档失败") + st.toast("删除文档失败") diff --git a/webui_pages/utils.py b/webui_pages/utils.py index 95c4ef0..e57478c 100644 --- a/webui_pages/utils.py +++ b/webui_pages/utils.py @@ -596,6 +596,26 @@ class ApiRequest: ) return self._get_response_value(response) + def delete_docs_by_ids( + self, + knowledge_base_name: str, + file_name:str, + ids: list[str], + ) -> bool: + ''' + 对应api.py/knowledge_base/delete_doc_by_ids接口 + ''' + data = { + "knowledge_base_name": knowledge_base_name, + "file_name":file_name, + "ids": ids, + } + response = self.post( + "/knowledge_base/delete_docs_by_ids", + json=data + ) + return self._get_response_value(response) + def upload_kb_docs( self, files: List[Union[str, Path, bytes]],