From 5d1a0ec15dc1de2fdf031aa9900c818f4f264dc2 Mon Sep 17 00:00:00 2001 From: liunux4odoo Date: Fri, 4 Aug 2023 20:26:14 +0800 Subject: [PATCH] add recreate_vector_store to kb_doc_api, and the router in api.py. --- server/api.py | 6 +++++- server/knowledge_base/kb_doc_api.py | 32 ++++++++++++++++++++++++++++- webui_pages/utils.py | 29 ++++++++++++++++++++++++++-- 3 files changed, 63 insertions(+), 4 deletions(-) diff --git a/server/api.py b/server/api.py index 350843f..d030d46 100644 --- a/server/api.py +++ b/server/api.py @@ -11,7 +11,8 @@ from starlette.responses import RedirectResponse from server.chat import (chat, knowledge_base_chat, openai_chat, search_engine_chat) from server.knowledge_base import (list_kbs, create_kb, delete_kb, - list_docs, upload_doc, delete_doc, update_doc) + list_docs, upload_doc, delete_doc, + update_doc, recreate_vector_store) from server.utils import BaseResponse, ListResponse nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path @@ -97,6 +98,9 @@ def create_app(): response_model=BaseResponse, summary="上传文件到知识库,并删除另一个文件" )(update_doc) + app.post("/knowledge_base/recreate_vector_store", + summary="根据content中文档重建向量库,流式输出处理进度。" + )(recreate_vector_store) return app app = create_app() diff --git a/server/knowledge_base/kb_doc_api.py b/server/knowledge_base/kb_doc_api.py index 2773143..06601f9 100644 --- a/server/knowledge_base/kb_doc_api.py +++ b/server/knowledge_base/kb_doc_api.py @@ -5,7 +5,10 @@ from fastapi import File, Form, UploadFile from server.utils import BaseResponse, ListResponse from server.knowledge_base.utils import (validate_kb_name, get_kb_path, get_doc_path, get_file_path, file2text, docs2vs, - refresh_vs_cache, ) + refresh_vs_cache, get_vs_path, ) +from fastapi.responses import StreamingResponse +import json +import shutil async def list_docs(knowledge_base_name: str): @@ -98,3 +101,30 @@ async def update_doc(): async def download_doc(): # TODO: 下载文件 pass + + +async def recreate_vector_store(knowledge_base_name: str): + ''' 
+ Recreate the vector store from the files in the content folder, streaming one JSON progress record per processed file ("total" is the number of files, "finished" the count done so far, "doc" the current file name). + This is useful when the user copies files into the content folder directly instead of uploading them through the network. + ''' + async def output(kb): + vs_path = get_vs_path(kb) + if os.path.isdir(vs_path): + shutil.rmtree(vs_path) + os.mkdir(vs_path) + print(f"start to recreate vectore in {vs_path}") + + docs = (await list_docs(kb)).data + for i, filename in enumerate(docs): + filepath = get_file_path(kb, filename) + print(f"processing {filepath} to vector store.") + file_docs = file2text(filepath) + docs2vs(file_docs, kb) + yield json.dumps({ + "total": len(docs), + "finished": i + 1, + "doc": filename, + }) + + return StreamingResponse(output(knowledge_base_name), media_type="text/event-stream") diff --git a/webui_pages/utils.py b/webui_pages/utils.py index 8388e25..5745fc5 100644 --- a/webui_pages/utils.py +++ b/webui_pages/utils.py @@ -444,9 +444,31 @@ class ApiRequest: ) return response.json() + def recreate_vector_store( + self, + knowledge_base_name: str, + no_remote_api: bool = None, + ): + ''' + Corresponds to the api.py /knowledge_base/recreate_vector_store endpoint. + ''' + if no_remote_api is None: + no_remote_api = self.no_remote_api + + if no_remote_api: + from server.knowledge_base.kb_doc_api import recreate_vector_store + response = run_async(recreate_vector_store(knowledge_base_name)) + return self._fastapi_stream2generator(response, as_json=True) + else: + response = self.post( + "/knowledge_base/recreate_vector_store", + json={"knowledge_base_name": knowledge_base_name}, + ) + return self._httpx_stream2generator(response, as_json=True) + if __name__ == "__main__": - api = ApiRequest() + api = ApiRequest(no_remote_api=True) # print(api.chat_fastchat( # messages=[{"role": "user", "content": "hello"}] @@ -464,4 +486,7 @@ if __name__ == "__main__": # for t in r: # print(t) - print(api.list_knowledge_bases()) + # print(api.list_knowledge_bases()) + + for t in api.recreate_vector_store('kblog'): + print(t)