From a261fda20b06a37e5e4a236bed67646b3ade6080 Mon Sep 17 00:00:00 2001 From: liunux4odoo Date: Fri, 11 Aug 2023 08:33:05 +0800 Subject: [PATCH] add tools to migrate knowledge base --- README.md | 10 +++ configs/model_config.py.example | 3 + init_database.py | 31 +++++++++ server/knowledge_base/migrate.py | 110 +++++++++++++++++++++++++++++++ 4 files changed, 154 insertions(+) create mode 100644 init_database.py create mode 100644 server/knowledge_base/migrate.py diff --git a/README.md b/README.md index 73bce4c..8395ef0 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,16 @@ 参见 [版本更新日志](https://github.com/imClumsyPanda/langchain-ChatGLM/releases)。 +## 旧版本升级与知识库迁移 + +从`0.1.x`升级过来的用户请注意,在完成“开发部署 3 设置配置项”之后,需要将现有知识库迁移到新格式: +```shell +# 如果知识库已经建立,且知识库类型、嵌入模型无变化,只需以下命令将现有知识库信息添加到数据库即可。 +$ python init_database.py + +# 如果知识库未建立,或者配置文件中的知识库类型、嵌入模型发生变化,需要以下命令进行知识库迁移。 +$ python init_database.py --recreate-vs +``` ## 模型支持 本项目中默认使用的 LLM 模型为 [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b),默认使用的 Embedding 模型为 [moka-ai/m3e-base](https://huggingface.co/moka-ai/m3e-base) 为例。 diff --git a/configs/model_config.py.example b/configs/model_config.py.example index f893d57..e2cdda7 100644 --- a/configs/model_config.py.example +++ b/configs/model_config.py.example @@ -104,6 +104,9 @@ KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowled DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db") SQLALCHEMY_DATABASE_URI = f"sqlite:///{DB_ROOT_PATH}" +# 默认向量库类型。可选:faiss, milvus, pg. 
+DEFAULT_VS_TYPE = "faiss"
+
 # 缓存向量库数量
 CACHED_VS_NUM = 1
 
diff --git a/init_database.py b/init_database.py
new file mode 100644
index 0000000..2d3f67a
--- /dev/null
+++ b/init_database.py
@@ -0,0 +1,35 @@
+from server.knowledge_base.migrate import create_tables, folder2db, recreate_all_vs, list_kbs_from_folder
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    # RawTextHelpFormatter keeps the explicit line breaks of the help text below.
+    parser.formatter_class = argparse.RawTextHelpFormatter
+
+    parser.add_argument(
+        "--recreate-vs",
+        action="store_true",
+        help=(
+            "recreate all vector store.\n"
+            "use this option if you have copied document files to the content folder, "
+            "but vector store has not been populated or DEFAULT_VS_TYPE/EMBEDDING_MODEL changed.\n"
+            "if your vector store is ready with the configs, just skip this option "
+            "to fill info to database only."
+        ),
+    )
+    args = parser.parse_args()
+
+    # Make sure the tables exist before any knowledge base info is written.
+    create_tables()
+    print("database tables created")
+
+    if args.recreate_vs:
+        print("recreating all vector stores")
+        # Name the mode explicitly: rebuild every vector store from the local folders.
+        recreate_all_vs("recreate_vs")
+    else:
+        print("filling kb infos to database")
+        for kb in list_kbs_from_folder():
+            folder2db(kb, "fill_info_only")
diff --git a/server/knowledge_base/migrate.py b/server/knowledge_base/migrate.py
new file mode 100644
index 0000000..721862a
--- /dev/null
+++ b/server/knowledge_base/migrate.py
@@ -0,0 +1,130 @@
+from configs.model_config import EMBEDDING_MODEL, DEFAULT_VS_TYPE
+from server.knowledge_base.utils import get_file_path, list_kbs_from_folder, list_docs_from_folder, KnowledgeFile
+from server.knowledge_base.kb_service.base import KBServiceFactory
+from server.db.repository.knowledge_file_repository import add_doc_to_db
+from server.db.base import Base, engine
+import os
+from typing import Literal
+
+
+def create_tables():
+    '''
+    create the tables declared on `Base` in the configured database.
+    '''
+    Base.metadata.create_all(bind=engine)
+
+
+def reset_tables():
+    '''
+    drop the tables declared on `Base` and create them again.
+    '''
+    Base.metadata.drop_all(bind=engine)
+    create_tables()
+
+
+def folder2db(
+    kb_name: str,
+    mode: Literal["recreate_vs", "fill_info_only", "update_in_db", "increament"],
+    vs_type: Literal["faiss", "milvus", "pg", "chromadb"] = DEFAULT_VS_TYPE,
+    embed_model: str = EMBEDDING_MODEL,
+):
+    '''
+    use existed files in local folder to populate database and/or vector store.
+    set parameter `mode` to:
+        recreate_vs: recreate all vector store and fill info to database using existed files in local folder
+        fill_info_only: do not create vector store, fill info to db using existed files only
+        update_in_db: update vector store and database info using local files that existed in database only
+        increament: create vector store and database info for local files that not existed in database only
+    an error raised for a single file is printed and the remaining files are still processed.
+    raise ValueError if `mode` is none of the values above; nothing is created in that case.
+    '''
+    # reject a bad mode first, so that it can not leave a freshly created knowledge base behind.
+    # "increament" keeps its historical spelling because callers pass it as a literal.
+    if mode not in ("recreate_vs", "fill_info_only", "update_in_db", "increament"):
+        raise ValueError(f"unsupported migrate mode: {mode}")
+
+    kb = KBServiceFactory.get_service(kb_name, vs_type, embed_model)
+    kb.create_kb()
+
+    if mode == "recreate_vs":
+        kb.clear_vs()
+        for doc in list_docs_from_folder(kb_name):
+            try:
+                kb_file = KnowledgeFile(doc, kb_name)
+                kb.add_doc(kb_file)
+            except Exception as e:
+                print(e)
+    elif mode == "fill_info_only":
+        for doc in list_docs_from_folder(kb_name):
+            try:
+                kb_file = KnowledgeFile(doc, kb_name)
+                add_doc_to_db(kb_file)
+            except Exception as e:
+                print(e)
+    elif mode == "update_in_db":
+        for doc in kb.list_docs():
+            try:
+                kb_file = KnowledgeFile(doc, kb_name)
+                kb.update_doc(kb_file)
+            except Exception as e:
+                print(e)
+    else:  # mode == "increament"
+        # only files that are in the folder but not yet in the database.
+        db_docs = kb.list_docs()
+        folder_docs = list_docs_from_folder(kb_name)
+        docs = list(set(folder_docs) - set(db_docs))
+        for doc in docs:
+            try:
+                kb_file = KnowledgeFile(doc, kb_name)
+                kb.add_doc(kb_file)
+            except Exception as e:
+                print(e)
+
+
+def recreate_all_vs(
+    mode: Literal["recreate_vs", "fill_info_only", "update_in_db", "increament"] = "recreate_vs",
+    vs_type: Literal["faiss", "milvus", "pg", "chromadb"] = DEFAULT_VS_TYPE,
+    embed_mode: str = EMBEDDING_MODEL,
+):
+    '''
+    used to recreate a vector store or change current vector store to another type or embed_model.
+    run `folder2db` with the given arguments for every knowledge base found by `list_kbs_from_folder`.
+    `mode` defaults to "recreate_vs", so calling `recreate_all_vs()` without arguments works.
+    `embed_mode` is the embedding model name; it is passed to `folder2db` as `embed_model`.
+    '''
+    for kb_name in list_kbs_from_folder():
+        folder2db(kb_name, mode, vs_type, embed_mode)
+
+
+def prune_db_docs(kb_name: str):
+    '''
+    delete docs in database that not existed in local folder.
+    it is used to delete database docs after user deleted some doc files in file browser.
+    return the list of deleted docs, empty if the knowledge base does not exist.
+    '''
+    docs = []
+    kb = KBServiceFactory.get_service_by_name(kb_name)
+    if kb.exists():
+        docs_in_db = kb.list_docs()
+        docs_in_folder = list_docs_from_folder(kb_name)
+        docs = list(set(docs_in_db) - set(docs_in_folder))
+        for doc in docs:
+            kb.delete_doc(KnowledgeFile(doc, kb_name))
+    return docs
+
+
+def prune_folder_docs(kb_name: str):
+    '''
+    delete doc files in local folder that not existed in database.
+    it is used to free local disk space by delete unused doc files.
+    return the list of deleted files, empty if the knowledge base does not exist.
+    '''
+    docs = []
+    kb = KBServiceFactory.get_service_by_name(kb_name)
+    if kb.exists():
+        docs_in_db = kb.list_docs()
+        docs_in_folder = list_docs_from_folder(kb_name)
+        docs = list(set(docs_in_folder) - set(docs_in_db))
+        for doc in docs:
+            os.remove(get_file_path(kb_name, doc))
+    return docs