add tools to migrate knowledge base
This commit is contained in:
parent
3c44cf65cd
commit
a261fda20b
10
README.md
10
README.md
|
|
@ -45,6 +45,16 @@
|
||||||
|
|
||||||
参见 [版本更新日志](https://github.com/imClumsyPanda/langchain-ChatGLM/releases)。
|
参见 [版本更新日志](https://github.com/imClumsyPanda/langchain-ChatGLM/releases)。
|
||||||
|
|
||||||
|
## 旧版本升级与知识库迁移
|
||||||
|
|
||||||
|
从`0.1.x`升级过来的用户请注意,在完成“开发部署 3 设置配置项”之后,需要将现有知识库迁移到新格式:
|
||||||
|
```shell
|
||||||
|
# 如果知识库已经建立,且知识库类型、嵌入模型无变化,只需以下命令将现有知识库信息添加到数据库即可。
|
||||||
|
$ python init_database.py
|
||||||
|
|
||||||
|
# 如果知识库未建立,或者配置文件中的知识库类型、嵌入模型发生变化,需要以下命令进行知识库迁移。
|
||||||
|
$ python init_database.py --recreate-vs
|
||||||
|
```
|
||||||
## 模型支持
|
## 模型支持
|
||||||
|
|
||||||
本项目中默认使用的 LLM 模型为 [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b),默认使用的 Embedding 模型为 [moka-ai/m3e-base](https://huggingface.co/moka-ai/m3e-base)。
|
本项目中默认使用的 LLM 模型为 [THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b),默认使用的 Embedding 模型为 [moka-ai/m3e-base](https://huggingface.co/moka-ai/m3e-base)。
|
||||||
|
|
|
||||||
|
|
@ -104,6 +104,9 @@ KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowled
|
||||||
DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db")
|
DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db")
|
||||||
SQLALCHEMY_DATABASE_URI = f"sqlite:///{DB_ROOT_PATH}"
|
SQLALCHEMY_DATABASE_URI = f"sqlite:///{DB_ROOT_PATH}"
|
||||||
|
|
||||||
|
# Default vector store type. Options: faiss, milvus, pg.
|
||||||
|
DEFAULT_VS_TYPE = "faiss"
|
||||||
|
|
||||||
# Number of vector stores to cache
|
# Number of vector stores to cache
|
||||||
CACHED_VS_NUM = 1
|
CACHED_VS_NUM = 1
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,31 @@
|
||||||
|
from server.knowledge_base.migrate import create_tables, folder2db, recreate_all_vs, list_kbs_from_folder
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: create the database tables, then either rebuild
    # every vector store (--recreate-vs) or only register the existing
    # knowledge base files in the database.
    import argparse

    # RawTextHelpFormatter keeps the explicit line breaks of the help text below.
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "--recreate-vs",
        action="store_true",
        help=(
            "recreate all vector store.\n"
            "use this option if you have copied document files to the content folder, "
            "but vector store has not been populated or DEFAULT_VS_TYPE/EMBEDDING_MODEL changed.\n"
            "if your vector store is ready with the configs, just skip this option "
            "to fill info to database only."
        ),
    )
    args = parser.parse_args()

    create_tables()
    print("database tables created")

    if args.recreate_vs:
        print("recreating all vector stores")
        # `mode` is passed explicitly: recreate_all_vs forwards it to folder2db,
        # which only rebuilds the vector store in "recreate_vs" mode.
        recreate_all_vs("recreate_vs")
    else:
        print("filling kb infos to database")
        for kb in list_kbs_from_folder():
            folder2db(kb, "fill_info_only")
|
||||||
|
|
@ -0,0 +1,110 @@
|
||||||
|
from configs.model_config import EMBEDDING_MODEL, DEFAULT_VS_TYPE
|
||||||
|
from server.knowledge_base.utils import get_file_path, list_kbs_from_folder, list_docs_from_folder, KnowledgeFile
|
||||||
|
from server.knowledge_base.kb_service.base import KBServiceFactory
|
||||||
|
from server.db.repository.knowledge_file_repository import add_doc_to_db
|
||||||
|
from server.db.base import Base, engine
|
||||||
|
import os
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
|
||||||
|
def create_tables():
    '''
    create every table registered on `Base.metadata`, using the `engine`
    imported from server.db.base.
    '''
    metadata = Base.metadata
    metadata.create_all(bind=engine)
|
||||||
|
|
||||||
|
|
||||||
|
def reset_tables():
    '''
    drop every table registered on `Base.metadata`, then build them again
    through create_tables().
    '''
    metadata = Base.metadata
    metadata.drop_all(bind=engine)
    create_tables()
|
||||||
|
|
||||||
|
|
||||||
|
def folder2db(
    kb_name: str,
    mode: Literal["recreate_vs", "fill_info_only", "update_in_db", "increament"],
    vs_type: Literal["faiss", "milvus", "pg", "chromadb"] = DEFAULT_VS_TYPE,
    embed_model: str = EMBEDDING_MODEL,
):
    '''
    use existing files in the local folder to populate database and/or vector store.

    set parameter `mode` to:
        recreate_vs: clear the vector store, then add every file found in the local folder
        fill_info_only: do not touch the vector store, only add info of local files to the database
        update_in_db: update vector store and database info for the files already listed in the database
        increament: add only the local files that are not yet listed in the database

    a failure on a single file is reported with print() and does not stop the
    remaining files from being processed.

    raises ValueError if `mode` is not one of the values above. the check runs
    before the knowledge base is created, so an invalid call has no side effects.
    '''
    supported_modes = ("recreate_vs", "fill_info_only", "update_in_db", "increament")
    if mode not in supported_modes:
        raise ValueError(f"unsupported migrate mode: {mode}")

    kb = KBServiceFactory.get_service(kb_name, vs_type, embed_model)
    kb.create_kb()

    # pick the list of files and the per-file action once, so that all modes
    # share a single processing loop.
    if mode == "recreate_vs":
        kb.clear_vs()
        docs = list_docs_from_folder(kb_name)
        action = kb.add_doc
    elif mode == "fill_info_only":
        docs = list_docs_from_folder(kb_name)
        action = add_doc_to_db
    elif mode == "update_in_db":
        docs = kb.list_docs()
        action = kb.update_doc
    else:  # "increament"
        db_docs = set(kb.list_docs())
        folder_docs = list_docs_from_folder(kb_name)
        # dict.fromkeys de-duplicates while keeping the folder order, so files
        # are processed in a deterministic order.
        docs = [doc for doc in dict.fromkeys(folder_docs) if doc not in db_docs]
        action = kb.add_doc

    for doc in docs:
        try:
            action(KnowledgeFile(doc, kb_name))
        except Exception as e:
            # best effort: report which file failed and carry on with the rest.
            print(f"failed to process {doc} in knowledge base {kb_name}: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def recreate_all_vs(
    mode: Literal["recreate_vs", "fill_info_only", "update_in_db", "increament"] = "recreate_vs",
    vs_type: Literal["faiss", "milvus", "pg", "chromadb"] = DEFAULT_VS_TYPE,
    embed_mode: str = EMBEDDING_MODEL,
):
    '''
    run folder2db() for every knowledge base returned by list_kbs_from_folder().

    used to recreate the vector stores or to switch them to another vector
    store type or embedding model.

    `mode` defaults to "recreate_vs" so that a bare `recreate_all_vs()` call
    does what the function name says; any other folder2db mode may still be
    passed. `embed_mode` is handed to folder2db() as its `embed_model` argument.
    '''
    for kb_name in list_kbs_from_folder():
        folder2db(kb_name, mode, vs_type, embed_mode)
|
||||||
|
|
||||||
|
|
||||||
|
def prune_db_docs(kb_name: str):
    '''
    delete docs in the database that no longer exist in the local folder.
    it is used to clean up database docs after the user deleted some doc files
    in a file browser.

    returns the list of doc names that were deleted from the knowledge base,
    in the order the database listed them. the list is empty when the
    knowledge base does not exist.
    '''
    kb = KBServiceFactory.get_service_by_name(kb_name)
    if not kb.exists():
        # nothing to prune; return an empty list so callers always get a list.
        return []

    docs_in_db = kb.list_docs()
    docs_in_folder = set(list_docs_from_folder(kb_name))
    # dict.fromkeys de-duplicates while keeping the database order.
    docs = [doc for doc in dict.fromkeys(docs_in_db) if doc not in docs_in_folder]
    for doc in docs:
        kb.delete_doc(KnowledgeFile(doc, kb_name))
    return docs
|
||||||
|
|
||||||
|
def prune_folder_docs(kb_name: str):
    '''
    delete doc files in the local folder that are not listed in the database.
    it is used to free local disk space by deleting unused doc files.

    returns the list of doc names whose files were removed, in the order the
    folder listed them. the list is empty when the knowledge base does not
    exist.
    '''
    kb = KBServiceFactory.get_service_by_name(kb_name)
    if not kb.exists():
        # nothing to prune; return an empty list so callers always get a list.
        return []

    docs_in_db = set(kb.list_docs())
    docs_in_folder = list_docs_from_folder(kb_name)
    # dict.fromkeys de-duplicates while keeping the folder order.
    docs = [doc for doc in dict.fromkeys(docs_in_folder) if doc not in docs_in_db]
    for doc in docs:
        os.remove(get_file_path(kb_name, doc))
    return docs
|
||||||
Loading…
Reference in New Issue