From 16d8809c9a32b7111016ee08ea48eafd95057552 Mon Sep 17 00:00:00 2001 From: liunux4odoo <41217877+liunux4odoo@users.noreply.github.com> Date: Wed, 13 Sep 2023 11:19:47 +0800 Subject: [PATCH] =?UTF-8?q?api=E5=92=8Cwebui=E7=9F=A5=E8=AF=86=E5=BA=93?= =?UTF-8?q?=E6=93=8D=E4=BD=9C=E6=94=AF=E6=8C=81chunk=5Fsize/overlap=5Fsize?= =?UTF-8?q?/zh=5Ftitle=5Fenhance=E5=8F=82=E6=95=B0=20(#1459)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- configs/model_config.py.example | 4 +++ server/knowledge_base/kb_doc_api.py | 25 +++++++++++++-- server/knowledge_base/migrate.py | 21 ++++++++----- server/knowledge_base/utils.py | 14 ++++++--- webui_pages/knowledge_base/knowledge_base.py | 32 +++++++++++++++----- webui_pages/utils.py | 21 +++++++++++++ 6 files changed, 95 insertions(+), 22 deletions(-) diff --git a/configs/model_config.py.example b/configs/model_config.py.example index d5b624b..a552441 100644 --- a/configs/model_config.py.example +++ b/configs/model_config.py.example @@ -104,6 +104,10 @@ LLM_MODEL = "chatglm2-6b" # 历史对话轮数 HISTORY_LEN = 3 +# LLM通用对话参数 +TEMPERATURE = 0.7 +# TOP_P = 0.95 # ChatOpenAI暂不支持该参数 + # LLM 运行设备。设为"auto"会自动检测,也可手动设定为"cuda","mps","cpu"其中之一。 LLM_DEVICE = "auto" diff --git a/server/knowledge_base/kb_doc_api.py b/server/knowledge_base/kb_doc_api.py index 9074415..02ad222 100644 --- a/server/knowledge_base/kb_doc_api.py +++ b/server/knowledge_base/kb_doc_api.py @@ -3,6 +3,7 @@ import urllib from fastapi import File, Form, Body, Query, UploadFile from configs.model_config import (DEFAULT_VS_TYPE, EMBEDDING_MODEL, VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD, + CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE, logger, log_verbose,) from server.utils import BaseResponse, ListResponse, run_in_thread_pool from server.knowledge_base.utils import (validate_kb_name, list_files_from_folder,get_file_path, @@ -121,6 +122,9 @@ def upload_docs(files: List[UploadFile] = File(..., description="上传文件, knowledge_base_name: str = Form(..., description="知识库名称", examples=["samples"]), override: bool = Form(False, description="覆盖已有文件"), to_vector_store: bool = Form(True, description="上传文件后是否进行向量化"), + chunk_size: int = Body(CHUNK_SIZE, description="知识库中单段文本最大长度"), + chunk_overlap: int = Body(OVERLAP_SIZE, description="知识库中相邻文本重合长度"), + zh_title_enhance: bool = Body(ZH_TITLE_ENHANCE, description="是否开启中文标题加强"), docs: Json = Form({}, description="自定义的docs", examples=[{"test.txt": [Document(page_content="custom doc")]}]), not_refresh_vs_cache: bool = Form(False, description="暂不保存向量库(用于FAISS)"), ) -> BaseResponse: @@ -152,6 +156,9 @@ def upload_docs(files: List[UploadFile] = File(..., description="上传文件, knowledge_base_name=knowledge_base_name, file_names=file_names, override_custom_docs=True, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance, docs=docs, not_refresh_vs_cache=True, ) @@ -199,6 +206,9 @@ def delete_docs(knowledge_base_name: str = Body(..., examples=["samples"]), def update_docs( knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]), file_names: List[str] = Body(..., description="文件名称,支持多文件", examples=["file_name"]), + chunk_size: int = Body(CHUNK_SIZE, description="知识库中单段文本最大长度"), + chunk_overlap: int = Body(OVERLAP_SIZE, description="知识库中相邻文本重合长度"), + zh_title_enhance: bool = Body(ZH_TITLE_ENHANCE, description="是否开启中文标题加强"), override_custom_docs: bool = Body(False, description="是否覆盖之前自定义的docs"), docs: Json = Body({}, description="自定义的docs", examples=[{"test.txt": [Document(page_content="custom doc")]}]), not_refresh_vs_cache: bool = Body(False, description="暂不保存向量库(用于FAISS)"), @@ -233,7 +243,10 @@ def update_docs( # 从文件生成docs,并进行向量化。 # 这里利用了KnowledgeFile的缓存功能,在多线程中加载Document,然后传给KnowledgeFile - for status, result in files2docs_in_thread(kb_files): + for status, result in files2docs_in_thread(kb_files, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance): if status: kb_name, file_name, new_docs = result kb_file = KnowledgeFile(filename=file_name, @@ -307,7 +320,10 @@ def recreate_vector_store( allow_empty_kb: bool = Body(True), vs_type: str = Body(DEFAULT_VS_TYPE), embed_model: str = Body(EMBEDDING_MODEL), - ): + chunk_size: int = Body(CHUNK_SIZE, description="知识库中单段文本最大长度"), + chunk_overlap: int = Body(OVERLAP_SIZE, description="知识库中相邻文本重合长度"), + zh_title_enhance: bool = Body(ZH_TITLE_ENHANCE, description="是否开启中文标题加强"), +): ''' recreate vector store from the content. this is usefull when user can copy files to content folder directly instead of upload through network. @@ -325,7 +341,10 @@ def recreate_vector_store( files = list_files_from_folder(knowledge_base_name) kb_files = [(file, knowledge_base_name) for file in files] i = 0 - for status, result in files2docs_in_thread(kb_files): + for status, result in files2docs_in_thread(kb_files, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance): if status: kb_name, file_name, docs = result kb_file = KnowledgeFile(filename=file_name, knowledge_base_name=kb_name) diff --git a/server/knowledge_base/migrate.py b/server/knowledge_base/migrate.py index 893c37d..896d9b4 100644 --- a/server/knowledge_base/migrate.py +++ b/server/knowledge_base/migrate.py @@ -1,4 +1,6 @@ -from configs.model_config import EMBEDDING_MODEL, DEFAULT_VS_TYPE, logger, log_verbose +from configs.model_config import (EMBEDDING_MODEL, DEFAULT_VS_TYPE, + CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE, + logger, log_verbose) from server.knowledge_base.utils import (get_file_path, list_kbs_from_folder, list_files_from_folder,files2docs_in_thread, KnowledgeFile,) @@ -6,13 +8,9 @@ from server.knowledge_base.kb_service.base import KBServiceFactory, SupportedVST from server.db.repository.knowledge_file_repository import add_file_to_db from server.db.base import Base, engine import os -from concurrent.futures import ThreadPoolExecutor from typing import Literal, Any, List -pool = ThreadPoolExecutor(os.cpu_count()) - - def create_tables(): Base.metadata.create_all(bind=engine) @@ -40,6 +38,9 @@ def folder2db( mode: Literal["recreate_vs", "fill_info_only", "update_in_db", "increament"], vs_type: Literal["faiss", "milvus", "pg", "chromadb"] = DEFAULT_VS_TYPE, embed_model: str = EMBEDDING_MODEL, + chunk_size: int = CHUNK_SIZE, + chunk_overlap: int = OVERLAP_SIZE, + zh_title_enhance: bool = ZH_TITLE_ENHANCE, ): ''' use existed files in local folder to populate database and/or vector store. @@ -60,7 +61,10 @@ def folder2db( print(f"清理后,知识库 {kb_name} 中共有 {files_count} 个文档。") kb_files = file_to_kbfile(kb_name, list_files_from_folder(kb_name)) - for success, result in files2docs_in_thread(kb_files, pool=pool): + for success, result in files2docs_in_thread(kb_files, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance): if success: _, filename, docs = result print(f"正在将 {kb_name}/{filename} 添加到向量库,共包含{len(docs)}条文档") @@ -89,7 +93,10 @@ def folder2db( files = list(set(folder_files) - set(db_files)) kb_files = file_to_kbfile(kb_name, files) - for success, result in files2docs_in_thread(kb_files, pool=pool): + for success, result in files2docs_in_thread(kb_files, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance): if success: _, filename, docs = result print(f"正在将 {kb_name}/{filename} 添加到向量库") diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index f822035..e8b9103 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -237,7 +237,7 @@ class KnowledgeFile: def docs2texts( self, docs: List[Document] = None, - using_zh_title_enhance=ZH_TITLE_ENHANCE, + zh_title_enhance: bool = ZH_TITLE_ENHANCE, refresh: bool = False, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = OVERLAP_SIZE, @@ -252,14 +252,14 @@ class KnowledgeFile: docs = text_splitter.split_documents(docs) print(f"文档切分示例:{docs[0]}") - if using_zh_title_enhance: + if zh_title_enhance: docs = zh_title_enhance(docs) self.splited_docs = docs return self.splited_docs def file2text( self, - using_zh_title_enhance=ZH_TITLE_ENHANCE, + zh_title_enhance: bool = ZH_TITLE_ENHANCE, refresh: bool = False, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = OVERLAP_SIZE, @@ -268,7 +268,7 @@ class KnowledgeFile: if self.splited_docs is None or refresh: docs = self.file2docs() self.splited_docs = self.docs2texts(docs=docs, - using_zh_title_enhance=using_zh_title_enhance, + zh_title_enhance=zh_title_enhance, refresh=refresh, chunk_size=chunk_size, chunk_overlap=chunk_overlap, @@ -287,6 +287,9 @@ class KnowledgeFile: def files2docs_in_thread( files: List[Union[KnowledgeFile, Tuple[str, str], Dict]], + chunk_size: int = CHUNK_SIZE, + chunk_overlap: int = OVERLAP_SIZE, + zh_title_enhance: bool = ZH_TITLE_ENHANCE, pool: ThreadPoolExecutor = None, ) -> Generator: ''' @@ -314,6 +317,9 @@ def files2docs_in_thread( kwargs = file file = KnowledgeFile(filename=filename, knowledge_base_name=kb_name) kwargs["file"] = file + kwargs["chunk_size"] = chunk_size + kwargs["chunk_overlap"] = chunk_overlap + kwargs["zh_title_enhance"] = zh_title_enhance kwargs_list.append(kwargs) for result in run_in_thread_pool(func=file2docs, params=kwargs_list, pool=pool): diff --git a/webui_pages/knowledge_base/knowledge_base.py b/webui_pages/knowledge_base/knowledge_base.py index 2fafc51..c4069ab 100644 --- a/webui_pages/knowledge_base/knowledge_base.py +++ b/webui_pages/knowledge_base/knowledge_base.py @@ -6,7 +6,9 @@ import pandas as pd from server.knowledge_base.utils import get_file_path, LOADER_DICT from server.knowledge_base.kb_service.base import get_kb_details, get_kb_file_details from typing import Literal, Dict, Tuple -from configs.model_config import embedding_model_dict, kbs_config, EMBEDDING_MODEL, DEFAULT_VS_TYPE +from configs.model_config import (embedding_model_dict, kbs_config, + EMBEDDING_MODEL, DEFAULT_VS_TYPE, + CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE) import os import time @@ -125,25 +127,32 @@ def knowledge_base_page(api: ApiRequest): elif selected_kb: kb = selected_kb + with st.sidebar: + chunk_size = st.number_input("单段文本最大长度:", 1, 1000, CHUNK_SIZE) + chunk_overlap = st.number_input("相邻文本重合长度:", 0, 500, OVERLAP_SIZE) + zh_title_enhance = st.checkbox("开启中文标题加强:", ZH_TITLE_ENHANCE) + # 上传文件 - # sentence_size = st.slider("文本入库分句长度限制", 1, 1000, SENTENCE_SIZE, disabled=True) - files = st.file_uploader("上传知识文件", + files = st.file_uploader("上传知识文件:", [i for ls in LOADER_DICT.values() for i in ls], accept_multiple_files=True, ) if st.button( "添加文件到知识库", - # help="请先上传文件,再点击添加", # use_container_width=True, disabled=len(files) == 0, ): - ret = api.upload_kb_docs(files, knowledge_base_name=kb, override=True) + ret = api.upload_kb_docs(files, + knowledge_base_name=kb, + override=True, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance) if msg := check_success_msg(ret): st.toast(msg, icon="✔") elif msg := check_error_msg(ret): st.toast(msg, icon="✖") - st.session_state.files = [] st.divider() @@ -216,7 +225,11 @@ def knowledge_base_page(api: ApiRequest): use_container_width=True, ): file_names = [row["file_name"] for row in selected_rows] - api.update_kb_docs(kb, file_names=file_names) + api.update_kb_docs(kb, + file_names=file_names, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance) st.experimental_rerun() # 将文件从向量库中删除,但不删除文件本身。 @@ -251,7 +264,10 @@ def knowledge_base_page(api: ApiRequest): with st.spinner("向量库重构中,请耐心等待,勿刷新或关闭页面。"): empty = st.empty() empty.progress(0.0, "") - for d in api.recreate_vector_store(kb): + for d in api.recreate_vector_store(kb, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + zh_title_enhance=zh_title_enhance): if msg := check_error_msg(d): st.toast(msg) else: diff --git a/webui_pages/utils.py b/webui_pages/utils.py index 3aed735..26e5320 100644 --- a/webui_pages/utils.py +++ b/webui_pages/utils.py @@ -10,6 +10,9 @@ from configs.model_config import ( HISTORY_LEN, TEMPERATURE, SCORE_THRESHOLD, + CHUNK_SIZE, + OVERLAP_SIZE, + ZH_TITLE_ENHANCE, VECTOR_SEARCH_TOP_K, SEARCH_ENGINE_TOP_K, logger, log_verbose, @@ -575,6 +578,9 @@ class ApiRequest: knowledge_base_name: str, override: bool = False, to_vector_store: bool = True, + chunk_size=CHUNK_SIZE, + chunk_overlap=OVERLAP_SIZE, + zh_title_enhance=ZH_TITLE_ENHANCE, docs: Dict = {}, not_refresh_vs_cache: bool = False, no_remote_api: bool = None, @@ -600,6 +606,9 @@ class ApiRequest: "knowledge_base_name": knowledge_base_name, "override": override, "to_vector_store": to_vector_store, + "chunk_size": chunk_size, + "chunk_overlap": chunk_overlap, + "zh_title_enhance": zh_title_enhance, "docs": docs, "not_refresh_vs_cache": not_refresh_vs_cache, } @@ -665,6 +674,9 @@ class ApiRequest: knowledge_base_name: str, file_names: List[str], override_custom_docs: bool = False, + chunk_size=CHUNK_SIZE, + chunk_overlap=OVERLAP_SIZE, + zh_title_enhance=ZH_TITLE_ENHANCE, docs: Dict = {}, not_refresh_vs_cache: bool = False, no_remote_api: bool = None, @@ -679,6 +691,9 @@ class ApiRequest: "knowledge_base_name": knowledge_base_name, "file_names": file_names, "override_custom_docs": override_custom_docs, + "chunk_size": chunk_size, + "chunk_overlap": chunk_overlap, + "zh_title_enhance": zh_title_enhance, "docs": docs, "not_refresh_vs_cache": not_refresh_vs_cache, } @@ -701,6 +716,9 @@ class ApiRequest: allow_empty_kb: bool = True, vs_type: str = DEFAULT_VS_TYPE, embed_model: str = EMBEDDING_MODEL, + chunk_size=CHUNK_SIZE, + chunk_overlap=OVERLAP_SIZE, + zh_title_enhance=ZH_TITLE_ENHANCE, no_remote_api: bool = None, ): ''' @@ -714,6 +732,9 @@ class ApiRequest: "allow_empty_kb": allow_empty_kb, "vs_type": vs_type, "embed_model": embed_model, + "chunk_size": chunk_size, + "chunk_overlap": chunk_overlap, + "zh_title_enhance": zh_title_enhance, } if no_remote_api: