API and WebUI knowledge base operations now support the chunk_size/overlap_size/zh_title_enhance parameters (#1459)

Authored by liunux4odoo on 2023-09-13 11:19:47 +08:00, committed by GitHub
parent 9defa4332e
commit 16d8809c9a
6 changed files with 95 additions and 22 deletions

configs/model_config.py

@@ -104,6 +104,10 @@ LLM_MODEL = "chatglm2-6b"
 # 历史对话轮数
 HISTORY_LEN = 3
 
+# LLM通用对话参数
+TEMPERATURE = 0.7
+# TOP_P = 0.95 # ChatOpenAI暂不支持该参数
+
 # LLM 运行设备。设为"auto"会自动检测,也可手动设定为"cuda","mps","cpu"其中之一。
 LLM_DEVICE = "auto"
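The hunks that follow all import three constants that already exist in configs.model_config. As a rough sketch of what they look like (the values shown are illustrative defaults, not part of this commit):

    # Assumed shape of the constants imported in the hunks below;
    # the values are illustrative, not taken from this commit.
    CHUNK_SIZE = 250          # 单段文本最大长度 (max length of a single text chunk)
    OVERLAP_SIZE = 50         # 相邻文本重合长度 (overlap between adjacent chunks)
    ZH_TITLE_ENHANCE = False  # 中文标题加强 (Chinese title enhancement)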

server/knowledge_base/kb_doc_api.py

@@ -3,6 +3,7 @@ import urllib
 from fastapi import File, Form, Body, Query, UploadFile
 from configs.model_config import (DEFAULT_VS_TYPE, EMBEDDING_MODEL,
                                   VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD,
+                                  CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE,
                                   logger, log_verbose,)
 from server.utils import BaseResponse, ListResponse, run_in_thread_pool
 from server.knowledge_base.utils import (validate_kb_name, list_files_from_folder, get_file_path,
@@ -121,6 +122,9 @@ def upload_docs(files: List[UploadFile] = File(..., description="上传文件,
                 knowledge_base_name: str = Form(..., description="知识库名称", examples=["samples"]),
                 override: bool = Form(False, description="覆盖已有文件"),
                 to_vector_store: bool = Form(True, description="上传文件后是否进行向量化"),
+                chunk_size: int = Body(CHUNK_SIZE, description="知识库中单段文本最大长度"),
+                chunk_overlap: int = Body(OVERLAP_SIZE, description="知识库中相邻文本重合长度"),
+                zh_title_enhance: bool = Body(ZH_TITLE_ENHANCE, description="是否开启中文标题加强"),
                 docs: Json = Form({}, description="自定义的docs", examples=[{"test.txt": [Document(page_content="custom doc")]}]),
                 not_refresh_vs_cache: bool = Form(False, description="暂不保存向量库(用于FAISS)"),
                 ) -> BaseResponse:
@@ -152,6 +156,9 @@ def upload_docs(files: List[UploadFile] = File(..., description="上传文件,
             knowledge_base_name=knowledge_base_name,
             file_names=file_names,
             override_custom_docs=True,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            zh_title_enhance=zh_title_enhance,
             docs=docs,
             not_refresh_vs_cache=True,
         )
@@ -199,6 +206,9 @@ def delete_docs(knowledge_base_name: str = Body(..., examples=["samples"]),
 def update_docs(
     knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
     file_names: List[str] = Body(..., description="文件名称,支持多文件", examples=["file_name"]),
+    chunk_size: int = Body(CHUNK_SIZE, description="知识库中单段文本最大长度"),
+    chunk_overlap: int = Body(OVERLAP_SIZE, description="知识库中相邻文本重合长度"),
+    zh_title_enhance: bool = Body(ZH_TITLE_ENHANCE, description="是否开启中文标题加强"),
     override_custom_docs: bool = Body(False, description="是否覆盖之前自定义的docs"),
     docs: Json = Body({}, description="自定义的docs", examples=[{"test.txt": [Document(page_content="custom doc")]}]),
     not_refresh_vs_cache: bool = Body(False, description="暂不保存向量库(用于FAISS)"),
@@ -233,7 +243,10 @@ def update_docs(
     # 从文件生成docs,并进行向量化。
     # 这里利用了KnowledgeFile的缓存功能,在多线程中加载Document,然后传给KnowledgeFile
-    for status, result in files2docs_in_thread(kb_files):
+    for status, result in files2docs_in_thread(kb_files,
+                                               chunk_size=chunk_size,
+                                               chunk_overlap=chunk_overlap,
+                                               zh_title_enhance=zh_title_enhance):
         if status:
             kb_name, file_name, new_docs = result
             kb_file = KnowledgeFile(filename=file_name,
@@ -307,7 +320,10 @@ def recreate_vector_store(
     allow_empty_kb: bool = Body(True),
     vs_type: str = Body(DEFAULT_VS_TYPE),
     embed_model: str = Body(EMBEDDING_MODEL),
-    ):
+    chunk_size: int = Body(CHUNK_SIZE, description="知识库中单段文本最大长度"),
+    chunk_overlap: int = Body(OVERLAP_SIZE, description="知识库中相邻文本重合长度"),
+    zh_title_enhance: bool = Body(ZH_TITLE_ENHANCE, description="是否开启中文标题加强"),
+    ):
     '''
     recreate vector store from the content.
     this is usefull when user can copy files to content folder directly instead of upload through network.
@@ -325,7 +341,10 @@ def recreate_vector_store(
         files = list_files_from_folder(knowledge_base_name)
         kb_files = [(file, knowledge_base_name) for file in files]
         i = 0
-        for status, result in files2docs_in_thread(kb_files):
+        for status, result in files2docs_in_thread(kb_files,
+                                                   chunk_size=chunk_size,
+                                                   chunk_overlap=chunk_overlap,
+                                                   zh_title_enhance=zh_title_enhance):
             if status:
                 kb_name, file_name, docs = result
                 kb_file = KnowledgeFile(filename=file_name, knowledge_base_name=kb_name)
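For reference, a hedged sketch of exercising the updated upload endpoint over HTTP; the base URL and port (http://127.0.0.1:7861) follow the project's usual defaults and are assumptions, not part of this diff. Since the request is multipart, the new Body parameters travel as ordinary form fields alongside the Form ones:

    # Sketch only: base URL and port are assumed project defaults.
    import requests

    resp = requests.post(
        "http://127.0.0.1:7861/knowledge_base/upload_docs",
        files=[("files", open("test.txt", "rb"))],   # hypothetical file
        data={
            "knowledge_base_name": "samples",
            "override": "true",
            "to_vector_store": "true",
            "chunk_size": 250,          # max chunk length
            "chunk_overlap": 50,        # overlap between adjacent chunks
            "zh_title_enhance": "false",
        },
    )
    print(resp.json())  # BaseResponse, e.g. {"code": 200, "msg": "..."}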

server/knowledge_base/migrate.py

@@ -1,4 +1,6 @@
-from configs.model_config import EMBEDDING_MODEL, DEFAULT_VS_TYPE, logger, log_verbose
+from configs.model_config import (EMBEDDING_MODEL, DEFAULT_VS_TYPE,
+                                  CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE,
+                                  logger, log_verbose)
 from server.knowledge_base.utils import (get_file_path, list_kbs_from_folder,
                                          list_files_from_folder, files2docs_in_thread,
                                          KnowledgeFile,)
@@ -6,13 +8,9 @@ from server.knowledge_base.kb_service.base import KBServiceFactory, SupportedVST
 from server.db.repository.knowledge_file_repository import add_file_to_db
 from server.db.base import Base, engine
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Literal, Any, List
 
-pool = ThreadPoolExecutor(os.cpu_count())
-
 
 def create_tables():
     Base.metadata.create_all(bind=engine)
@@ -40,6 +38,9 @@ def folder2db(
     mode: Literal["recreate_vs", "fill_info_only", "update_in_db", "increament"],
     vs_type: Literal["faiss", "milvus", "pg", "chromadb"] = DEFAULT_VS_TYPE,
     embed_model: str = EMBEDDING_MODEL,
+    chunk_size: int = CHUNK_SIZE,
+    chunk_overlap: int = OVERLAP_SIZE,
+    zh_title_enhance: bool = ZH_TITLE_ENHANCE,
 ):
     '''
     use existed files in local folder to populate database and/or vector store.
@@ -60,7 +61,10 @@ def folder2db(
         print(f"清理后,知识库 {kb_name} 中共有 {files_count} 个文档。")
         kb_files = file_to_kbfile(kb_name, list_files_from_folder(kb_name))
 
-        for success, result in files2docs_in_thread(kb_files, pool=pool):
+        for success, result in files2docs_in_thread(kb_files,
+                                                    chunk_size=chunk_size,
+                                                    chunk_overlap=chunk_overlap,
+                                                    zh_title_enhance=zh_title_enhance):
             if success:
                 _, filename, docs = result
                 print(f"正在将 {kb_name}/{filename} 添加到向量库,共包含{len(docs)}条文档")
@@ -89,7 +93,10 @@ def folder2db(
         files = list(set(folder_files) - set(db_files))
         kb_files = file_to_kbfile(kb_name, files)
 
-        for success, result in files2docs_in_thread(kb_files, pool=pool):
+        for success, result in files2docs_in_thread(kb_files,
+                                                    chunk_size=chunk_size,
+                                                    chunk_overlap=chunk_overlap,
+                                                    zh_title_enhance=zh_title_enhance):
             if success:
                 _, filename, docs = result
                 print(f"正在将 {kb_name}/{filename} 添加到向量库")

server/knowledge_base/utils.py

@@ -237,7 +237,7 @@ class KnowledgeFile:
     def docs2texts(
             self,
             docs: List[Document] = None,
-            using_zh_title_enhance=ZH_TITLE_ENHANCE,
+            zh_title_enhance: bool = ZH_TITLE_ENHANCE,
             refresh: bool = False,
             chunk_size: int = CHUNK_SIZE,
             chunk_overlap: int = OVERLAP_SIZE,
@@ -252,14 +252,14 @@ class KnowledgeFile:
             docs = text_splitter.split_documents(docs)
 
         print(f"文档切分示例:{docs[0]}")
-        if using_zh_title_enhance:
+        if zh_title_enhance:
             docs = zh_title_enhance(docs)
         self.splited_docs = docs
         return self.splited_docs
 
     def file2text(
             self,
-            using_zh_title_enhance=ZH_TITLE_ENHANCE,
+            zh_title_enhance: bool = ZH_TITLE_ENHANCE,
             refresh: bool = False,
             chunk_size: int = CHUNK_SIZE,
             chunk_overlap: int = OVERLAP_SIZE,
@@ -268,7 +268,7 @@ class KnowledgeFile:
         if self.splited_docs is None or refresh:
             docs = self.file2docs()
             self.splited_docs = self.docs2texts(docs=docs,
-                                                using_zh_title_enhance=using_zh_title_enhance,
+                                                zh_title_enhance=zh_title_enhance,
                                                 refresh=refresh,
                                                 chunk_size=chunk_size,
                                                 chunk_overlap=chunk_overlap,
@@ -287,6 +287,9 @@ class KnowledgeFile:
 def files2docs_in_thread(
     files: List[Union[KnowledgeFile, Tuple[str, str], Dict]],
+    chunk_size: int = CHUNK_SIZE,
+    chunk_overlap: int = OVERLAP_SIZE,
+    zh_title_enhance: bool = ZH_TITLE_ENHANCE,
     pool: ThreadPoolExecutor = None,
 ) -> Generator:
     '''
@@ -314,6 +317,9 @@ def files2docs_in_thread(
             kwargs = file
         file = KnowledgeFile(filename=filename, knowledge_base_name=kb_name)
         kwargs["file"] = file
+        kwargs["chunk_size"] = chunk_size
+        kwargs["chunk_overlap"] = chunk_overlap
+        kwargs["zh_title_enhance"] = zh_title_enhance
         kwargs_list.append(kwargs)
 
     for result in run_in_thread_pool(func=file2docs, params=kwargs_list, pool=pool):
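A sketch of the per-file pipeline after this change, both for a single KnowledgeFile and through the threaded helper; the signatures match the hunks above, while the knowledge base and file names are hypothetical:

    from server.knowledge_base.utils import KnowledgeFile, files2docs_in_thread

    # Single file: load, split and (optionally) title-enhance in one call.
    kb_file = KnowledgeFile(filename="test.txt", knowledge_base_name="samples")
    docs = kb_file.file2text(chunk_size=250, chunk_overlap=50, zh_title_enhance=False)

    # In bulk: each (filename, kb_name) pair is processed in a worker thread
    # with the same three splitting parameters.
    for ok, result in files2docs_in_thread([("test.txt", "samples")],
                                           chunk_size=250,
                                           chunk_overlap=50,
                                           zh_title_enhance=False):
        if ok:
            kb_name, file_name, docs = result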

webui_pages/knowledge_base/knowledge_base.py

@@ -6,7 +6,9 @@ import pandas as pd
 from server.knowledge_base.utils import get_file_path, LOADER_DICT
 from server.knowledge_base.kb_service.base import get_kb_details, get_kb_file_details
 from typing import Literal, Dict, Tuple
-from configs.model_config import embedding_model_dict, kbs_config, EMBEDDING_MODEL, DEFAULT_VS_TYPE
+from configs.model_config import (embedding_model_dict, kbs_config,
+                                  EMBEDDING_MODEL, DEFAULT_VS_TYPE,
+                                  CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE)
 import os
 import time
@@ -125,25 +127,32 @@ def knowledge_base_page(api: ApiRequest):
     elif selected_kb:
         kb = selected_kb
 
+        with st.sidebar:
+            chunk_size = st.number_input("单段文本最大长度:", 1, 1000, CHUNK_SIZE)
+            chunk_overlap = st.number_input("相邻文本重合长度:", 0, 500, OVERLAP_SIZE)
+            zh_title_enhance = st.checkbox("开启中文标题加强:", ZH_TITLE_ENHANCE)
+
         # 上传文件
-        # sentence_size = st.slider("文本入库分句长度限制", 1, 1000, SENTENCE_SIZE, disabled=True)
-        files = st.file_uploader("上传知识文件",
+        files = st.file_uploader("上传知识文件:",
                                  [i for ls in LOADER_DICT.values() for i in ls],
                                  accept_multiple_files=True,
                                  )
 
         if st.button(
                 "添加文件到知识库",
-                # help="请先上传文件,再点击添加",
                 # use_container_width=True,
                 disabled=len(files) == 0,
         ):
-            ret = api.upload_kb_docs(files, knowledge_base_name=kb, override=True)
+            ret = api.upload_kb_docs(files,
                                     knowledge_base_name=kb,
                                     override=True,
                                     chunk_size=chunk_size,
                                     chunk_overlap=chunk_overlap,
                                     zh_title_enhance=zh_title_enhance)
             if msg := check_success_msg(ret):
                 st.toast(msg, icon="")
             elif msg := check_error_msg(ret):
                 st.toast(msg, icon="")
-            st.session_state.files = []
 
         st.divider()
@@ -216,7 +225,11 @@ def knowledge_base_page(api: ApiRequest):
                     use_container_width=True,
             ):
                 file_names = [row["file_name"] for row in selected_rows]
-                api.update_kb_docs(kb, file_names=file_names)
+                api.update_kb_docs(kb,
+                                   file_names=file_names,
+                                   chunk_size=chunk_size,
+                                   chunk_overlap=chunk_overlap,
+                                   zh_title_enhance=zh_title_enhance)
                 st.experimental_rerun()
 
         # 将文件从向量库中删除,但不删除文件本身。
@@ -251,7 +264,10 @@ def knowledge_base_page(api: ApiRequest):
             with st.spinner("向量库重构中,请耐心等待,勿刷新或关闭页面。"):
                 empty = st.empty()
                 empty.progress(0.0, "")
-                for d in api.recreate_vector_store(kb):
+                for d in api.recreate_vector_store(kb,
+                                                   chunk_size=chunk_size,
+                                                   chunk_overlap=chunk_overlap,
+                                                   zh_title_enhance=zh_title_enhance):
                     if msg := check_error_msg(d):
                         st.toast(msg)
                     else:

webui_pages/utils.py

@@ -10,6 +10,9 @@ from configs.model_config import (
     HISTORY_LEN,
     TEMPERATURE,
     SCORE_THRESHOLD,
+    CHUNK_SIZE,
+    OVERLAP_SIZE,
+    ZH_TITLE_ENHANCE,
     VECTOR_SEARCH_TOP_K,
     SEARCH_ENGINE_TOP_K,
     logger, log_verbose,
@@ -575,6 +578,9 @@ class ApiRequest:
         knowledge_base_name: str,
         override: bool = False,
         to_vector_store: bool = True,
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=OVERLAP_SIZE,
+        zh_title_enhance=ZH_TITLE_ENHANCE,
         docs: Dict = {},
         not_refresh_vs_cache: bool = False,
         no_remote_api: bool = None,
@@ -600,6 +606,9 @@ class ApiRequest:
             "knowledge_base_name": knowledge_base_name,
             "override": override,
             "to_vector_store": to_vector_store,
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+            "zh_title_enhance": zh_title_enhance,
             "docs": docs,
             "not_refresh_vs_cache": not_refresh_vs_cache,
         }
@@ -665,6 +674,9 @@ class ApiRequest:
         knowledge_base_name: str,
         file_names: List[str],
         override_custom_docs: bool = False,
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=OVERLAP_SIZE,
+        zh_title_enhance=ZH_TITLE_ENHANCE,
         docs: Dict = {},
         not_refresh_vs_cache: bool = False,
         no_remote_api: bool = None,
@@ -679,6 +691,9 @@ class ApiRequest:
             "knowledge_base_name": knowledge_base_name,
             "file_names": file_names,
             "override_custom_docs": override_custom_docs,
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+            "zh_title_enhance": zh_title_enhance,
             "docs": docs,
             "not_refresh_vs_cache": not_refresh_vs_cache,
         }
@@ -701,6 +716,9 @@ class ApiRequest:
         allow_empty_kb: bool = True,
         vs_type: str = DEFAULT_VS_TYPE,
         embed_model: str = EMBEDDING_MODEL,
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=OVERLAP_SIZE,
+        zh_title_enhance=ZH_TITLE_ENHANCE,
         no_remote_api: bool = None,
     ):
         '''
@@ -714,6 +732,9 @@ class ApiRequest:
             "allow_empty_kb": allow_empty_kb,
             "vs_type": vs_type,
             "embed_model": embed_model,
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+            "zh_title_enhance": zh_title_enhance,
         }
 
         if no_remote_api:
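Putting it together on the client side, a hedged sketch of the three WebUI wrapper calls with explicit splitting parameters; the method signatures follow the hunks above, while the no-argument ApiRequest() constructor and the file/kb names are assumptions:

    from webui_pages.utils import ApiRequest

    api = ApiRequest()  # assumed to default to the local API server
    api.upload_kb_docs([open("test.txt", "rb")],
                       knowledge_base_name="samples",
                       chunk_size=250, chunk_overlap=50, zh_title_enhance=False)
    api.update_kb_docs("samples", file_names=["test.txt"],
                       chunk_size=250, chunk_overlap=50, zh_title_enhance=False)
    for d in api.recreate_vector_store("samples",
                                       chunk_size=250, chunk_overlap=50,
                                       zh_title_enhance=False):
        print(d)  # streamed progress updates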