API and WebUI knowledge base operations now support the chunk_size/overlap_size/zh_title_enhance parameters (#1459)

parent 9defa4332e
commit 16d8809c9a
@@ -104,6 +104,10 @@ LLM_MODEL = "chatglm2-6b"
 # 历史对话轮数
 HISTORY_LEN = 3
 
+# LLM通用对话参数
+TEMPERATURE = 0.7
+# TOP_P = 0.95 # ChatOpenAI暂不支持该参数
+
 # LLM 运行设备。设为"auto"会自动检测,也可手动设定为"cuda","mps","cpu"其中之一。
 LLM_DEVICE = "auto"
 
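The three per-request parameters introduced by this commit default to knowledge-base constants imported from configs.model_config throughout the diff below. For orientation only, here is a hypothetical sketch of those definitions; the names come from the imports in this commit, but the values are placeholders and are not part of this diff.

# configs/model_config.py -- illustrative sketch only; the values are placeholders,
# the real defaults are defined elsewhere in the file and are not shown in this commit.
CHUNK_SIZE = 250          # maximum length of a single text chunk in the knowledge base
OVERLAP_SIZE = 50         # overlap between adjacent chunks
ZH_TITLE_ENHANCE = False  # whether to enable Chinese title enhancement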
@@ -3,6 +3,7 @@ import urllib
 from fastapi import File, Form, Body, Query, UploadFile
 from configs.model_config import (DEFAULT_VS_TYPE, EMBEDDING_MODEL,
                                   VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD,
+                                  CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE,
                                   logger, log_verbose,)
 from server.utils import BaseResponse, ListResponse, run_in_thread_pool
 from server.knowledge_base.utils import (validate_kb_name, list_files_from_folder,get_file_path,
@@ -121,6 +122,9 @@ def upload_docs(files: List[UploadFile] = File(..., description="上传文件,
                 knowledge_base_name: str = Form(..., description="知识库名称", examples=["samples"]),
                 override: bool = Form(False, description="覆盖已有文件"),
                 to_vector_store: bool = Form(True, description="上传文件后是否进行向量化"),
+                chunk_size: int = Body(CHUNK_SIZE, description="知识库中单段文本最大长度"),
+                chunk_overlap: int = Body(OVERLAP_SIZE, description="知识库中相邻文本重合长度"),
+                zh_title_enhance: bool = Body(ZH_TITLE_ENHANCE, description="是否开启中文标题加强"),
                 docs: Json = Form({}, description="自定义的docs", examples=[{"test.txt": [Document(page_content="custom doc")]}]),
                 not_refresh_vs_cache: bool = Form(False, description="暂不保存向量库(用于FAISS)"),
                 ) -> BaseResponse:
@@ -152,6 +156,9 @@ def upload_docs(files: List[UploadFile] = File(..., description="上传文件,
         knowledge_base_name=knowledge_base_name,
         file_names=file_names,
         override_custom_docs=True,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        zh_title_enhance=zh_title_enhance,
         docs=docs,
         not_refresh_vs_cache=True,
     )
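With the three new fields on upload_docs, a client can control chunking per upload. Below is a minimal sketch using requests; it assumes the API server listens on 127.0.0.1:7861 and that the endpoint is mounted at /knowledge_base/upload_docs, neither of which is shown in this diff. The new chunk_size, chunk_overlap and zh_title_enhance values travel as ordinary form fields next to the uploaded file.

import requests

BASE_URL = "http://127.0.0.1:7861"  # assumed server address

with open("test.txt", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/knowledge_base/upload_docs",  # assumed route for upload_docs
        files=[("files", ("test.txt", f, "text/plain"))],
        data={
            "knowledge_base_name": "samples",
            "override": True,
            "to_vector_store": True,
            # per-request chunking parameters added by this commit
            "chunk_size": 250,
            "chunk_overlap": 50,
            "zh_title_enhance": False,
        },
    )
print(resp.json())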
@@ -199,6 +206,9 @@ def delete_docs(knowledge_base_name: str = Body(..., examples=["samples"]),
 def update_docs(
         knowledge_base_name: str = Body(..., description="知识库名称", examples=["samples"]),
         file_names: List[str] = Body(..., description="文件名称,支持多文件", examples=["file_name"]),
+        chunk_size: int = Body(CHUNK_SIZE, description="知识库中单段文本最大长度"),
+        chunk_overlap: int = Body(OVERLAP_SIZE, description="知识库中相邻文本重合长度"),
+        zh_title_enhance: bool = Body(ZH_TITLE_ENHANCE, description="是否开启中文标题加强"),
         override_custom_docs: bool = Body(False, description="是否覆盖之前自定义的docs"),
         docs: Json = Body({}, description="自定义的docs", examples=[{"test.txt": [Document(page_content="custom doc")]}]),
         not_refresh_vs_cache: bool = Body(False, description="暂不保存向量库(用于FAISS)"),
@@ -233,7 +243,10 @@ def update_docs(
 
     # 从文件生成docs,并进行向量化。
     # 这里利用了KnowledgeFile的缓存功能,在多线程中加载Document,然后传给KnowledgeFile
-    for status, result in files2docs_in_thread(kb_files):
+    for status, result in files2docs_in_thread(kb_files,
+                                               chunk_size=chunk_size,
+                                               chunk_overlap=chunk_overlap,
+                                               zh_title_enhance=zh_title_enhance):
         if status:
             kb_name, file_name, new_docs = result
             kb_file = KnowledgeFile(filename=file_name,
@@ -307,7 +320,10 @@ def recreate_vector_store(
         allow_empty_kb: bool = Body(True),
         vs_type: str = Body(DEFAULT_VS_TYPE),
         embed_model: str = Body(EMBEDDING_MODEL),
-    ):
+        chunk_size: int = Body(CHUNK_SIZE, description="知识库中单段文本最大长度"),
+        chunk_overlap: int = Body(OVERLAP_SIZE, description="知识库中相邻文本重合长度"),
+        zh_title_enhance: bool = Body(ZH_TITLE_ENHANCE, description="是否开启中文标题加强"),
+    ):
     '''
     recreate vector store from the content.
     this is usefull when user can copy files to content folder directly instead of upload through network.
@@ -325,7 +341,10 @@ def recreate_vector_store(
         files = list_files_from_folder(knowledge_base_name)
         kb_files = [(file, knowledge_base_name) for file in files]
         i = 0
-        for status, result in files2docs_in_thread(kb_files):
+        for status, result in files2docs_in_thread(kb_files,
+                                                   chunk_size=chunk_size,
+                                                   chunk_overlap=chunk_overlap,
+                                                   zh_title_enhance=zh_title_enhance):
             if status:
                 kb_name, file_name, docs = result
                 kb_file = KnowledgeFile(filename=file_name, knowledge_base_name=kb_name)
@@ -1,4 +1,6 @@
-from configs.model_config import EMBEDDING_MODEL, DEFAULT_VS_TYPE, logger, log_verbose
+from configs.model_config import (EMBEDDING_MODEL, DEFAULT_VS_TYPE,
+                                  CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE,
+                                  logger, log_verbose)
 from server.knowledge_base.utils import (get_file_path, list_kbs_from_folder,
                                          list_files_from_folder,files2docs_in_thread,
                                          KnowledgeFile,)
@@ -6,13 +8,9 @@ from server.knowledge_base.kb_service.base import KBServiceFactory, SupportedVST
 from server.db.repository.knowledge_file_repository import add_file_to_db
 from server.db.base import Base, engine
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Literal, Any, List
 
 
-pool = ThreadPoolExecutor(os.cpu_count())
-
-
 def create_tables():
     Base.metadata.create_all(bind=engine)
 
@@ -40,6 +38,9 @@ def folder2db(
         mode: Literal["recreate_vs", "fill_info_only", "update_in_db", "increament"],
         vs_type: Literal["faiss", "milvus", "pg", "chromadb"] = DEFAULT_VS_TYPE,
         embed_model: str = EMBEDDING_MODEL,
+        chunk_size: int = CHUNK_SIZE,
+        chunk_overlap: int = OVERLAP_SIZE,
+        zh_title_enhance: bool = ZH_TITLE_ENHANCE,
 ):
     '''
     use existed files in local folder to populate database and/or vector store.
@@ -60,7 +61,10 @@ def folder2db(
             print(f"清理后,知识库 {kb_name} 中共有 {files_count} 个文档。")
 
             kb_files = file_to_kbfile(kb_name, list_files_from_folder(kb_name))
-            for success, result in files2docs_in_thread(kb_files, pool=pool):
+            for success, result in files2docs_in_thread(kb_files,
+                                                        chunk_size=chunk_size,
+                                                        chunk_overlap=chunk_overlap,
+                                                        zh_title_enhance=zh_title_enhance):
                 if success:
                     _, filename, docs = result
                     print(f"正在将 {kb_name}/{filename} 添加到向量库,共包含{len(docs)}条文档")
@@ -89,7 +93,10 @@ def folder2db(
             files = list(set(folder_files) - set(db_files))
             kb_files = file_to_kbfile(kb_name, files)
 
-            for success, result in files2docs_in_thread(kb_files, pool=pool):
+            for success, result in files2docs_in_thread(kb_files,
+                                                        chunk_size=chunk_size,
+                                                        chunk_overlap=chunk_overlap,
+                                                        zh_title_enhance=zh_title_enhance):
                 if success:
                     _, filename, docs = result
                     print(f"正在将 {kb_name}/{filename} 添加到向量库")
@@ -237,7 +237,7 @@ class KnowledgeFile:
     def docs2texts(
             self,
             docs: List[Document] = None,
-            using_zh_title_enhance=ZH_TITLE_ENHANCE,
+            zh_title_enhance: bool = ZH_TITLE_ENHANCE,
             refresh: bool = False,
             chunk_size: int = CHUNK_SIZE,
             chunk_overlap: int = OVERLAP_SIZE,
@@ -252,14 +252,14 @@ class KnowledgeFile:
             docs = text_splitter.split_documents(docs)
 
         print(f"文档切分示例:{docs[0]}")
-        if using_zh_title_enhance:
+        if zh_title_enhance:
             docs = zh_title_enhance(docs)
         self.splited_docs = docs
         return self.splited_docs
 
     def file2text(
             self,
-            using_zh_title_enhance=ZH_TITLE_ENHANCE,
+            zh_title_enhance: bool = ZH_TITLE_ENHANCE,
             refresh: bool = False,
             chunk_size: int = CHUNK_SIZE,
             chunk_overlap: int = OVERLAP_SIZE,
@@ -268,7 +268,7 @@ class KnowledgeFile:
         if self.splited_docs is None or refresh:
             docs = self.file2docs()
             self.splited_docs = self.docs2texts(docs=docs,
-                                                using_zh_title_enhance=using_zh_title_enhance,
+                                                zh_title_enhance=zh_title_enhance,
                                                 refresh=refresh,
                                                 chunk_size=chunk_size,
                                                 chunk_overlap=chunk_overlap,
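After the rename from using_zh_title_enhance to zh_title_enhance, splitting a single file directly through KnowledgeFile looks roughly like the sketch below. The file name and knowledge base name are placeholders, and the file is assumed to already exist in that knowledge base's content folder.

from server.knowledge_base.utils import KnowledgeFile

# Placeholder names; "test.txt" is assumed to exist in the "samples" knowledge base.
kb_file = KnowledgeFile(filename="test.txt", knowledge_base_name="samples")
docs = kb_file.file2text(
    zh_title_enhance=False,   # renamed parameter (previously using_zh_title_enhance)
    chunk_size=250,
    chunk_overlap=50,
)
print(f"split into {len(docs)} chunks")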
@@ -287,6 +287,9 @@ class KnowledgeFile:
 
 def files2docs_in_thread(
         files: List[Union[KnowledgeFile, Tuple[str, str], Dict]],
+        chunk_size: int = CHUNK_SIZE,
+        chunk_overlap: int = OVERLAP_SIZE,
+        zh_title_enhance: bool = ZH_TITLE_ENHANCE,
         pool: ThreadPoolExecutor = None,
 ) -> Generator:
     '''
@@ -314,6 +317,9 @@ def files2docs_in_thread(
             kwargs = file
             file = KnowledgeFile(filename=filename, knowledge_base_name=kb_name)
             kwargs["file"] = file
+        kwargs["chunk_size"] = chunk_size
+        kwargs["chunk_overlap"] = chunk_overlap
+        kwargs["zh_title_enhance"] = zh_title_enhance
         kwargs_list.append(kwargs)
 
     for result in run_in_thread_pool(func=file2docs, params=kwargs_list, pool=pool):
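A minimal usage sketch of the extended files2docs_in_thread generator, following the (filename, knowledge_base_name) tuple form used by the server call sites above; the file and knowledge base names are placeholders.

from server.knowledge_base.utils import files2docs_in_thread

kb_files = [("test.txt", "samples"), ("readme.md", "samples")]  # placeholder entries

for status, result in files2docs_in_thread(kb_files,
                                           chunk_size=250,
                                           chunk_overlap=50,
                                           zh_title_enhance=False):
    if status:
        kb_name, file_name, docs = result
        print(f"{kb_name}/{file_name}: {len(docs)} docs")
    else:
        print(f"failed: {result}")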
@@ -6,7 +6,9 @@ import pandas as pd
 from server.knowledge_base.utils import get_file_path, LOADER_DICT
 from server.knowledge_base.kb_service.base import get_kb_details, get_kb_file_details
 from typing import Literal, Dict, Tuple
-from configs.model_config import embedding_model_dict, kbs_config, EMBEDDING_MODEL, DEFAULT_VS_TYPE
+from configs.model_config import (embedding_model_dict, kbs_config,
+                                  EMBEDDING_MODEL, DEFAULT_VS_TYPE,
+                                  CHUNK_SIZE, OVERLAP_SIZE, ZH_TITLE_ENHANCE)
 import os
 import time
 
@@ -125,25 +127,32 @@ def knowledge_base_page(api: ApiRequest):
     elif selected_kb:
         kb = selected_kb
 
+        with st.sidebar:
+            chunk_size = st.number_input("单段文本最大长度:", 1, 1000, CHUNK_SIZE)
+            chunk_overlap = st.number_input("相邻文本重合长度:", 0, 500, OVERLAP_SIZE)
+            zh_title_enhance = st.checkbox("开启中文标题加强:", ZH_TITLE_ENHANCE)
+
         # 上传文件
         # sentence_size = st.slider("文本入库分句长度限制", 1, 1000, SENTENCE_SIZE, disabled=True)
-        files = st.file_uploader("上传知识文件",
+        files = st.file_uploader("上传知识文件:",
                                  [i for ls in LOADER_DICT.values() for i in ls],
                                  accept_multiple_files=True,
                                  )
 
         if st.button(
                 "添加文件到知识库",
                 # help="请先上传文件,再点击添加",
                 # use_container_width=True,
                 disabled=len(files) == 0,
         ):
-            ret = api.upload_kb_docs(files, knowledge_base_name=kb, override=True)
+            ret = api.upload_kb_docs(files,
+                                     knowledge_base_name=kb,
+                                     override=True,
+                                     chunk_size=chunk_size,
+                                     chunk_overlap=chunk_overlap,
+                                     zh_title_enhance=zh_title_enhance)
             if msg := check_success_msg(ret):
                 st.toast(msg, icon="✔")
             elif msg := check_error_msg(ret):
                 st.toast(msg, icon="✖")
             st.session_state.files = []
 
         st.divider()
 
@@ -216,7 +225,11 @@ def knowledge_base_page(api: ApiRequest):
                 use_container_width=True,
         ):
             file_names = [row["file_name"] for row in selected_rows]
-            api.update_kb_docs(kb, file_names=file_names)
+            api.update_kb_docs(kb,
+                               file_names=file_names,
+                               chunk_size=chunk_size,
+                               chunk_overlap=chunk_overlap,
+                               zh_title_enhance=zh_title_enhance)
             st.experimental_rerun()
 
         # 将文件从向量库中删除,但不删除文件本身。
@@ -251,7 +264,10 @@ def knowledge_base_page(api: ApiRequest):
         with st.spinner("向量库重构中,请耐心等待,勿刷新或关闭页面。"):
             empty = st.empty()
             empty.progress(0.0, "")
-            for d in api.recreate_vector_store(kb):
+            for d in api.recreate_vector_store(kb,
+                                               chunk_size=chunk_size,
+                                               chunk_overlap=chunk_overlap,
+                                               zh_title_enhance=zh_title_enhance):
                 if msg := check_error_msg(d):
                     st.toast(msg)
                 else:
@@ -10,6 +10,9 @@ from configs.model_config import (
     HISTORY_LEN,
     TEMPERATURE,
     SCORE_THRESHOLD,
+    CHUNK_SIZE,
+    OVERLAP_SIZE,
+    ZH_TITLE_ENHANCE,
     VECTOR_SEARCH_TOP_K,
     SEARCH_ENGINE_TOP_K,
     logger, log_verbose,
@@ -575,6 +578,9 @@ class ApiRequest:
         knowledge_base_name: str,
         override: bool = False,
         to_vector_store: bool = True,
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=OVERLAP_SIZE,
+        zh_title_enhance=ZH_TITLE_ENHANCE,
         docs: Dict = {},
         not_refresh_vs_cache: bool = False,
         no_remote_api: bool = None,
@@ -600,6 +606,9 @@ class ApiRequest:
             "knowledge_base_name": knowledge_base_name,
             "override": override,
             "to_vector_store": to_vector_store,
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+            "zh_title_enhance": zh_title_enhance,
             "docs": docs,
             "not_refresh_vs_cache": not_refresh_vs_cache,
         }
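The same parameters can be passed through the ApiRequest client, mirroring the call made from knowledge_base_page above. The import path, server address and the use of an open file object in the files list are assumptions of this sketch.

from webui_pages.utils import ApiRequest  # assumed import path

api = ApiRequest(base_url="http://127.0.0.1:7861")  # assumed server address

with open("test.txt", "rb") as f:
    ret = api.upload_kb_docs([f],
                             knowledge_base_name="samples",
                             override=True,
                             chunk_size=250,
                             chunk_overlap=50,
                             zh_title_enhance=False)
print(ret)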
@@ -665,6 +674,9 @@ class ApiRequest:
         knowledge_base_name: str,
         file_names: List[str],
         override_custom_docs: bool = False,
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=OVERLAP_SIZE,
+        zh_title_enhance=ZH_TITLE_ENHANCE,
         docs: Dict = {},
         not_refresh_vs_cache: bool = False,
         no_remote_api: bool = None,
@@ -679,6 +691,9 @@ class ApiRequest:
             "knowledge_base_name": knowledge_base_name,
             "file_names": file_names,
             "override_custom_docs": override_custom_docs,
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+            "zh_title_enhance": zh_title_enhance,
             "docs": docs,
             "not_refresh_vs_cache": not_refresh_vs_cache,
         }
@@ -701,6 +716,9 @@ class ApiRequest:
         allow_empty_kb: bool = True,
         vs_type: str = DEFAULT_VS_TYPE,
         embed_model: str = EMBEDDING_MODEL,
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=OVERLAP_SIZE,
+        zh_title_enhance=ZH_TITLE_ENHANCE,
         no_remote_api: bool = None,
     ):
         '''
@@ -714,6 +732,9 @@ class ApiRequest:
             "allow_empty_kb": allow_empty_kb,
             "vs_type": vs_type,
             "embed_model": embed_model,
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+            "zh_title_enhance": zh_title_enhance,
         }
 
         if no_remote_api:
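recreate_vector_store streams its progress, so a script that rebuilds a vector store with custom chunking iterates the client call exactly as the WebUI does above. Import path and server address are again assumptions of this sketch.

from webui_pages.utils import ApiRequest  # assumed import path

api = ApiRequest(base_url="http://127.0.0.1:7861")  # assumed server address

for d in api.recreate_vector_store("samples",
                                   chunk_size=250,
                                   chunk_overlap=50,
                                   zh_title_enhance=False):
    # each d reports progress for one file, as rendered by the WebUI progress bar
    print(d)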