diff --git a/configs/kb_config.py.example b/configs/kb_config.py.example new file mode 100644 index 0000000..3ceee3c --- /dev/null +++ b/configs/kb_config.py.example @@ -0,0 +1,99 @@ +import os + + +# 默认向量库类型。可选:faiss, milvus, pg. +DEFAULT_VS_TYPE = "faiss" + +# 缓存向量库数量(针对FAISS) +CACHED_VS_NUM = 1 + +# 知识库中单段文本长度(不适用MarkdownHeaderTextSplitter) +CHUNK_SIZE = 250 + +# 知识库中相邻文本重合长度(不适用MarkdownHeaderTextSplitter) +OVERLAP_SIZE = 50 + +# 知识库匹配向量数量 +VECTOR_SEARCH_TOP_K = 3 + +# 知识库匹配相关度阈值,取值范围在0-1之间,SCORE越小,相关度越高,取到1相当于不筛选,建议设置在0.5左右 +SCORE_THRESHOLD = 1 + +# 搜索引擎匹配结果数量 +SEARCH_ENGINE_TOP_K = 3 + + +# Bing 搜索必备变量 +# 使用 Bing 搜索需要使用 Bing Subscription Key,需要在Azure portal中申请试用bing search +# 具体申请方式请见 +# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/create-bing-search-service-resource +# 使用python创建bing api 搜索实例详见: +# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/quickstarts/rest/python +BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search" +# 注意不是bing Webmaster Tools的api key, + +# 此外,如果是在服务器上,报Failed to establish a new connection: [Errno 110] Connection timed out +# 是因为服务器加了防火墙,需要联系管理员加白名单,如果公司的服务器的话,就别想了GG +BING_SUBSCRIPTION_KEY = "" + +# 是否开启中文标题加强,以及标题增强的相关配置 +# 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记; +# 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。 +ZH_TITLE_ENHANCE = False + + +# 通常情况下不需要更改以下内容 + +# 知识库默认存储路径 +KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base") +if not os.path.exists(KB_ROOT_PATH): + os.mkdir(KB_ROOT_PATH) + +# 数据库默认存储路径。 +# 如果使用sqlite,可以直接修改DB_ROOT_PATH;如果使用其它数据库,请直接修改SQLALCHEMY_DATABASE_URI。 +DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db") +SQLALCHEMY_DATABASE_URI = f"sqlite:///{DB_ROOT_PATH}" + +# 可选向量库类型及对应配置 +kbs_config = { + "faiss": { + }, + "milvus": { + "host": "127.0.0.1", + "port": "19530", + "user": "", + "password": "", + "secure": False, + }, + "pg": { + "connection_uri": "postgresql://postgres:postgres@127.0.0.1:5432/langchain_chatchat", + } +} + +# 
TextSplitter配置项,如果你不明白其中的含义,就不要修改。 +text_splitter_dict = { + "ChineseRecursiveTextSplitter": { + "source": "huggingface", ## 选择tiktoken则使用openai的方法 + "tokenizer_name_or_path": "gpt2", + }, + "SpacyTextSplitter": { + "source": "huggingface", + "tokenizer_name_or_path": "", + }, + "RecursiveCharacterTextSplitter": { + "source": "tiktoken", + "tokenizer_name_or_path": "cl100k_base", + }, + "MarkdownHeaderTextSplitter": { + "headers_to_split_on": + [ + ("#", "head1"), + ("##", "head2"), + ("###", "head3"), + ("####", "head4"), + ] + }, +} + +# TEXT_SPLITTER 名称 +TEXT_SPLITTER_NAME = "ChineseRecursiveTextSplitter" diff --git a/img/LLM_success.png b/img/LLM_success.png new file mode 100644 index 0000000..48bd274 Binary files /dev/null and b/img/LLM_success.png differ diff --git a/img/init_knowledge_base.jpg b/img/init_knowledge_base.jpg new file mode 100644 index 0000000..a031b77 Binary files /dev/null and b/img/init_knowledge_base.jpg differ diff --git a/img/knowledge_base_success.jpg b/img/knowledge_base_success.jpg new file mode 100644 index 0000000..6639ee9 Binary files /dev/null and b/img/knowledge_base_success.jpg differ diff --git a/img/webui_0915_0.png b/img/webui_0915_0.png deleted file mode 100644 index 058d7b1..0000000 Binary files a/img/webui_0915_0.png and /dev/null differ diff --git a/img/webui_0915_1.png b/img/webui_0915_1.png deleted file mode 100644 index 8df1eca..0000000 Binary files a/img/webui_0915_1.png and /dev/null differ diff --git a/requirements.txt b/requirements.txt index c4e9bbd..68385b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -langchain>=0.0.302 -fschat[model_worker]==0.2.30 +langchain>=0.0.310 +fschat[model_worker]>=0.2.30 openai sentence_transformers transformers>=4.34 @@ -17,8 +17,8 @@ SQLAlchemy==2.0.19 faiss-cpu accelerate spacy -PyMuPDF==1.22.5 -rapidocr_onnxruntime>=1.3.2 +PyMuPDF +rapidocr_onnxruntime requests pathlib diff --git a/requirements_api.txt b/requirements_api.txt index 8ecb492..b5428dd 100644 --- 
a/requirements_api.txt +++ b/requirements_api.txt @@ -1,7 +1,7 @@ -langchain>=0.0.302 +langchain>=0.0.310 fschat[model_worker]>=0.2.30 openai -sentence_transformers +sentence_transformers>=2.2.2 transformers>=4.34 torch>=2.0.1 torchvision