# Knowledge base related configuration

# Default knowledge base to use
DEFAULT_KNOWLEDGE_BASE: samples

# Default vector store / full-text search engine type
# Options: ['faiss', 'milvus', 'zilliz', 'pg', 'es', 'relyt', 'chromadb']
DEFAULT_VS_TYPE: faiss

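# Illustrative example (not a recommendation): to store vectors in a local
# Milvus instance instead of FAISS, switch the type here and fill in the
# matching connection entry under kbs_config below.
# DEFAULT_VS_TYPE: milvus
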
# Number of cached vector stores (for FAISS)
CACHED_VS_NUM: 1

# Number of cached temporary vector stores (for FAISS), used for file chat
CACHED_MEMO_VS_NUM: 10

# Length of a single text chunk in the knowledge base (does not apply to MarkdownHeaderTextSplitter)
CHUNK_SIZE: 750

# Overlap length between adjacent text chunks (does not apply to MarkdownHeaderTextSplitter)
OVERLAP_SIZE: 150

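# Worked example, assuming the splitter behaves like a simple sliding window
# (recursive splitters only approximate this): with CHUNK_SIZE 750 and
# OVERLAP_SIZE 150, a new chunk starts about every 750 - 150 = 600 characters,
# so a 3000-character document yields roughly 3000 / 600 = 5 chunks, each
# sharing ~150 characters with its neighbors.
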
# Number of matching vectors to retrieve from the knowledge base
VECTOR_SEARCH_TOP_K: 3

# Relevance threshold for knowledge base matching, in the range 0-2.
# A smaller SCORE means higher relevance; 2.0 disables filtering entirely. A value around 0.5 is recommended.
SCORE_THRESHOLD: 2.0

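# Background, assuming FAISS L2 distance over normalized embeddings (an
# assumption, not a guarantee of this project's scoring): squared L2 distance
# equals 2 * (1 - cosine similarity), which is why scores fall in [0, 2].
# Under that reading, SCORE_THRESHOLD 0.5 keeps only hits with cosine
# similarity above 0.75, applied after the top-k retrieval above.
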
# Default search engine
# Options: ['bing', 'duckduckgo', 'metaphor', 'searx']
DEFAULT_SEARCH_ENGINE: duckduckgo

# Number of search engine results to match
SEARCH_ENGINE_TOP_K: 3

# Whether to enable Chinese title enhancement, along with its related configuration
ZH_TITLE_ENHANCE: false

# PDF OCR control: OCR is applied only to images whose size exceeds a given
# proportion of the page (image width / page width, image height / page height).
# This avoids interference from small images in PDFs and speeds up processing
# of non-scanned PDFs.
PDF_OCR_THRESHOLD:
- 0.6
- 0.6

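# Worked example: on a page 1000 pt wide and 1400 pt tall, the [0.6, 0.6]
# thresholds above OCR an embedded image only if it exceeds 600 pt in width
# and 840 pt in height; a 300 x 200 pt logo is skipped. The exact comparison
# logic is up to the PDF loader; this is an illustrative reading.
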
# Introduction for each knowledge base, shown when the knowledge base is
# initialized and used for Agent calls. A knowledge base without an
# introduction has none displayed and will not be called by the Agent.
KB_INFO:
  samples: Answers to issues about this project

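# Illustrative example ("product_docs" is a hypothetical knowledge base name):
# KB_INFO:
#   samples: Answers to issues about this project
#   product_docs: User-facing product documentation, available to the Agent
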
# Available vector store types and their configuration
kbs_config:
  faiss: {}
  milvus:
    host: 127.0.0.1
    port: '19530'
    user: ''
    password: ''
    secure: false
  zilliz:
    host: in01-a7ce524e41e3935.ali-cn-hangzhou.vectordb.zilliz.com.cn
    port: '19530'
    user: ''
    password: ''
    secure: true
  pg:
    connection_uri: postgresql://postgres:postgres@127.0.0.1:5432/langchain_chatchat
  relyt:
    connection_uri: postgresql+psycopg2://postgres:postgres@127.0.0.1:7000/langchain_chatchat
  es:
    scheme: http
    host: 127.0.0.1
    port: '9200'
    index_name: test_index
    user: ''
    password: ''
    verify_certs: true
    ca_certs:
    client_cert:
    client_key:
  milvus_kwargs:
    search_params:
      metric_type: L2
    index_params:
      metric_type: L2
      index_type: HNSW
  chromadb: {}

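# Illustrative example using standard Milvus HNSW tuning parameters (these are
# Milvus API options; whether this project forwards a nested params mapping is
# an assumption): build quality and search recall could be tuned like
# milvus_kwargs:
#   search_params:
#     metric_type: L2
#     params:
#       ef: 64
#   index_params:
#     metric_type: L2
#     index_type: HNSW
#     params:
#       M: 8
#       efConstruction: 64
# In any case, metric_type should match between search_params and index_params.
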
# TextSplitter configuration. If you do not understand these options, do not change them.
# source: 'tiktoken' uses OpenAI's tokenization method; 'huggingface' uses a HuggingFace tokenizer.
text_splitter_dict:
  ChineseRecursiveTextSplitter:
    source: ''
    tokenizer_name_or_path: ''
  SpacyTextSplitter:
    source: huggingface
    tokenizer_name_or_path: gpt2
  RecursiveCharacterTextSplitter:
    source: tiktoken
    tokenizer_name_or_path: cl100k_base
  MarkdownHeaderTextSplitter:
    headers_to_split_on:
    - - '#'
      - head1
    - - '##'
      - head2
    - - '###'
      - head3
    - - '####'
      - head4

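# Each headers_to_split_on pair is [header marker, metadata key]: chunks split
# at '# ...' lines carry their title under head1, '## ...' under head2, and so
# on, following LangChain's MarkdownHeaderTextSplitter convention. To also
# split on level-5 headings, append one more pair:
# - - '#####'
#   - head5
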
# Name of the TEXT_SPLITTER to use
TEXT_SPLITTER_NAME: ChineseRecursiveTextSplitter

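# The name should match a key in text_splitter_dict above, e.g.:
# TEXT_SPLITTER_NAME: RecursiveCharacterTextSplitter
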
# Vocabulary file of customized terms for the Embedding model
EMBEDDING_KEYWORD_FILE: embedding_keywords.txt
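# Assumed file format (one custom term per line; the terms below are
# placeholders, not entries shipped with the project):
#   my_product_name
#   my_domain_term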