Langchain-Chatchat/chatchat_data/kb_settings.yaml

115 lines
3.0 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 知识库相关配置
# 默认使用的知识库
DEFAULT_KNOWLEDGE_BASE: samples
# 默认向量库/全文检索引擎类型
# 可选值:['faiss', 'milvus', 'zilliz', 'pg', 'es', 'relyt', 'chromadb']
DEFAULT_VS_TYPE: faiss
# 缓存向量库数量针对FAISS
CACHED_VS_NUM: 1
# 缓存临时向量库数量针对FAISS用于文件对话
CACHED_MEMO_VS_NUM: 10
# 知识库中单段文本长度(不适用MarkdownHeaderTextSplitter)
CHUNK_SIZE: 750
# 知识库中相邻文本重合长度(不适用MarkdownHeaderTextSplitter)
OVERLAP_SIZE: 150
# 知识库匹配向量数量
VECTOR_SEARCH_TOP_K: 3
# 知识库匹配相关度阈值取值范围在0-2之间SCORE越小相关度越高取到2相当于不筛选建议设置在0.5左右
SCORE_THRESHOLD: 2.0
# 默认搜索引擎
# 可选值:['bing', 'duckduckgo', 'metaphor', 'searx']
DEFAULT_SEARCH_ENGINE: duckduckgo
# 搜索引擎匹配结题数量
SEARCH_ENGINE_TOP_K: 3
# 是否开启中文标题加强,以及标题增强的相关配置
ZH_TITLE_ENHANCE: false
# PDF OCR 控制:只对宽高超过页面一定比例(图片宽/页面宽,图片高/页面高)的图片进行 OCR。
# 这样可以避免 PDF 中一些小图片的干扰,提高非扫描版 PDF 处理速度
PDF_OCR_THRESHOLD:
- 0.6
- 0.6
# 每个知识库的初始化介绍用于在初始化知识库时显示和Agent调用没写则没有介绍不会被Agent调用。
KB_INFO:
samples: 关于本项目issue的解答
# 可选向量库类型及对应配置
kbs_config:
faiss: {}
milvus:
host: 127.0.0.1
port: '19530'
user: ''
password: ''
secure: false
zilliz:
host: in01-a7ce524e41e3935.ali-cn-hangzhou.vectordb.zilliz.com.cn
port: '19530'
user: ''
password: ''
secure: true
pg:
connection_uri: postgresql://postgres:postgres@127.0.0.1:5432/langchain_chatchat
relyt:
connection_uri: postgresql+psycopg2://postgres:postgres@127.0.0.1:7000/langchain_chatchat
es:
scheme: http
host: 127.0.0.1
port: '9200'
index_name: test_index
user: ''
password: ''
verify_certs: true
ca_certs:
client_cert:
client_key:
milvus_kwargs:
search_params:
metric_type: L2
index_params:
metric_type: L2
index_type: HNSW
chromadb: {}
# TextSplitter配置项如果你不明白其中的含义就不要修改。
# source 如果选择tiktoken则使用openai的方法 "huggingface"
text_splitter_dict:
ChineseRecursiveTextSplitter:
source: ''
tokenizer_name_or_path: ''
SpacyTextSplitter:
source: huggingface
tokenizer_name_or_path: gpt2
RecursiveCharacterTextSplitter:
source: tiktoken
tokenizer_name_or_path: cl100k_base
MarkdownHeaderTextSplitter:
headers_to_split_on:
- - '#'
- head1
- - '##'
- head2
- - '###'
- head3
- - '####'
- head4
# TEXT_SPLITTER 名称
TEXT_SPLITTER_NAME: ChineseRecursiveTextSplitter
# Embedding模型定制词语的词表文件
EMBEDDING_KEYWORD_FILE: embedding_keywords.txt