Merge pull request #1792 from liunux4odoo/fix
支持metaphor搜索引擎(无需代理,key申请简单,目前不支持中文)
This commit is contained in:
commit
7e28291e9f
|
|
@ -1,6 +1,9 @@
|
|||
import os
|
||||
|
||||
|
||||
# 默认使用的知识库
|
||||
DEFAULT_KNOWLEDGE_BASE = "samples"
|
||||
|
||||
# 默认向量库类型。可选:faiss, milvus(离线) & zilliz(在线), pg.
|
||||
DEFAULT_VS_TYPE = "faiss"
|
||||
|
||||
|
|
@ -19,6 +22,9 @@ VECTOR_SEARCH_TOP_K = 3
|
|||
# 知识库匹配相关度阈值,取值范围在0-1之间,SCORE越小,相关度越高,取到1相当于不筛选,建议设置在0.5左右
|
||||
SCORE_THRESHOLD = 1
|
||||
|
||||
# 默认搜索引擎。可选:bing, duckduckgo, metaphor
|
||||
DEFAULT_SEARCH_ENGINE = "duckduckgo"
|
||||
|
||||
# 搜索引擎匹配结果数量
|
||||
SEARCH_ENGINE_TOP_K = 3
|
||||
|
||||
|
|
@ -36,6 +42,10 @@ BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
|
|||
# 是因为服务器加了防火墙,需要联系管理员加白名单,如果公司的服务器的话,就别想了GG
|
||||
BING_SUBSCRIPTION_KEY = ""
|
||||
|
||||
# metaphor搜索需要KEY
|
||||
METAPHOR_API_KEY = ""
|
||||
|
||||
|
||||
# 是否开启中文标题加强,以及标题增强的相关配置
|
||||
# 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记;
|
||||
# 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。
|
||||
|
|
@ -49,10 +59,12 @@ KB_INFO = {
|
|||
}
|
||||
|
||||
# 通常情况下不需要更改以下内容
|
||||
|
||||
# 知识库默认存储路径
|
||||
KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base")
|
||||
if not os.path.exists(KB_ROOT_PATH):
|
||||
os.mkdir(KB_ROOT_PATH)
|
||||
|
||||
# 数据库默认存储路径。
|
||||
# 如果使用sqlite,可以直接修改DB_ROOT_PATH;如果使用其它数据库,请直接修改SQLALCHEMY_DATABASE_URI。
|
||||
DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db")
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
from .search_knowledge_simple import knowledge_search_simple
|
||||
from .search_all_knowledge_once import knowledge_search_once
|
||||
from .search_all_knowledge_more import knowledge_search_more
|
||||
from .travel_assistant import travel_assistant
|
||||
# from .travel_assistant import travel_assistant
|
||||
from .calculate import calculate
|
||||
from .translator import translate
|
||||
from .weather import weathercheck
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
from langchain.utilities import BingSearchAPIWrapper, DuckDuckGoSearchAPIWrapper
|
||||
from configs import (BING_SEARCH_URL, BING_SUBSCRIPTION_KEY,
|
||||
LLM_MODEL, SEARCH_ENGINE_TOP_K, TEMPERATURE)
|
||||
from configs import (BING_SEARCH_URL, BING_SUBSCRIPTION_KEY, METAPHOR_API_KEY,
|
||||
LLM_MODEL, SEARCH_ENGINE_TOP_K, TEMPERATURE,
|
||||
TEXT_SPLITTER_NAME, OVERLAP_SIZE)
|
||||
from fastapi import Body
|
||||
from fastapi.responses import StreamingResponse
|
||||
from fastapi.concurrency import run_in_threadpool
|
||||
|
|
@ -11,7 +12,7 @@ from langchain.callbacks import AsyncIteratorCallbackHandler
|
|||
from typing import AsyncIterable
|
||||
import asyncio
|
||||
from langchain.prompts.chat import ChatPromptTemplate
|
||||
from typing import List, Optional
|
||||
from typing import List, Optional, Dict
|
||||
from server.chat.utils import History
|
||||
from langchain.docstore.document import Document
|
||||
import json
|
||||
|
|
@ -32,8 +33,49 @@ def duckduckgo_search(text, result_len=SEARCH_ENGINE_TOP_K):
|
|||
return search.results(text, result_len)
|
||||
|
||||
|
||||
def metaphor_search(
        text: str,
        result_len: int = SEARCH_ENGINE_TOP_K,
        splitter_name: str = "SpacyTextSplitter",
        chunk_size: int = 500,
        chunk_overlap: int = OVERLAP_SIZE,
) -> List[Dict]:
    """Query the Metaphor search engine and return result snippets.

    Args:
        text: The search query; also used to re-rank the split chunks.
        result_len: Number of results requested from Metaphor and the
            maximum number of chunks kept after re-ranking.
        splitter_name: Name handed to ``make_text_splitter``.
        chunk_size: Chunk size handed to ``make_text_splitter``.
        chunk_overlap: Chunk overlap handed to ``make_text_splitter``.

    Returns:
        A list of dicts with the keys ``snippet``, ``link`` and ``title``.
        The list is empty when ``METAPHOR_API_KEY`` is not configured.
    """
    from metaphor_python import Metaphor
    from server.knowledge_base.kb_cache.faiss_cache import memo_faiss_pool
    from server.knowledge_base.utils import make_text_splitter

    # Without an API key there is nothing to query.
    if not METAPHOR_API_KEY:
        return []

    response = Metaphor(METAPHOR_API_KEY).search(
        text,
        num_results=result_len,
        use_autoprompt=True,
    )
    pages = response.get_contents().contents

    # Metaphor returns long texts, so they are split into chunks before
    # being searched again.
    raw_docs = []
    for page in pages:
        raw_docs.append(
            Document(
                page_content=page.extract,
                metadata={"link": page.url, "title": page.title},
            )
        )

    splitter = make_text_splitter(
        splitter_name=splitter_name,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(raw_docs)

    # Put the chunks into a temporary vector store and keep only the
    # top ``result_len`` chunks for the query.
    if len(chunks) > result_len:
        store = memo_faiss_pool.new_vector_store()
        store.add_documents(chunks)
        chunks = store.similarity_search(text, k=result_len, score_threshold=1.0)

    return [
        {
            "snippet": chunk.page_content,
            "link": chunk.metadata["link"],
            "title": chunk.metadata["title"],
        }
        for chunk in chunks
    ]
|
||||
|
||||
|
||||
# Maps each supported search engine name to the function that queries it.
SEARCH_ENGINES = dict(
    bing=bing_search,
    duckduckgo=duckduckgo_search,
    metaphor=metaphor_search,
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -140,7 +140,7 @@ if __name__ == "__main__":
|
|||
ids = vs.add_texts([f"text added by {name}"], embeddings=embeddings)
|
||||
pprint(ids)
|
||||
elif r == 2: # search docs
|
||||
docs = vs.similarity_search_with_score(f"{name}", top_k=3, score_threshold=1.0)
|
||||
docs = vs.similarity_search_with_score(f"{name}", k=3, score_threshold=1.0)
|
||||
pprint(docs)
|
||||
if r == 3: # delete docs
|
||||
logger.warning(f"清除 {vs_name} by {name}")
|
||||
|
|
|
|||
|
|
@ -568,6 +568,8 @@ def get_server_configs() -> Dict:
|
|||
获取configs中的原始配置项,供前端使用
|
||||
'''
|
||||
from configs.kb_config import (
|
||||
DEFAULT_KNOWLEDGE_BASE,
|
||||
DEFAULT_SEARCH_ENGINE,
|
||||
DEFAULT_VS_TYPE,
|
||||
CHUNK_SIZE,
|
||||
OVERLAP_SIZE,
|
||||
|
|
|
|||
7
webui.py
7
webui.py
|
|
@ -21,13 +21,6 @@ if __name__ == "__main__":
|
|||
}
|
||||
)
|
||||
|
||||
if not chat_box.chat_inited:
|
||||
running_models = api.list_running_models()
|
||||
st.toast(
|
||||
f"欢迎使用 [`Langchain-Chatchat`](https://github.com/chatchat-space/Langchain-Chatchat) ! \n\n"
|
||||
f"当前运行中的模型`{running_models}`, 您可以开始提问了."
|
||||
)
|
||||
|
||||
pages = {
|
||||
"对话": {
|
||||
"icon": "chat",
|
||||
|
|
|
|||
|
|
@ -3,9 +3,11 @@ from webui_pages.utils import *
|
|||
from streamlit_chatbox import *
|
||||
from datetime import datetime
|
||||
import os
|
||||
from configs import LLM_MODEL, TEMPERATURE, HISTORY_LEN, PROMPT_TEMPLATES
|
||||
from configs import (LLM_MODEL, TEMPERATURE, HISTORY_LEN, PROMPT_TEMPLATES,
|
||||
DEFAULT_KNOWLEDGE_BASE, DEFAULT_SEARCH_ENGINE)
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
chat_box = ChatBox(
|
||||
assistant_avatar=os.path.join(
|
||||
"img",
|
||||
|
|
@ -55,7 +57,13 @@ def get_default_llm_model(api: ApiRequest) -> (str, bool):
|
|||
|
||||
|
||||
def dialogue_page(api: ApiRequest):
|
||||
chat_box.init_session()
|
||||
if not chat_box.chat_inited:
|
||||
default_model = get_default_llm_model(api)[0]
|
||||
st.toast(
|
||||
f"欢迎使用 [`Langchain-Chatchat`](https://github.com/chatchat-space/Langchain-Chatchat) ! \n\n"
|
||||
f"当前运行的模型`{default_model}`, 您可以开始提问了."
|
||||
)
|
||||
chat_box.init_session()
|
||||
|
||||
with st.sidebar:
|
||||
# TODO: 对话模型与会话绑定
|
||||
|
|
@ -156,9 +164,13 @@ def dialogue_page(api: ApiRequest):
|
|||
if dialogue_mode == "知识库问答":
|
||||
with st.expander("知识库配置", True):
|
||||
kb_list = api.list_knowledge_bases()
|
||||
index = 0
|
||||
if DEFAULT_KNOWLEDGE_BASE in kb_list:
|
||||
index = kb_list.index(DEFAULT_KNOWLEDGE_BASE)
|
||||
selected_kb = st.selectbox(
|
||||
"请选择知识库:",
|
||||
kb_list,
|
||||
index=index,
|
||||
on_change=on_kb_change,
|
||||
key="selected_kb",
|
||||
)
|
||||
|
|
@ -167,11 +179,15 @@ def dialogue_page(api: ApiRequest):
|
|||
|
||||
elif dialogue_mode == "搜索引擎问答":
|
||||
search_engine_list = api.list_search_engines()
|
||||
if DEFAULT_SEARCH_ENGINE in search_engine_list:
|
||||
index = search_engine_list.index(DEFAULT_SEARCH_ENGINE)
|
||||
else:
|
||||
index = search_engine_list.index("duckduckgo") if "duckduckgo" in search_engine_list else 0
|
||||
with st.expander("搜索引擎配置", True):
|
||||
search_engine = st.selectbox(
|
||||
label="请选择搜索引擎",
|
||||
options=search_engine_list,
|
||||
index=search_engine_list.index("duckduckgo") if "duckduckgo" in search_engine_list else 0,
|
||||
index=index,
|
||||
)
|
||||
se_top_k = st.number_input("匹配搜索结果条数:", 1, 20, SEARCH_ENGINE_TOP_K)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue