From b9b42991f6692ec51ef09d8d8d8940371e1a4c50 Mon Sep 17 00:00:00 2001 From: liunux4odoo Date: Wed, 18 Oct 2023 23:02:20 +0800 Subject: [PATCH] =?UTF-8?q?-=20=E6=94=AF=E6=8C=81metaphor=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=E5=BC=95=E6=93=8E=EF=BC=88=E6=97=A0=E9=9C=80=E4=BB=A3?= =?UTF-8?q?=E7=90=86=EF=BC=8Ckey=E7=94=B3=E8=AF=B7=E7=AE=80=E5=8D=95?= =?UTF-8?q?=EF=BC=8C=E7=9B=AE=E5=89=8D=E4=B8=8D=E6=94=AF=E6=8C=81=E4=B8=AD?= =?UTF-8?q?=E6=96=87=EF=BC=89=20-=20=E5=A2=9E=E5=8A=A0=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E7=9F=A5=E8=AF=86=E5=BA=93=E5=92=8C=E9=BB=98=E8=AE=A4=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=E5=BC=95=E6=93=8E=E7=9A=84=E9=85=8D=E7=BD=AE=E9=A1=B9?= =?UTF-8?q?=20-=20=E4=BF=AE=E5=A4=8DWEBUI=E5=BC=B9=E5=87=BA=E5=BD=93?= =?UTF-8?q?=E5=89=8D=E6=A8=A1=E5=9E=8B=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- configs/kb_config.py.example | 12 +++++ server/agent/tools/__init__.py | 2 +- server/chat/search_engine_chat.py | 48 +++++++++++++++++-- server/knowledge_base/kb_cache/faiss_cache.py | 2 +- server/utils.py | 2 + webui.py | 7 --- webui_pages/dialogue/dialogue.py | 22 +++++++-- 7 files changed, 80 insertions(+), 15 deletions(-) diff --git a/configs/kb_config.py.example b/configs/kb_config.py.example index a857e80..15abb28 100644 --- a/configs/kb_config.py.example +++ b/configs/kb_config.py.example @@ -1,6 +1,9 @@ import os +# 默认使用的知识库 +DEFAULT_KNOWLEDGE_BASE = "samples" + # 默认向量库类型。可选:faiss, milvus(离线) & zilliz(在线), pg. DEFAULT_VS_TYPE = "faiss" @@ -19,6 +22,9 @@ VECTOR_SEARCH_TOP_K = 3 # 知识库匹配相关度阈值,取值范围在0-1之间,SCORE越小,相关度越高,取到1相当于不筛选,建议设置在0.5左右 SCORE_THRESHOLD = 1 +# 默认搜索引擎。可选:bing, duckduckgo, metaphor +DEFAULT_SEARCH_ENGINE = "duckduckgo" + # 搜索引擎匹配结题数量 SEARCH_ENGINE_TOP_K = 3 @@ -36,6 +42,10 @@ BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search" # 是因为服务器加了防火墙,需要联系管理员加白名单,如果公司的服务器的话,就别想了GG BING_SUBSCRIPTION_KEY = "" +# metaphor搜索需要KEY +METAPHOR_API_KEY = "" + + # 是否开启中文标题加强,以及标题增强的相关配置 # 通过增加标题判断,判断哪些文本为标题,并在metadata中进行标记; # 然后将文本与往上一级的标题进行拼合,实现文本信息的增强。 @@ -49,10 +59,12 @@ KB_INFO = { } # 通常情况下不需要更改以下内容 + # 知识库默认存储路径 KB_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base") if not os.path.exists(KB_ROOT_PATH): os.mkdir(KB_ROOT_PATH) + # 数据库默认存储路径。 # 如果使用sqlite,可以直接修改DB_ROOT_PATH;如果使用其它数据库,请直接修改SQLALCHEMY_DATABASE_URI。 DB_ROOT_PATH = os.path.join(KB_ROOT_PATH, "info.db") diff --git a/server/agent/tools/__init__.py b/server/agent/tools/__init__.py index 8bb5cac..3682787 100644 --- a/server/agent/tools/__init__.py +++ b/server/agent/tools/__init__.py @@ -2,7 +2,7 @@ from .search_knowledge_simple import knowledge_search_simple from .search_all_knowledge_once import knowledge_search_once from .search_all_knowledge_more import knowledge_search_more -from .travel_assistant import travel_assistant +# from .travel_assistant import travel_assistant from .calculate import calculate from .translator import translate from .weather import weathercheck diff --git a/server/chat/search_engine_chat.py b/server/chat/search_engine_chat.py index 83ed65e..06ac856 100644 --- a/server/chat/search_engine_chat.py +++ b/server/chat/search_engine_chat.py @@ -1,6 +1,7 @@ from langchain.utilities import BingSearchAPIWrapper, DuckDuckGoSearchAPIWrapper -from configs import (BING_SEARCH_URL, BING_SUBSCRIPTION_KEY, - LLM_MODEL, SEARCH_ENGINE_TOP_K, TEMPERATURE) +from configs import (BING_SEARCH_URL, BING_SUBSCRIPTION_KEY, METAPHOR_API_KEY, + LLM_MODEL, SEARCH_ENGINE_TOP_K, TEMPERATURE, + TEXT_SPLITTER_NAME, OVERLAP_SIZE) from fastapi import Body from fastapi.responses import StreamingResponse from fastapi.concurrency import run_in_threadpool @@ -11,7 +12,7 @@ from langchain.callbacks import AsyncIteratorCallbackHandler from typing import AsyncIterable import asyncio from langchain.prompts.chat import ChatPromptTemplate -from typing import List, Optional +from typing import List, Optional, Dict from server.chat.utils import History from langchain.docstore.document import Document import json @@ -32,8 +33,49 @@ def duckduckgo_search(text, result_len=SEARCH_ENGINE_TOP_K): return search.results(text, result_len) +def metaphor_search( + text: str, + result_len: int = SEARCH_ENGINE_TOP_K, + splitter_name: str = "SpacyTextSplitter", + chunk_size: int = 500, + chunk_overlap: int = OVERLAP_SIZE, +) -> List[Dict]: + from metaphor_python import Metaphor + from server.knowledge_base.kb_cache.faiss_cache import memo_faiss_pool + from server.knowledge_base.utils import make_text_splitter + + if not METAPHOR_API_KEY: + return [] + + client = Metaphor(METAPHOR_API_KEY) + search = client.search(text, num_results=result_len, use_autoprompt=True) + contents = search.get_contents().contents + + # metaphor 返回的内容都是长文本,需要分词再检索 + docs = [Document(page_content=x.extract, + metadata={"link": x.url, "title": x.title}) + for x in contents] + text_splitter = make_text_splitter(splitter_name=splitter_name, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap) + splitted_docs = text_splitter.split_documents(docs) + + # 将切分好的文档放入临时向量库,重新筛选出TOP_K个文档 + if len(splitted_docs) > result_len: + vs = memo_faiss_pool.new_vector_store() + vs.add_documents(splitted_docs) + splitted_docs = vs.similarity_search(text, k=result_len, score_threshold=1.0) + + docs = [{"snippet": x.page_content, + "link": x.metadata["link"], + "title": x.metadata["title"]} + for x in splitted_docs] + return docs + + SEARCH_ENGINES = {"bing": bing_search, "duckduckgo": duckduckgo_search, + "metaphor": metaphor_search, } diff --git a/server/knowledge_base/kb_cache/faiss_cache.py b/server/knowledge_base/kb_cache/faiss_cache.py index 801e4a6..c00ac4f 100644 --- a/server/knowledge_base/kb_cache/faiss_cache.py +++ b/server/knowledge_base/kb_cache/faiss_cache.py @@ -140,7 +140,7 @@ if __name__ == "__main__": ids = vs.add_texts([f"text added by {name}"], embeddings=embeddings) pprint(ids) elif r == 2: # search docs - docs = vs.similarity_search_with_score(f"{name}", top_k=3, score_threshold=1.0) + docs = vs.similarity_search_with_score(f"{name}", k=3, score_threshold=1.0) pprint(docs) if r == 3: # delete docs logger.warning(f"清除 {vs_name} by {name}") diff --git a/server/utils.py b/server/utils.py index 2f6dfc4..22185eb 100644 --- a/server/utils.py +++ b/server/utils.py @@ -568,6 +568,8 @@ def get_server_configs() -> Dict: 获取configs中的原始配置项,供前端使用 ''' from configs.kb_config import ( + DEFAULT_KNOWLEDGE_BASE, + DEFAULT_SEARCH_ENGINE, DEFAULT_VS_TYPE, CHUNK_SIZE, OVERLAP_SIZE, diff --git a/webui.py b/webui.py index 776d5e6..85d6cb4 100644 --- a/webui.py +++ b/webui.py @@ -21,13 +21,6 @@ if __name__ == "__main__": } ) - if not chat_box.chat_inited: - running_models = api.list_running_models() - st.toast( - f"欢迎使用 [`Langchain-Chatchat`](https://github.com/chatchat-space/Langchain-Chatchat) ! \n\n" - f"当前运行中的模型`{running_models}`, 您可以开始提问了." - ) - pages = { "对话": { "icon": "chat", diff --git a/webui_pages/dialogue/dialogue.py b/webui_pages/dialogue/dialogue.py index adc6029..2852250 100644 --- a/webui_pages/dialogue/dialogue.py +++ b/webui_pages/dialogue/dialogue.py @@ -3,9 +3,11 @@ from webui_pages.utils import * from streamlit_chatbox import * from datetime import datetime import os -from configs import LLM_MODEL, TEMPERATURE, HISTORY_LEN, PROMPT_TEMPLATES +from configs import (LLM_MODEL, TEMPERATURE, HISTORY_LEN, PROMPT_TEMPLATES, + DEFAULT_KNOWLEDGE_BASE, DEFAULT_SEARCH_ENGINE) from typing import List, Dict + chat_box = ChatBox( assistant_avatar=os.path.join( "img", @@ -55,7 +57,13 @@ def get_default_llm_model(api: ApiRequest) -> (str, bool): def dialogue_page(api: ApiRequest): - chat_box.init_session() + if not chat_box.chat_inited: + default_model = get_default_llm_model(api)[0] + st.toast( + f"欢迎使用 [`Langchain-Chatchat`](https://github.com/chatchat-space/Langchain-Chatchat) ! \n\n" + f"当前运行的模型`{default_model}`, 您可以开始提问了." + ) + chat_box.init_session() with st.sidebar: # TODO: 对话模型与会话绑定 @@ -156,9 +164,13 @@ def dialogue_page(api: ApiRequest): if dialogue_mode == "知识库问答": with st.expander("知识库配置", True): kb_list = api.list_knowledge_bases() + index = 0 + if DEFAULT_KNOWLEDGE_BASE in kb_list: + index = kb_list.index(DEFAULT_KNOWLEDGE_BASE) selected_kb = st.selectbox( "请选择知识库:", kb_list, + index=index, on_change=on_kb_change, key="selected_kb", ) @@ -167,11 +179,15 @@ def dialogue_page(api: ApiRequest): elif dialogue_mode == "搜索引擎问答": search_engine_list = api.list_search_engines() + if DEFAULT_SEARCH_ENGINE in search_engine_list: + index = search_engine_list.index(DEFAULT_SEARCH_ENGINE) + else: + index = search_engine_list.index("duckduckgo") if "duckduckgo" in search_engine_list else 0 with st.expander("搜索引擎配置", True): search_engine = st.selectbox( label="请选择搜索引擎", options=search_engine_list, - index=search_engine_list.index("duckduckgo") if "duckduckgo" in search_engine_list else 0, + index=index, ) se_top_k = st.number_input("匹配搜索结果条数:", 1, 20, SEARCH_ENGINE_TOP_K)