From 1b50547e60d6db322741770881c53f66dfb99a0f Mon Sep 17 00:00:00 2001 From: zR <2448370773@qq.com> Date: Sun, 22 Oct 2023 00:00:15 +0800 Subject: [PATCH] Dev (#1822) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 支持了agentlm * 支持了agentlm和相关提示词 * 修改了Agent的一些功能,加入了Embed方面的一个优化 --------- Co-authored-by: zR --- configs/kb_config.py.example | 3 + embeddings/add_embedding_keywords.py | 55 +++++++++++++++++++ embeddings/embedding_keywords.txt | 3 + server/agent/custom_template.py | 2 +- server/agent/tools/calculate.py | 8 +-- .../agent/tools/search_all_knowledge_more.py | 6 -- .../agent/tools/search_all_knowledge_once.py | 5 -- server/agent/tools/search_internet.py | 5 -- server/agent/tools/search_knowledge_simple.py | 7 --- server/agent/tools/translator.py | 5 -- server/agent/tools/weather.py | 12 +--- server/agent/tools_select.py | 4 +- server/chat/agent_chat.py | 16 +++++- webui_pages/dialogue/dialogue.py | 2 +- 14 files changed, 82 insertions(+), 51 deletions(-) create mode 100644 embeddings/add_embedding_keywords.py create mode 100644 embeddings/embedding_keywords.txt diff --git a/configs/kb_config.py.example b/configs/kb_config.py.example index 40165b9..e125c84 100644 --- a/configs/kb_config.py.example +++ b/configs/kb_config.py.example @@ -116,3 +116,6 @@ text_splitter_dict = { # TEXT_SPLITTER 名称 TEXT_SPLITTER_NAME = "ChineseRecursiveTextSplitter" + +## Embedding模型定制词语的词表文件 +EMBEDDING_KEYWORD_FILE = "embedding_keywords.txt" \ No newline at end of file diff --git a/embeddings/add_embedding_keywords.py b/embeddings/add_embedding_keywords.py new file mode 100644 index 0000000..b922a38 --- /dev/null +++ b/embeddings/add_embedding_keywords.py @@ -0,0 +1,55 @@ +''' +该功能是为了将关键词加入到embedding模型中,以便于在embedding模型中进行关键词的embedding +该功能的实现是通过修改embedding模型的tokenizer来实现的 +该功能仅仅对EMBEDDING_MODEL参数对应的的模型有效,输出后的模型保存在原本模型 +该功能的Idea由社区贡献,感谢@CharlesJu1 + +保存的模型的位置位于原本嵌入模型的目录下,模型的名称为原模型名称+Merge_Keywords_时间戳 +''' +import sys + +sys.path.append("..") +import os +from safetensors.torch import save_model +from sentence_transformers import SentenceTransformer +from datetime import datetime +from configs import ( + MODEL_PATH, + EMBEDDING_MODEL, + EMBEDDING_KEYWORD_FILE, +) + + +def add_keyword_to_model(model_name: str = EMBEDDING_MODEL, keyword_file: str = "", output_model_path: str = None): + key_words = [] + with open(keyword_file, "r") as f: + for line in f: + key_words.append(line.strip()) + + model = SentenceTransformer(model_name) + word_embedding_model = model._first_module() + tokenizer = word_embedding_model.tokenizer + tokenizer.add_tokens(key_words) + word_embedding_model.auto_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32) + + if output_model_path: + os.makedirs(output_model_path, exist_ok=True) + tokenizer.save_pretrained(output_model_path) + model.save(output_model_path) + safetensors_file = os.path.join(output_model_path, "model.safetensors") + metadata = {'format': 'pt'} + save_model(model, safetensors_file, metadata) + +def add_keyword_to_embedding_model(path: str = EMBEDDING_KEYWORD_FILE): + keyword_file = os.path.join(path) + model_name = MODEL_PATH["embed_model"][EMBEDDING_MODEL] + model_parent_directory = os.path.dirname(model_name) + current_time = datetime.now().strftime('%Y%m%d_%H%M%S') + output_model_name = "{}_Merge_Keywords_{}".format(EMBEDDING_MODEL, current_time) + output_model_path = os.path.join(model_parent_directory, output_model_name) + add_keyword_to_model(model_name, keyword_file, output_model_path) + print("save model to {}".format(output_model_path)) + + +if __name__ == '__main__': + add_keyword_to_embedding_model(EMBEDDING_KEYWORD_FILE) diff --git a/embeddings/embedding_keywords.txt b/embeddings/embedding_keywords.txt new file mode 100644 index 0000000..3822b99 --- /dev/null +++ b/embeddings/embedding_keywords.txt @@ -0,0 +1,3 @@ +Langchain-Chatchat +数据科学与大数据技术 +人工智能与先进计算 \ No newline at end of file diff --git a/server/agent/custom_template.py b/server/agent/custom_template.py index fdac6e2..a08ba8e 100644 --- a/server/agent/custom_template.py +++ b/server/agent/custom_template.py @@ -37,7 +37,7 @@ class CustomOutputParser(AgentOutputParser): def parse(self, llm_output: str) -> AgentFinish | tuple[dict[str, str], str] | AgentAction: # Check if agent should finish - support_agent = ["Azure-OpenAI", "OpenAI", "Anthropic", "Qwen", "qwen-api", "baichuan-api"] # 目前支持agent的模型 + support_agent = ["Azure-OpenAI", "OpenAI", "Anthropic", "Qwen", "qwen-api", "baichuan-api","agentlm"] # 目前支持agent的模型 if not any(agent in model_container.MODEL for agent in support_agent) and self.begin: self.begin = False stop_words = ["Observation:"] diff --git a/server/agent/tools/calculate.py b/server/agent/tools/calculate.py index 2d963ae..629331f 100644 --- a/server/agent/tools/calculate.py +++ b/server/agent/tools/calculate.py @@ -1,8 +1,3 @@ -## 单独运行的时候需要添加 -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) - from langchain.prompts import PromptTemplate from langchain.chains import LLMMathChain from server.agent import model_container @@ -19,7 +14,7 @@ ${{运行代码的输出}} ``` 答案: ${{答案}} -这是两个例子: +这是两个例子: 问题: 37593 * 67是多少? ```text @@ -56,6 +51,7 @@ ${{运行代码的输出}} 现在,这是我的问题: 问题: {question} """ + PROMPT = PromptTemplate( input_variables=["question"], template=_PROMPT_TEMPLATE, diff --git a/server/agent/tools/search_all_knowledge_more.py b/server/agent/tools/search_all_knowledge_more.py index fe70171..332695b 100644 --- a/server/agent/tools/search_all_knowledge_more.py +++ b/server/agent/tools/search_all_knowledge_more.py @@ -1,9 +1,3 @@ -## 单独运行的时候需要添加 -import sys -import os - -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) - import json import re import warnings diff --git a/server/agent/tools/search_all_knowledge_once.py b/server/agent/tools/search_all_knowledge_once.py index 7a10536..a29eadf 100644 --- a/server/agent/tools/search_all_knowledge_once.py +++ b/server/agent/tools/search_all_knowledge_once.py @@ -1,8 +1,3 @@ -## 单独运行的时候需要添加 -# import sys -# import os -# sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) - import re import warnings from typing import Dict diff --git a/server/agent/tools/search_internet.py b/server/agent/tools/search_internet.py index 6eec93e..de57d31 100644 --- a/server/agent/tools/search_internet.py +++ b/server/agent/tools/search_internet.py @@ -1,8 +1,3 @@ -## 单独运行的时候需要添加 -# import sys -# import os -# sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) - import json from server.chat import search_engine_chat from configs import VECTOR_SEARCH_TOP_K diff --git a/server/agent/tools/search_knowledge_simple.py b/server/agent/tools/search_knowledge_simple.py index 03f4da1..bad5ed5 100644 --- a/server/agent/tools/search_knowledge_simple.py +++ b/server/agent/tools/search_knowledge_simple.py @@ -1,10 +1,3 @@ -## 最简单的版本,只支持固定的知识库 - -# ## 单独运行的时候需要添加 -# import sys -# import os -# sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) - from server.chat.knowledge_base_chat import knowledge_base_chat from configs import VECTOR_SEARCH_TOP_K, SCORE_THRESHOLD import json diff --git a/server/agent/tools/translator.py b/server/agent/tools/translator.py index 62ffa33..78422fb 100644 --- a/server/agent/tools/translator.py +++ b/server/agent/tools/translator.py @@ -1,8 +1,3 @@ -## 单独运行的时候需要添加 -# import sys -# import os -# sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) - from langchain.prompts import PromptTemplate from langchain.chains import LLMChain from server.agent import model_container diff --git a/server/agent/tools/weather.py b/server/agent/tools/weather.py index d0dd58d..2dba79b 100644 --- a/server/agent/tools/weather.py +++ b/server/agent/tools/weather.py @@ -1,12 +1,4 @@ -## 使用和风天气API查询天气,这个模型仅仅对免费的API进行了适配 -## 这个模型的提示词非常复杂,我们推荐使用GPT4模型进行运行 -from __future__ import annotations - -## 单独运行的时候需要添加 -import sys -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) - +## 使用和风天气API查询天气,这个模型仅仅对免费的API进行了适配,建议使用GPT4等高级模型进行适配 import re import warnings from typing import Dict @@ -30,8 +22,6 @@ from server.agent import model_container KEY = "ac880e5a877042809ac7ffdd19d95b0d" #key长这样,这里提供了示例的key,这个key没法使用,你需要自己去注册和风天气的账号,然后在这里填入你的key - - _PROMPT_TEMPLATE = """ 用户会提出一个关于天气的问题,你的目标是拆分出用户问题中的区,市 并按照我提供的工具回答。 例如 用户提出的问题是: 上海浦东未来1小时天气情况? diff --git a/server/agent/tools_select.py b/server/agent/tools_select.py index 23c99a0..314a218 100644 --- a/server/agent/tools_select.py +++ b/server/agent/tools_select.py @@ -58,12 +58,12 @@ tools = [ ), Tool.from_function( func=knowledge_search_more, - name="Knowledge Base Query Tool", + name="Knowledge Base Tool", description="Prioritize accessing the knowledge base to get answers" ), Tool.from_function( func=search_internet, - name="Internet Query Tool", + name="Internet Tool", description="If you can't access the internet, this tool can help you access Bing to answer questions" ), ] diff --git a/server/chat/agent_chat.py b/server/chat/agent_chat.py index 5a71478..6d92a10 100644 --- a/server/chat/agent_chat.py +++ b/server/chat/agent_chat.py @@ -5,7 +5,7 @@ from langchain.agents import AgentExecutor, LLMSingleActionAgent from server.agent.custom_template import CustomOutputParser, CustomPromptTemplate from fastapi import Body from fastapi.responses import StreamingResponse -from configs import LLM_MODEL, TEMPERATURE, HISTORY_LEN +from configs import LLM_MODEL, TEMPERATURE, HISTORY_LEN,Agent_MODEL from server.utils import wrap_done, get_ChatOpenAI, get_prompt_template from langchain.chains import LLMChain from typing import AsyncIterable, Optional, Dict @@ -49,7 +49,19 @@ async def agent_chat(query: str = Body(..., description="用户输入", examples ## 传入全局变量来实现agent调用 kb_list = {x["kb_name"]: x for x in get_kb_details()} model_container.DATABASE = {name: details['kb_info'] for name, details in kb_list.items()} - model_container.MODEL = model + + + if Agent_MODEL: + ## 如果有指定使用Agent模型来完成任务 + model_agent = get_ChatOpenAI( + model_name=Agent_MODEL, + temperature=temperature, + max_tokens=max_tokens, + callbacks=[callback], + ) + model_container.MODEL = model_agent + else: + model_container.MODEL = model prompt_template = get_prompt_template("agent_chat", prompt_name) prompt_template_agent = CustomPromptTemplate( diff --git a/webui_pages/dialogue/dialogue.py b/webui_pages/dialogue/dialogue.py index b49c47d..619c27e 100644 --- a/webui_pages/dialogue/dialogue.py +++ b/webui_pages/dialogue/dialogue.py @@ -224,7 +224,7 @@ def dialogue_page(api: ApiRequest): ]) text = "" ans = "" - support_agent = ["Azure-OpenAI", "OpenAI", "Anthropic", "Qwen", "qwen-api", "baichuan-api"] # 目前支持agent的模型 + support_agent = ["Azure-OpenAI", "OpenAI", "Anthropic", "Qwen", "qwen-api", "baichuan-api","agentlm"] # 目前支持agent的模型 if not any(agent in llm_model for agent in support_agent): ans += "正在思考... \n\n 该模型并没有进行Agent对齐,请更换支持Agent的模型获得更好的体验!\n\n\n" chat_box.update_msg(ans, element_index=0, streaming=False)