diff --git a/agent/agent模式测试.ipynb b/agent/agent模式测试.ipynb index 239c3f7..c53876f 100644 --- a/agent/agent模式测试.ipynb +++ b/agent/agent模式测试.ipynb @@ -10,12 +10,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO 2023-06-01 20:26:48,576-1d: \n", + "INFO 2023-06-09 20:52:01,296-1d: \n", "loading model config\n", "llm device: cuda\n", "embedding device: cuda\n", "dir: /media/gpt4-pdf-chatbot-langchain/dev-langchain-ChatGLM\n", - "flagging username: 7daba79785044bceb6896b9e6f8f9894\n", + "flagging username: 35d96e513c5347dbb0c1d7c2fb21cbd4\n", "\n" ] } @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "68978c38-c0e9-4ae9-ba90-9c02aca335be", "metadata": {}, "outputs": [ @@ -50,7 +50,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Loading vicuna-7b-hf...\n" + "Loading vicuna-13b-hf...\n" ] }, { @@ -84,12 +84,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9b61d05e18044b009c72b862c84ab5cb", + "model_id": "9df1856e06d1460683851a0b73537a6d", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Loading checkpoint shards: 0%| | 0/2 [00:00 float:\n", + " \"\"\"Multiply the provided floats.\"\"\"\n", + " return a * b\n", + "\n", + "tool = StructuredTool.from_function(multiplier)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "e089a828-b662-4d9a-8d88-4bf95ccadbab", "metadata": {}, "outputs": [], + "source": [ + "from langchain import OpenAI\n", + "from langchain.agents import initialize_agent, AgentType\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d4ea7f0e-1ba9-4f40-82ec-7c453bd64945", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "# Structured tools are compatible with the STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION agent type. \n", + "agent_executor = initialize_agent([tool], llm, agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "640bfdfb-41e7-4429-9718-8fa724de12b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "__call:System: Respond to the human as helpfully and accurately as possible. You have access to the following tools:\n", + "\n", + "multiplier: multiplier(a: float, b: float) -> float - Multiply the provided floats., args: {{'a': {{'title': 'A', 'type': 'number'}}, 'b': {{'title': 'B', 'type': 'number'}}}}\n", + "\n", + "Use a json blob to specify a tool by providing an action key (tool name) and an action_input key (tool input).\n", + "\n", + "Valid \"action\" values: \"Final Answer\" or multiplier\n", + "\n", + "Provide only ONE action per $JSON_BLOB, as shown:\n", + "\n", + "```\n", + "{\n", + " \"action\": $TOOL_NAME,\n", + " \"action_input\": $INPUT\n", + "}\n", + "```\n", + "\n", + "Follow this format:\n", + "\n", + "Question: input question to answer\n", + "Thought: consider previous and subsequent steps\n", + "Action:\n", + "```\n", + "$JSON_BLOB\n", + "```\n", + "Observation: action result\n", + "... (repeat Thought/Action/Observation N times)\n", + "Thought: I know what to respond\n", + "Action:\n", + "```\n", + "{\n", + " \"action\": \"Final Answer\",\n", + " \"action_input\": \"Final response to human\"\n", + "}\n", + "```\n", + "\n", + "Begin! Reminder to ALWAYS respond with a valid json blob of a single action. Use tools if necessary. Respond directly if appropriate. Format is Action:```$JSON_BLOB```then Observation:.\n", + "Thought:\n", + "Human: What is 1 times 14\n", + "\n", + "\n", + "response:System: ```{\"action\":\"multiplier\",\"action_input\":{\"a\":1,\"b\":14}}``\n", + "\n", + "Observation:\n", + "\u001b[32;1m\u001b[1;3mSystem: ```{\"action\":\"multiplier\",\"action_input\":{\"a\":1,\"b\":14}}``\n", + "\n", + "Observation:\u001b[0m\n", + "Observation: \u001b[36;1m\u001b[1;3m14.0\u001b[0m\n", + "Thought:__call:System: Respond to the human as helpfully and accurately as possible. You have access to the following tools:\n", + "\n", + "multiplier: multiplier(a: float, b: float) -> float - Multiply the provided floats., args: {{'a': {{'title': 'A', 'type': 'number'}}, 'b': {{'title': 'B', 'type': 'number'}}}}\n", + "\n", + "Use a json blob to specify a tool by providing an action key (tool name) and an action_input key (tool input).\n", + "\n", + "Valid \"action\" values: \"Final Answer\" or multiplier\n", + "\n", + "Provide only ONE action per $JSON_BLOB, as shown:\n", + "\n", + "```\n", + "{\n", + " \"action\": $TOOL_NAME,\n", + " \"action_input\": $INPUT\n", + "}\n", + "```\n", + "\n", + "Follow this format:\n", + "\n", + "Question: input question to answer\n", + "Thought: consider previous and subsequent steps\n", + "Action:\n", + "```\n", + "$JSON_BLOB\n", + "```\n", + "Observation: action result\n", + "... (repeat Thought/Action/Observation N times)\n", + "Thought: I know what to respond\n", + "Action:\n", + "```\n", + "{\n", + " \"action\": \"Final Answer\",\n", + " \"action_input\": \"Final response to human\"\n", + "}\n", + "```\n", + "\n", + "Begin! Reminder to ALWAYS respond with a valid json blob of a single action. Use tools if necessary. Respond directly if appropriate. Format is Action:```$JSON_BLOB```then Observation:.\n", + "Thought:\n", + "Human: What is 1 times 14\n", + "\n", + "This was your previous work (but I haven't seen any of it! I only see what you return as final answer):\n", + "System: ```{\"action\":\"multiplier\",\"action_input\":{\"a\":1,\"b\":14}}``\n", + "\n", + "Observation:\n", + "Observation: 14.0\n", + "Thought:\n", + "response:\n", + "\u001b[32;1m\u001b[1;3m\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "''" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent_executor.run(\"What is 1 times 14\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9baa881f-5ff2-4958-b3a2-1653a5e8bc3b", + "metadata": {}, + "outputs": [], "source": [] } ], diff --git a/models/fastchat_api_llm.py b/models/fastchat_api_llm.py deleted file mode 100644 index 1fa678e..0000000 --- a/models/fastchat_api_llm.py +++ /dev/null @@ -1,502 +0,0 @@ -"""Wrapper around FastChat APIs.""" -from __future__ import annotations - -import logging -import sys -import warnings -from abc import ABC -from typing import ( - AbstractSet, - Any, - Callable, - Collection, - Dict, - Generator, - List, - Literal, - Mapping, - Optional, - Set, - Tuple, - Union, -) - -from pydantic import Extra, Field, root_validator -from tenacity import ( - before_sleep_log, - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - -from langchain.llms.base import BaseLLM -from langchain.schema import Generation, LLMResult -from langchain.utils import get_from_dict_or_env -from models.base import (RemoteRpcModel, - AnswerResult) -from models.loader import LoaderCheckPoint -import requests -import json - -logger = logging.getLogger(__name__) - - -def _streaming_response_template() -> Dict[str, Any]: - """ - :return: 响应结构 - """ - return { - "text": "", - "error_code": 0, - } - - -def _update_response(response: Dict[str, Any], stream_response: Dict[str, Any]) -> None: - """Update response from the stream response.""" - response["text"] += stream_response["text"] - response["error_code"] += stream_response["error_code"] - - -class BaseFastChat(BaseLLM): - """Wrapper around FastChat large language models.""" - - api_base_url: str = "http://localhost:21002/worker_generate_stream" - model_name: str = "text-davinci-003" - """Model name to use.""" - temperature: float = 0.7 - """What sampling temperature to use.""" - max_new_tokens: int = 200 - stop: int = 20 - batch_size: int = 20 - """Maximum number of retries to make when generating.""" - streaming: bool = False - """Penalizes repeated tokens.""" - n: int = 1 - """Whether to stream the results or not.""" - allowed_special: Union[Literal["all"], AbstractSet[str]] = set() - """Set of special tokens that are allowed。""" - disallowed_special: Union[Literal["all"], Collection[str]] = "all" - """Set of special tokens that are not allowed。""" - - class Config: - """Configuration for this pydantic object.""" - - extra = Extra.ignore - - @root_validator(pre=True) - def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]: - """Build extra kwargs from additional params that were passed in.""" - all_required_field_names = {field.alias for field in cls.__fields__.values()} - - extra = values.get("model_kwargs", {}) - for field_name in list(values): - if field_name not in all_required_field_names: - if field_name in extra: - raise ValueError(f"Found {field_name} supplied twice.") - logger.warning( - f"""WARNING! {field_name} is not default parameter. - {field_name} was transfered to model_kwargs. - Please confirm that {field_name} is what you intended.""" - ) - extra[field_name] = values.pop(field_name) - values["model_kwargs"] = extra - return values - - @property - def _default_params(self) -> Dict[str, Any]: - """Get the default parameters for calling FastChat API.""" - normal_params = { - "model": self.model_name, - "prompt": '', - "max_new_tokens": self.max_new_tokens, - "temperature": self.temperature, - } - - return {**normal_params} - - def _generate( - self, prompts: List[str], stop: Optional[List[str]] = None - ) -> LLMResult: - """Call out to FastChat's endpoint with k unique prompts. - - Args: - prompts: The prompts to pass into the model. - stop: Optional list of stop words to use when generating. - - Returns: - The full LLM output. - - Example: - .. code-block:: python - - response = fastchat.generate(["Tell me a joke."]) - """ - # TODO: write a unit test for this - params = self._invocation_params - sub_prompts = self.get_sub_prompts(params, prompts) - choices = [] - token_usage: Dict[str, int] = {} - headers = {"User-Agent": "fastchat Client"} - for _prompts in sub_prompts: - - params["prompt"] = _prompts[0] - - if stop is not None: - if "stop" in params: - raise ValueError("`stop` found in both the input and default params.") - params["stop"] = stop - - if self.streaming: - if len(_prompts) > 1: - raise ValueError("Cannot stream results with multiple prompts.") - - response_template = _streaming_response_template() - response = requests.post( - self.api_base_url, - headers=headers, - json=params, - stream=True, - ) - for stream_resp in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b"\0" - ): - if stream_resp: - data = json.loads(stream_resp.decode("utf-8")) - skip_echo_len = len(_prompts[0]) - output = data["text"][skip_echo_len:].strip() - data["text"] = output - self.callback_manager.on_llm_new_token( - output, - verbose=self.verbose, - logprobs=data["error_code"], - ) - _update_response(response_template, data) - choices.append(response_template) - else: - response_template = _streaming_response_template() - response = requests.post( - self.api_base_url, - headers=headers, - json=params, - stream=True, - ) - for stream_resp in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b"\0" - ): - if stream_resp: - data = json.loads(stream_resp.decode("utf-8")) - skip_echo_len = len(_prompts[0]) - output = data["text"][skip_echo_len:].strip() - data["text"] = output - _update_response(response_template, data) - - choices.append(response_template) - - return self.create_llm_result(choices, prompts, token_usage) - - async def _agenerate( - self, prompts: List[str], stop: Optional[List[str]] = None - ) -> LLMResult: - """Call out to FastChat's endpoint async with k unique prompts.""" - params = self._invocation_params - sub_prompts = self.get_sub_prompts(params, prompts) - choices = [] - token_usage: Dict[str, int] = {} - - headers = {"User-Agent": "fastchat Client"} - for _prompts in sub_prompts: - - params["prompt"] = _prompts[0] - if stop is not None: - if "stop" in params: - raise ValueError("`stop` found in both the input and default params.") - params["stop"] = stop - - if self.streaming: - if len(_prompts) > 1: - raise ValueError("Cannot stream results with multiple prompts.") - - response_template = _streaming_response_template() - response = requests.post( - self.api_base_url, - headers=headers, - json=params, - stream=True, - ) - for stream_resp in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b"\0" - ): - if stream_resp: - data = json.loads(stream_resp.decode("utf-8")) - skip_echo_len = len(_prompts[0]) - output = data["text"][skip_echo_len:].strip() - data["text"] = output - self.callback_manager.on_llm_new_token( - output, - verbose=self.verbose, - logprobs=data["error_code"], - ) - _update_response(response_template, data) - choices.append(response_template) - else: - response_template = _streaming_response_template() - response = requests.post( - self.api_base_url, - headers=headers, - json=params, - stream=True, - ) - for stream_resp in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b"\0" - ): - if stream_resp: - data = json.loads(stream_resp.decode("utf-8")) - skip_echo_len = len(_prompts[0]) - output = data["text"][skip_echo_len:].strip() - data["text"] = output - _update_response(response_template, data) - - choices.append(response_template) - - return self.create_llm_result(choices, prompts, token_usage) - - def get_sub_prompts( - self, - params: Dict[str, Any], - prompts: List[str], - ) -> List[List[str]]: - """Get the sub prompts for llm call.""" - if params["max_new_tokens"] == -1: - if len(prompts) != 1: - raise ValueError( - "max_new_tokens set to -1 not supported for multiple inputs." - ) - params["max_new_tokens"] = self.max_new_tokens_for_prompt(prompts[0]) - # append pload - sub_prompts = [ - prompts[i: i + self.batch_size] - for i in range(0, len(prompts), self.batch_size) - ] - - return sub_prompts - - def create_llm_result( - self, choices: Any, prompts: List[str], token_usage: Dict[str, int] - ) -> LLMResult: - """Create the LLMResult from the choices and prompts.""" - generations = [] - for i, _ in enumerate(prompts): - sub_choices = choices[i * self.n: (i + 1) * self.n] - generations.append( - [ - Generation( - text=choice["text"], - generation_info=dict( - finish_reason='over', - logprobs=choice["text"], - ), - ) - for choice in sub_choices - ] - ) - llm_output = {"token_usage": token_usage, "model_name": self.model_name} - return LLMResult(generations=generations, llm_output=llm_output) - - def stream(self, prompt: str, stop: Optional[List[str]] = None) -> Generator: - """Call FastChat with streaming flag and return the resulting generator. - - BETA: this is a beta feature while we figure out the right abstraction. - Once that happens, this interface could change. - - Args: - prompt: The prompts to pass into the model. - stop: Optional list of stop words to use when generating. - - Returns: - A generator representing the stream of tokens from OpenAI. - - Example: - .. code-block:: python - - generator = fastChat.stream("Tell me a joke.") - for token in generator: - yield token - """ - params = self._invocation_params - params["prompt"] = prompt - if stop is not None: - if "stop" in params: - raise ValueError("`stop` found in both the input and default params.") - params["stop"] = stop - - headers = {"User-Agent": "fastchat Client"} - response = requests.post( - self.api_base_url, - headers=headers, - json=params, - stream=True, - ) - for stream_resp in response.iter_lines( - chunk_size=8192, decode_unicode=False, delimiter=b"\0" - ): - if stream_resp: - data = json.loads(stream_resp.decode("utf-8")) - skip_echo_len = len(prompt) - output = data["text"][skip_echo_len:].strip() - data["text"] = output - yield data - - @property - def _invocation_params(self) -> Dict[str, Any]: - """Get the parameters used to invoke the model.""" - return self._default_params - - @property - def _identifying_params(self) -> Mapping[str, Any]: - """Get the identifying parameters.""" - return {**{"model_name": self.model_name}, **self._default_params} - - @property - def _llm_type(self) -> str: - """Return type of llm.""" - return "fastChat" - - def get_num_tokens(self, text: str) -> int: - """Calculate num tokens with tiktoken package.""" - # tiktoken NOT supported for Python < 3.8 - if sys.version_info[1] < 8: - return super().get_num_tokens(text) - try: - import tiktoken - except ImportError: - raise ValueError( - "Could not import tiktoken python package. " - "This is needed in order to calculate get_num_tokens. " - "Please install it with `pip install tiktoken`." - ) - - enc = tiktoken.encoding_for_model(self.model_name) - - tokenized_text = enc.encode( - text, - allowed_special=self.allowed_special, - disallowed_special=self.disallowed_special, - ) - - # calculate the number of tokens in the encoded text - return len(tokenized_text) - - def modelname_to_contextsize(self, modelname: str) -> int: - """Calculate the maximum number of tokens possible to generate for a model. - - Args: - modelname: The modelname we want to know the context size for. - - Returns: - The maximum context size - - Example: - .. code-block:: python - - max_new_tokens = openai.modelname_to_contextsize("text-davinci-003") - """ - model_token_mapping = { - "vicuna-13b": 2049, - "koala": 2049, - "dolly-v2": 2049, - "oasst": 2049, - "stablelm": 2049, - } - - context_size = model_token_mapping.get(modelname, None) - - if context_size is None: - raise ValueError( - f"Unknown model: {modelname}. Please provide a valid OpenAI model name." - "Known models are: " + ", ".join(model_token_mapping.keys()) - ) - - return context_size - - def max_new_tokens_for_prompt(self, prompt: str) -> int: - """Calculate the maximum number of tokens possible to generate for a prompt. - - Args: - prompt: The prompt to pass into the model. - - Returns: - The maximum number of tokens to generate for a prompt. - - Example: - .. code-block:: python - - max_new_tokens = openai.max_token_for_prompt("Tell me a joke.") - """ - num_tokens = self.get_num_tokens(prompt) - - # get max context size for model by name - max_size = self.modelname_to_contextsize(self.model_name) - return max_size - num_tokens - - -class FastChatAPILLM(RemoteRpcModel, BaseFastChat, ABC): - """Wrapper around FastChat large language models. - - Example: - .. code-block:: python - - openai = FastChat(model_name="vicuna") - """ - checkPoint: LoaderCheckPoint = None - - history_len: int = 10 - - def __init__(self, checkPoint: LoaderCheckPoint = None): - super().__init__() - self.checkPoint = checkPoint - - @property - def _invocation_params(self) -> Dict[str, Any]: - return {**{"model": self.model_name}, **super()._invocation_params} - - @property - def _check_point(self) -> LoaderCheckPoint: - return self.checkPoint - - @property - def _history_len(self) -> int: - return self.history_len - - def set_history_len(self, history_len: int = 10) -> None: - self.history_len = history_len - - @property - def _api_key(self) -> str: - pass - - @property - def _api_base_url(self) -> str: - return self.api_base_url - - def set_api_key(self, api_key: str): - pass - - def set_api_base_url(self, api_base_url: str): - self.api_base_url = api_base_url - - def call_model_name(self, model_name): - self.model_name = model_name - - def generatorAnswer(self, prompt: str, - history: List[List[str]] = [], - streaming: bool = False): - generator = self.stream("Tell me a joke.") - for token in generator: - yield token - - history += [[prompt, token["text"]]] - answer_result = AnswerResult() - answer_result.history = history - answer_result.llm_output = {"answer": token["text"]} - yield answer_result diff --git a/models/llama_llm.py b/models/llama_llm.py index 400f704..1b0f403 100644 --- a/models/llama_llm.py +++ b/models/llama_llm.py @@ -22,7 +22,7 @@ class InvalidScoreLogitsProcessor(LogitsProcessor): class LLamaLLM(BaseAnswer, LLM, ABC): checkPoint: LoaderCheckPoint = None - history = [] + # history = [] history_len: int = 3 max_new_tokens: int = 500 num_beams: int = 1 @@ -88,9 +88,16 @@ class LLamaLLM(BaseAnswer, LLM, ABC): return reply # 将历史对话数组转换为文本格式 - def history_to_text(self, query): + def history_to_text(self, query, history): + """ + 历史对话软提示 + 这段代码首先定义了一个名为 history_to_text 的函数,用于将 self.history + 数组转换为所需的文本格式。然后,我们将格式化后的历史文本 + 再用 self.encode 将其转换为向量表示。最后,将历史对话向量与当前输入的对话向量拼接在一起。 + :return: + """ formatted_history = '' - history = self.history[-self.history_len:] if self.history_len > 0 else [] + history = history[-self.history_len:] if self.history_len > 0 else [] for i, (old_query, response) in enumerate(history): formatted_history += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response) formatted_history += "[Round {}]\n问:{}\n答:".format(len(history), query) @@ -116,20 +123,6 @@ class LLamaLLM(BaseAnswer, LLM, ABC): return input_ids, position_ids, attention_mask - def generate_softprompt_history_tensors(self, query): - """ - 历史对话软提示 - 这段代码首先定义了一个名为 history_to_text 的函数,用于将 self.history - 数组转换为所需的文本格式。然后,我们将格式化后的历史文本 - 再用 self.encode 将其转换为向量表示。最后,将历史对话向量与当前输入的对话向量拼接在一起。 - :return: - """ - - # 对话内容 - # 处理历史对话 - formatted_history = self.history_to_text(query) - return formatted_history - @property def _history_len(self) -> int: return self.history_len @@ -173,18 +166,18 @@ class LLamaLLM(BaseAnswer, LLM, ABC): new_tokens = len(output_ids[0]) - len(input_ids[0]) reply = self.decode(output_ids[0][-new_tokens:]) print(f"response:{reply}") - self.history = self.history + [[None, reply]] + print(f"+++++++++++++++++++++++++++++++++++") return reply def generatorAnswer(self, prompt: str, history: List[List[str]] = [], streaming: bool = False): - if history: - self.history = history + # TODO 需要实现chat对话模块和注意力模型,目前_call为langchain的LLM拓展的api,默认为无提示词模式,如果需要操作注意力模型,可以参考chat_glm的实现 - softprompt = self.generate_softprompt_history_tensors(prompt) + softprompt = self.history_to_text(prompt,history=history) response = self._call(prompt=softprompt, stop=['\n###']) + answer_result = AnswerResult() - answer_result.history = self.history + answer_result.history = history + [[None, response]] answer_result.llm_output = {"answer": response} yield answer_result