llm
This commit is contained in:
parent
218aca2e20
commit
b352c29d46
|
|
@ -10,12 +10,12 @@
|
||||||
"name": "stderr",
|
"name": "stderr",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"INFO 2023-06-01 20:26:48,576-1d: \n",
|
"INFO 2023-06-09 20:52:01,296-1d: \n",
|
||||||
"loading model config\n",
|
"loading model config\n",
|
||||||
"llm device: cuda\n",
|
"llm device: cuda\n",
|
||||||
"embedding device: cuda\n",
|
"embedding device: cuda\n",
|
||||||
"dir: /media/gpt4-pdf-chatbot-langchain/dev-langchain-ChatGLM\n",
|
"dir: /media/gpt4-pdf-chatbot-langchain/dev-langchain-ChatGLM\n",
|
||||||
"flagging username: 7daba79785044bceb6896b9e6f8f9894\n",
|
"flagging username: 35d96e513c5347dbb0c1d7c2fb21cbd4\n",
|
||||||
"\n"
|
"\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
@ -42,7 +42,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 2,
|
||||||
"id": "68978c38-c0e9-4ae9-ba90-9c02aca335be",
|
"id": "68978c38-c0e9-4ae9-ba90-9c02aca335be",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
|
@ -50,7 +50,7 @@
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Loading vicuna-7b-hf...\n"
|
"Loading vicuna-13b-hf...\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -84,12 +84,12 @@
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
"model_id": "9b61d05e18044b009c72b862c84ab5cb",
|
"model_id": "9df1856e06d1460683851a0b73537a6d",
|
||||||
"version_major": 2,
|
"version_major": 2,
|
||||||
"version_minor": 0
|
"version_minor": 0
|
||||||
},
|
},
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
|
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
|
@ -99,7 +99,7 @@
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Loaded the model in 6.39 seconds.\n"
|
"Loaded the model in 11.22 seconds.\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -110,19 +110,18 @@
|
||||||
"from langchain.agents import initialize_agent, Tool\n",
|
"from langchain.agents import initialize_agent, Tool\n",
|
||||||
"from langchain.agents import AgentType\n",
|
"from langchain.agents import AgentType\n",
|
||||||
" \n",
|
" \n",
|
||||||
"args = parser.parse_args(args=['--model-dir', '/media/checkpoint/', '--model', 'vicuna-7b-hf', '--no-remote-model', '--load-in-8bit'])\n",
|
"args = parser.parse_args(args=['--model', 'vicuna-13b-hf', '--no-remote-model', '--load-in-8bit'])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"args_dict = vars(args)\n",
|
"args_dict = vars(args)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"shared.loaderCheckPoint = LoaderCheckPoint(args_dict)\n",
|
"shared.loaderCheckPoint = LoaderCheckPoint(args_dict)\n",
|
||||||
"torch.cuda.empty_cache()\n",
|
"torch.cuda.empty_cache()\n",
|
||||||
"shared.loaderCheckPoint.unload_model()\n",
|
"llm=shared.loaderLLM() \n"
|
||||||
"shared.loaderCheckPoint.reload_model() \n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 3,
|
||||||
"id": "c8e4a58d-1a3a-484a-8417-bcec0eb7170e",
|
"id": "c8e4a58d-1a3a-484a-8417-bcec0eb7170e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
|
@ -130,7 +129,7 @@
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"{'action': 'State of Dialogue History System', 'action_input': '露ᥫᩣ,'}\n"
|
"{'action': 'summary', 'action_input': '露ᥫᩣ,'}\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -188,10 +187,178 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 4,
|
||||||
|
"id": "a55f92ce-4ebf-4cb3-8e16-780c14b6517f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.tools import StructuredTool\n",
|
||||||
|
"\n",
|
||||||
|
"def multiplier(a: float, b: float) -> float:\n",
|
||||||
|
" \"\"\"Multiply the provided floats.\"\"\"\n",
|
||||||
|
" return a * b\n",
|
||||||
|
"\n",
|
||||||
|
"tool = StructuredTool.from_function(multiplier)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
"id": "e089a828-b662-4d9a-8d88-4bf95ccadbab",
|
"id": "e089a828-b662-4d9a-8d88-4bf95ccadbab",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain import OpenAI\n",
|
||||||
|
"from langchain.agents import initialize_agent, AgentType\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "d4ea7f0e-1ba9-4f40-82ec-7c453bd64945",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Structured tools are compatible with the STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION agent type. \n",
|
||||||
|
"agent_executor = initialize_agent([tool], llm, agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"id": "640bfdfb-41e7-4429-9718-8fa724de12b7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||||
|
"__call:System: Respond to the human as helpfully and accurately as possible. You have access to the following tools:\n",
|
||||||
|
"\n",
|
||||||
|
"multiplier: multiplier(a: float, b: float) -> float - Multiply the provided floats., args: {{'a': {{'title': 'A', 'type': 'number'}}, 'b': {{'title': 'B', 'type': 'number'}}}}\n",
|
||||||
|
"\n",
|
||||||
|
"Use a json blob to specify a tool by providing an action key (tool name) and an action_input key (tool input).\n",
|
||||||
|
"\n",
|
||||||
|
"Valid \"action\" values: \"Final Answer\" or multiplier\n",
|
||||||
|
"\n",
|
||||||
|
"Provide only ONE action per $JSON_BLOB, as shown:\n",
|
||||||
|
"\n",
|
||||||
|
"```\n",
|
||||||
|
"{\n",
|
||||||
|
" \"action\": $TOOL_NAME,\n",
|
||||||
|
" \"action_input\": $INPUT\n",
|
||||||
|
"}\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"Follow this format:\n",
|
||||||
|
"\n",
|
||||||
|
"Question: input question to answer\n",
|
||||||
|
"Thought: consider previous and subsequent steps\n",
|
||||||
|
"Action:\n",
|
||||||
|
"```\n",
|
||||||
|
"$JSON_BLOB\n",
|
||||||
|
"```\n",
|
||||||
|
"Observation: action result\n",
|
||||||
|
"... (repeat Thought/Action/Observation N times)\n",
|
||||||
|
"Thought: I know what to respond\n",
|
||||||
|
"Action:\n",
|
||||||
|
"```\n",
|
||||||
|
"{\n",
|
||||||
|
" \"action\": \"Final Answer\",\n",
|
||||||
|
" \"action_input\": \"Final response to human\"\n",
|
||||||
|
"}\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"Begin! Reminder to ALWAYS respond with a valid json blob of a single action. Use tools if necessary. Respond directly if appropriate. Format is Action:```$JSON_BLOB```then Observation:.\n",
|
||||||
|
"Thought:\n",
|
||||||
|
"Human: What is 1 times 14\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"response:System: ```{\"action\":\"multiplier\",\"action_input\":{\"a\":1,\"b\":14}}``\n",
|
||||||
|
"\n",
|
||||||
|
"Observation:\n",
|
||||||
|
"\u001b[32;1m\u001b[1;3mSystem: ```{\"action\":\"multiplier\",\"action_input\":{\"a\":1,\"b\":14}}``\n",
|
||||||
|
"\n",
|
||||||
|
"Observation:\u001b[0m\n",
|
||||||
|
"Observation: \u001b[36;1m\u001b[1;3m14.0\u001b[0m\n",
|
||||||
|
"Thought:__call:System: Respond to the human as helpfully and accurately as possible. You have access to the following tools:\n",
|
||||||
|
"\n",
|
||||||
|
"multiplier: multiplier(a: float, b: float) -> float - Multiply the provided floats., args: {{'a': {{'title': 'A', 'type': 'number'}}, 'b': {{'title': 'B', 'type': 'number'}}}}\n",
|
||||||
|
"\n",
|
||||||
|
"Use a json blob to specify a tool by providing an action key (tool name) and an action_input key (tool input).\n",
|
||||||
|
"\n",
|
||||||
|
"Valid \"action\" values: \"Final Answer\" or multiplier\n",
|
||||||
|
"\n",
|
||||||
|
"Provide only ONE action per $JSON_BLOB, as shown:\n",
|
||||||
|
"\n",
|
||||||
|
"```\n",
|
||||||
|
"{\n",
|
||||||
|
" \"action\": $TOOL_NAME,\n",
|
||||||
|
" \"action_input\": $INPUT\n",
|
||||||
|
"}\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"Follow this format:\n",
|
||||||
|
"\n",
|
||||||
|
"Question: input question to answer\n",
|
||||||
|
"Thought: consider previous and subsequent steps\n",
|
||||||
|
"Action:\n",
|
||||||
|
"```\n",
|
||||||
|
"$JSON_BLOB\n",
|
||||||
|
"```\n",
|
||||||
|
"Observation: action result\n",
|
||||||
|
"... (repeat Thought/Action/Observation N times)\n",
|
||||||
|
"Thought: I know what to respond\n",
|
||||||
|
"Action:\n",
|
||||||
|
"```\n",
|
||||||
|
"{\n",
|
||||||
|
" \"action\": \"Final Answer\",\n",
|
||||||
|
" \"action_input\": \"Final response to human\"\n",
|
||||||
|
"}\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"Begin! Reminder to ALWAYS respond with a valid json blob of a single action. Use tools if necessary. Respond directly if appropriate. Format is Action:```$JSON_BLOB```then Observation:.\n",
|
||||||
|
"Thought:\n",
|
||||||
|
"Human: What is 1 times 14\n",
|
||||||
|
"\n",
|
||||||
|
"This was your previous work (but I haven't seen any of it! I only see what you return as final answer):\n",
|
||||||
|
"System: ```{\"action\":\"multiplier\",\"action_input\":{\"a\":1,\"b\":14}}``\n",
|
||||||
|
"\n",
|
||||||
|
"Observation:\n",
|
||||||
|
"Observation: 14.0\n",
|
||||||
|
"Thought:\n",
|
||||||
|
"response:\n",
|
||||||
|
"\u001b[32;1m\u001b[1;3m\u001b[0m\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"''"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"agent_executor.run(\"What is 1 times 14\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "9baa881f-5ff2-4958-b3a2-1653a5e8bc3b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": []
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -1,502 +0,0 @@
|
||||||
"""Wrapper around FastChat APIs."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import sys
|
|
||||||
import warnings
|
|
||||||
from abc import ABC
|
|
||||||
from typing import (
|
|
||||||
AbstractSet,
|
|
||||||
Any,
|
|
||||||
Callable,
|
|
||||||
Collection,
|
|
||||||
Dict,
|
|
||||||
Generator,
|
|
||||||
List,
|
|
||||||
Literal,
|
|
||||||
Mapping,
|
|
||||||
Optional,
|
|
||||||
Set,
|
|
||||||
Tuple,
|
|
||||||
Union,
|
|
||||||
)
|
|
||||||
|
|
||||||
from pydantic import Extra, Field, root_validator
|
|
||||||
from tenacity import (
|
|
||||||
before_sleep_log,
|
|
||||||
retry,
|
|
||||||
retry_if_exception_type,
|
|
||||||
stop_after_attempt,
|
|
||||||
wait_exponential,
|
|
||||||
)
|
|
||||||
|
|
||||||
from langchain.llms.base import BaseLLM
|
|
||||||
from langchain.schema import Generation, LLMResult
|
|
||||||
from langchain.utils import get_from_dict_or_env
|
|
||||||
from models.base import (RemoteRpcModel,
|
|
||||||
AnswerResult)
|
|
||||||
from models.loader import LoaderCheckPoint
|
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def _streaming_response_template() -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
:return: 响应结构
|
|
||||||
"""
|
|
||||||
return {
|
|
||||||
"text": "",
|
|
||||||
"error_code": 0,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _update_response(response: Dict[str, Any], stream_response: Dict[str, Any]) -> None:
|
|
||||||
"""Update response from the stream response."""
|
|
||||||
response["text"] += stream_response["text"]
|
|
||||||
response["error_code"] += stream_response["error_code"]
|
|
||||||
|
|
||||||
|
|
||||||
class BaseFastChat(BaseLLM):
    """Wrapper around FastChat large language models.

    Prompts are POSTed as JSON to ``api_base_url`` and the reply is read as a
    stream of NUL-delimited JSON chunks, each with ``text`` and ``error_code``
    keys.
    """

    # Full URL of the endpoint every request is POSTed to.
    api_base_url: str = "http://localhost:21002/worker_generate_stream"
    # Model name sent as ``model``; also used for token counting.
    # NOTE(review): this default is not a key of the table in
    # ``modelname_to_contextsize``.
    model_name: str = "text-davinci-003"
    # Sampling temperature sent with every request.
    temperature: float = 0.7
    # Generation budget sent with every request; -1 asks ``get_sub_prompts``
    # to derive it from the prompt length (single prompt only).
    max_new_tokens: int = 200
    # NOTE(review): not read by any method of this class; kept so existing
    # constructor calls keep validating.
    stop: int = 20
    # Number of prompts grouped into one batch by ``get_sub_prompts``.
    batch_size: int = 20
    # Whether each received chunk is reported to the callback manager.
    streaming: bool = False
    # Number of choices ``create_llm_result`` assigns to each prompt.
    # NOTE(review): ``n`` is never sent to the endpoint and one choice is
    # collected per prompt, so values other than 1 misalign the result.
    n: int = 1
    # Special tokens tiktoken may encode (see ``get_num_tokens``).
    allowed_special: Union[Literal["all"], AbstractSet[str]] = set()
    # Special tokens tiktoken must reject (see ``get_num_tokens``).
    disallowed_special: Union[Literal["all"], Collection[str]] = "all"

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.ignore

    @root_validator(pre=True)
    def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Move constructor arguments that are not declared fields into ``model_kwargs``.

        Raises:
            ValueError: if an undeclared argument is also present inside the
                supplied ``model_kwargs``.
        """
        all_required_field_names = {field.alias for field in cls.__fields__.values()}

        extra = values.get("model_kwargs", {})
        # Iterate over a snapshot of the keys because entries are popped below.
        for field_name in list(values):
            if field_name not in all_required_field_names:
                if field_name in extra:
                    raise ValueError(f"Found {field_name} supplied twice.")
                logger.warning(
                    f"""WARNING! {field_name} is not default parameter.
                    {field_name} was transfered to model_kwargs.
                    Please confirm that {field_name} is what you intended."""
                )
                extra[field_name] = values.pop(field_name)
        values["model_kwargs"] = extra
        return values

    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters for calling FastChat API.

        A new dict is built on every access, so callers may mutate it freely.
        """
        return {
            "model": self.model_name,
            "prompt": '',
            "max_new_tokens": self.max_new_tokens,
            "temperature": self.temperature,
        }

    def _request_params(self, stop: Optional[List[str]]) -> Dict[str, Any]:
        """Return fresh invocation params with the caller's ``stop`` merged in.

        Raises:
            ValueError: if ``stop`` is given and the params already define one.
        """
        params = self._invocation_params
        if stop is not None:
            if "stop" in params:
                raise ValueError("`stop` found in both the input and default params.")
            params["stop"] = stop
        return params

    def _iter_chunks(
        self, params: Dict[str, Any], prompt: str
    ) -> Generator[Dict[str, Any], None, None]:
        """POST one prompt and yield every decoded chunk of the reply.

        The first ``len(prompt)`` characters of each chunk's ``text`` are
        dropped and the remainder is stripped, exactly as the three public
        entry points used to do inline.
        """
        payload = {**params, "prompt": prompt}
        headers = {"User-Agent": "fastchat Client"}
        skip_echo_len = len(prompt)
        # The context manager releases the connection even when the consumer
        # stops iterating early or a chunk fails to decode.
        with requests.post(
            self.api_base_url,
            headers=headers,
            json=payload,
            stream=True,
        ) as response:
            for raw_chunk in response.iter_lines(
                chunk_size=8192, decode_unicode=False, delimiter=b"\0"
            ):
                if not raw_chunk:
                    continue
                data = json.loads(raw_chunk.decode("utf-8"))
                data["text"] = data["text"][skip_echo_len:].strip()
                yield data

    def _generate(
        self, prompts: List[str], stop: Optional[List[str]] = None
    ) -> LLMResult:
        """Call out to FastChat's endpoint with k unique prompts.

        Args:
            prompts: The prompts to pass into the model.
            stop: Optional list of stop words to use when generating.

        Returns:
            The full LLM output.

        Raises:
            ValueError: if ``stop`` is defined twice, or if streaming is
                enabled and a batch holds more than one prompt.

        Example:
            .. code-block:: python

                response = fastchat.generate(["Tell me a joke."])
        """
        # TODO: write a unit test for this
        # ``stop`` is merged once, up front; doing it per batch made the
        # second batch trip over the value stored by the first one.
        params = self._request_params(stop)
        sub_prompts = self.get_sub_prompts(params, prompts)
        choices = []
        token_usage: Dict[str, int] = {}
        for _prompts in sub_prompts:
            if self.streaming and len(_prompts) > 1:
                raise ValueError("Cannot stream results with multiple prompts.")

            # Every prompt gets its own request so that the result holds one
            # choice per prompt, not one per batch.
            for prompt in _prompts:
                response_template = _streaming_response_template()
                for data in self._iter_chunks(params, prompt):
                    if self.streaming:
                        self.callback_manager.on_llm_new_token(
                            data["text"],
                            verbose=self.verbose,
                            logprobs=data["error_code"],
                        )
                    _update_response(response_template, data)
                choices.append(response_template)

        return self.create_llm_result(choices, prompts, token_usage)

    async def _agenerate(
        self, prompts: List[str], stop: Optional[List[str]] = None
    ) -> LLMResult:
        """Call out to FastChat's endpoint async with k unique prompts.

        NOTE(review): the HTTP request is made with the blocking ``requests``
        client, so this coroutine does not yield to the event loop while
        waiting. It delegates to ``_generate`` so both paths stay identical.
        """
        return self._generate(prompts, stop=stop)

    def get_sub_prompts(
        self,
        params: Dict[str, Any],
        prompts: List[str],
    ) -> List[List[str]]:
        """Get the sub prompts for llm call.

        Splits ``prompts`` into consecutive batches of at most ``batch_size``.
        When ``params["max_new_tokens"]`` is -1 it is replaced, in place, by
        the budget computed for the single prompt.

        Raises:
            ValueError: if ``max_new_tokens`` is -1 and more than one prompt
                is supplied.
        """
        if params["max_new_tokens"] == -1:
            if len(prompts) != 1:
                raise ValueError(
                    "max_new_tokens set to -1 not supported for multiple inputs."
                )
            params["max_new_tokens"] = self.max_new_tokens_for_prompt(prompts[0])
        return [
            prompts[start: start + self.batch_size]
            for start in range(0, len(prompts), self.batch_size)
        ]

    def create_llm_result(
        self, choices: Any, prompts: List[str], token_usage: Dict[str, int]
    ) -> LLMResult:
        """Create the LLMResult from the choices and prompts.

        ``choices`` is expected to hold ``n`` consecutive entries per prompt,
        each a dict with a ``text`` key.
        """
        generations = []
        for i, _ in enumerate(prompts):
            sub_choices = choices[i * self.n: (i + 1) * self.n]
            generations.append(
                [
                    Generation(
                        text=choice["text"],
                        generation_info=dict(
                            finish_reason='over',
                            # The endpoint's chunks carry no log-probabilities;
                            # this key used to be filled with the generated
                            # text itself.
                            logprobs=None,
                        ),
                    )
                    for choice in sub_choices
                ]
            )
        llm_output = {"token_usage": token_usage, "model_name": self.model_name}
        return LLMResult(generations=generations, llm_output=llm_output)

    def stream(self, prompt: str, stop: Optional[List[str]] = None) -> Generator:
        """Call FastChat with streaming flag and return the resulting generator.

        BETA: this is a beta feature while we figure out the right abstraction.
        Once that happens, this interface could change.

        Args:
            prompt: The prompts to pass into the model.
            stop: Optional list of stop words to use when generating.

        Returns:
            A generator of decoded chunk dicts from FastChat; each chunk's
            ``text`` has the echoed prompt removed.

        Example:
            .. code-block:: python

                generator = fastChat.stream("Tell me a joke.")
                for token in generator:
                    yield token
        """
        params = self._request_params(stop)
        yield from self._iter_chunks(params, prompt)

    @property
    def _invocation_params(self) -> Dict[str, Any]:
        """Get the parameters used to invoke the model."""
        return self._default_params

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {**{"model_name": self.model_name}, **self._default_params}

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "fastChat"

    def get_num_tokens(self, text: str) -> int:
        """Calculate num tokens with tiktoken package.

        Raises:
            ValueError: if tiktoken is not installed.
        """
        # tiktoken NOT supported for Python < 3.8; compare the whole version
        # tuple rather than the minor number alone.
        if sys.version_info < (3, 8):
            return super().get_num_tokens(text)
        try:
            import tiktoken
        except ImportError as err:
            raise ValueError(
                "Could not import tiktoken python package. "
                "This is needed in order to calculate get_num_tokens. "
                "Please install it with `pip install tiktoken`."
            ) from err

        try:
            enc = tiktoken.encoding_for_model(self.model_name)
        except KeyError:
            # tiktoken only knows OpenAI model names. For any other name fall
            # back to a generic encoding; the count is then an approximation.
            logger.warning(
                "No tiktoken encoding for model %s; using cl100k_base.",
                self.model_name,
            )
            enc = tiktoken.get_encoding("cl100k_base")

        tokenized_text = enc.encode(
            text,
            allowed_special=self.allowed_special,
            disallowed_special=self.disallowed_special,
        )

        # calculate the number of tokens in the encoded text
        return len(tokenized_text)

    def modelname_to_contextsize(self, modelname: str) -> int:
        """Calculate the maximum number of tokens possible to generate for a model.

        Args:
            modelname: The modelname we want to know the context size for.

        Returns:
            The maximum context size

        Raises:
            ValueError: if ``modelname`` is not one of the known models.

        Example:
            .. code-block:: python

                max_new_tokens = fastchat.modelname_to_contextsize("vicuna-13b")
        """
        model_token_mapping = {
            "vicuna-13b": 2049,
            "koala": 2049,
            "dolly-v2": 2049,
            "oasst": 2049,
            "stablelm": 2049,
        }

        context_size = model_token_mapping.get(modelname, None)

        if context_size is None:
            raise ValueError(
                f"Unknown model: {modelname}. Please provide a valid FastChat model name. "
                "Known models are: " + ", ".join(model_token_mapping.keys())
            )

        return context_size

    def max_new_tokens_for_prompt(self, prompt: str) -> int:
        """Calculate the maximum number of tokens possible to generate for a prompt.

        Args:
            prompt: The prompt to pass into the model.

        Returns:
            The maximum number of tokens to generate for a prompt.

        Example:
            .. code-block:: python

                max_new_tokens = fastchat.max_new_tokens_for_prompt("Tell me a joke.")
        """
        num_tokens = self.get_num_tokens(prompt)

        # get max context size for model by name
        max_size = self.modelname_to_contextsize(self.model_name)
        return max_size - num_tokens
|
|
||||||
|
|
||||||
|
|
||||||
class FastChatAPILLM(RemoteRpcModel, BaseFastChat, ABC):
    """Wrapper around FastChat large language models.

    Example:
        .. code-block:: python

            fastchat = FastChatAPILLM()
            fastchat.call_model_name("vicuna")
    """

    # Loader checkpoint handed in at construction time.
    checkPoint: LoaderCheckPoint = None
    # Value exposed through ``_history_len``.
    history_len: int = 10

    def __init__(self, checkPoint: LoaderCheckPoint = None):
        super().__init__()
        self.checkPoint = checkPoint

    @property
    def _invocation_params(self) -> Dict[str, Any]:
        """Get the parameters used to invoke the model."""
        return {**{"model": self.model_name}, **super()._invocation_params}

    @property
    def _check_point(self) -> LoaderCheckPoint:
        """Return the loader checkpoint given to the constructor."""
        return self.checkPoint

    @property
    def _history_len(self) -> int:
        """Return the configured history length."""
        return self.history_len

    def set_history_len(self, history_len: int = 10) -> None:
        """Set the configured history length."""
        self.history_len = history_len

    @property
    def _api_key(self) -> str:
        """No API key is stored by this class; evaluates to ``None``."""
        pass

    @property
    def _api_base_url(self) -> str:
        """Return the endpoint URL requests are sent to."""
        return self.api_base_url

    def set_api_key(self, api_key: str):
        """Accepted for interface compatibility; the key is not stored."""
        pass

    def set_api_base_url(self, api_base_url: str):
        """Set the endpoint URL requests are sent to."""
        self.api_base_url = api_base_url

    def call_model_name(self, model_name):
        """Set the model name sent with every request."""
        self.model_name = model_name

    def _build_answer(
        self, prompt: str, history: List[List[str]], text: str
    ) -> AnswerResult:
        """Wrap ``text`` in an AnswerResult whose history ends with this turn."""
        answer_result = AnswerResult()
        # A new list is built so neither the caller's list nor a shared
        # default is ever mutated.
        answer_result.history = history + [[prompt, text]]
        answer_result.llm_output = {"answer": text}
        return answer_result

    def generatorAnswer(self, prompt: str,
                        history: Optional[List[List[str]]] = None,
                        streaming: bool = False):
        """Send ``prompt`` to the endpoint and yield the answer.

        Args:
            prompt: Text sent to the model.
            history: Earlier ``[query, answer]`` pairs; left unmodified.
            streaming: When true, an AnswerResult is also yielded for every
                chunk received.

        Yields:
            AnswerResult objects only. The last one holds the text of the
            final chunk, or an empty answer when no chunk was received.
        """
        past_turns = list(history) if history else []
        answer_text = ""
        for chunk in self.stream(prompt):
            answer_text = chunk["text"]
            if streaming:
                yield self._build_answer(prompt, past_turns, answer_text)

        yield self._build_answer(prompt, past_turns, answer_text)
|
|
||||||
|
|
@ -22,7 +22,7 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
|
||||||
|
|
||||||
class LLamaLLM(BaseAnswer, LLM, ABC):
|
class LLamaLLM(BaseAnswer, LLM, ABC):
|
||||||
checkPoint: LoaderCheckPoint = None
|
checkPoint: LoaderCheckPoint = None
|
||||||
history = []
|
# history = []
|
||||||
history_len: int = 3
|
history_len: int = 3
|
||||||
max_new_tokens: int = 500
|
max_new_tokens: int = 500
|
||||||
num_beams: int = 1
|
num_beams: int = 1
|
||||||
|
|
@ -88,9 +88,16 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
|
||||||
return reply
|
return reply
|
||||||
|
|
||||||
# 将历史对话数组转换为文本格式
|
# 将历史对话数组转换为文本格式
|
||||||
def history_to_text(self, query):
|
def history_to_text(self, query, history):
|
||||||
|
"""
|
||||||
|
历史对话软提示
|
||||||
|
这段代码首先定义了一个名为 history_to_text 的函数,用于将 self.history
|
||||||
|
数组转换为所需的文本格式。然后,我们将格式化后的历史文本
|
||||||
|
再用 self.encode 将其转换为向量表示。最后,将历史对话向量与当前输入的对话向量拼接在一起。
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
formatted_history = ''
|
formatted_history = ''
|
||||||
history = self.history[-self.history_len:] if self.history_len > 0 else []
|
history = history[-self.history_len:] if self.history_len > 0 else []
|
||||||
for i, (old_query, response) in enumerate(history):
|
for i, (old_query, response) in enumerate(history):
|
||||||
formatted_history += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
|
formatted_history += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
|
||||||
formatted_history += "[Round {}]\n问:{}\n答:".format(len(history), query)
|
formatted_history += "[Round {}]\n问:{}\n答:".format(len(history), query)
|
||||||
|
|
@ -116,20 +123,6 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
|
||||||
|
|
||||||
return input_ids, position_ids, attention_mask
|
return input_ids, position_ids, attention_mask
|
||||||
|
|
||||||
def generate_softprompt_history_tensors(self, query):
|
|
||||||
"""
|
|
||||||
历史对话软提示
|
|
||||||
这段代码首先定义了一个名为 history_to_text 的函数,用于将 self.history
|
|
||||||
数组转换为所需的文本格式。然后,我们将格式化后的历史文本
|
|
||||||
再用 self.encode 将其转换为向量表示。最后,将历史对话向量与当前输入的对话向量拼接在一起。
|
|
||||||
:return:
|
|
||||||
"""
|
|
||||||
|
|
||||||
# 对话内容
|
|
||||||
# 处理历史对话
|
|
||||||
formatted_history = self.history_to_text(query)
|
|
||||||
return formatted_history
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _history_len(self) -> int:
|
def _history_len(self) -> int:
|
||||||
return self.history_len
|
return self.history_len
|
||||||
|
|
@ -173,18 +166,18 @@ class LLamaLLM(BaseAnswer, LLM, ABC):
|
||||||
new_tokens = len(output_ids[0]) - len(input_ids[0])
|
new_tokens = len(output_ids[0]) - len(input_ids[0])
|
||||||
reply = self.decode(output_ids[0][-new_tokens:])
|
reply = self.decode(output_ids[0][-new_tokens:])
|
||||||
print(f"response:{reply}")
|
print(f"response:{reply}")
|
||||||
self.history = self.history + [[None, reply]]
|
print(f"+++++++++++++++++++++++++++++++++++")
|
||||||
return reply
|
return reply
|
||||||
|
|
||||||
def generatorAnswer(self, prompt: str,
|
def generatorAnswer(self, prompt: str,
|
||||||
history: List[List[str]] = [],
|
history: List[List[str]] = [],
|
||||||
streaming: bool = False):
|
streaming: bool = False):
|
||||||
if history:
|
|
||||||
self.history = history
|
|
||||||
# TODO 需要实现chat对话模块和注意力模型,目前_call为langchain的LLM拓展的api,默认为无提示词模式,如果需要操作注意力模型,可以参考chat_glm的实现
|
# TODO 需要实现chat对话模块和注意力模型,目前_call为langchain的LLM拓展的api,默认为无提示词模式,如果需要操作注意力模型,可以参考chat_glm的实现
|
||||||
softprompt = self.generate_softprompt_history_tensors(prompt)
|
softprompt = self.history_to_text(prompt,history=history)
|
||||||
response = self._call(prompt=softprompt, stop=['\n###'])
|
response = self._call(prompt=softprompt, stop=['\n###'])
|
||||||
|
|
||||||
answer_result = AnswerResult()
|
answer_result = AnswerResult()
|
||||||
answer_result.history = self.history
|
answer_result.history = history + [[None, response]]
|
||||||
answer_result.llm_output = {"answer": response}
|
answer_result.llm_output = {"answer": response}
|
||||||
yield answer_result
|
yield answer_result
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue