Add p-tuning related arguments in args.py (#838)
* Fix a typo in bing_search.py; update how to apply for a Bing Subscription Key and the related caveats in model_config.py
* Update the FAQ with the cause of and fix for "[Errno 110] Connection timed out"
* Document in loader.py why load_in_8bit can fail and the detailed fix
* update loader.py
* stream_chat_bing
* Change the stream_chat interface so knowledge_base_id is chosen in the request body; add a stream_chat_bing interface
* Improve the cli_demo.py logic: support input prompts, multiple inputs, and re-entering input
* update cli_demo.py
* Apply the review suggestions
* Change the default multi-GPU deployment scheme so it basically does not fail for new models either
* Tested the openai interface successfully
* add ptuning-v2 dir
* Support passing the p-tuning path on the command line
* Add the cause of and fix for failing to load quantized models to the FAQ
* print error
* update
* Update args.py
* debug for fastchat_openai_llm
* temporarily save
* update faq for

---------

Co-authored-by: imClumsyPanda <littlepanda0716@gmail.com>
Co-authored-by: zg h <bjwang@hzg0601-acer.hundsun.com>
parent f88bf2cbf3
commit ee7285cd93
configs/model_config.py
@@ -87,6 +87,12 @@ llm_model_dict = {
         "local_model_path": None,
         "provides": "MOSSLLMChain"
     },
+    "moss-int4": {
+        "name": "moss",
+        "pretrained_model_name": "fnlp/moss-moon-003-sft-int4",
+        "local_model_path": None,
+        "provides": "MOSSLLM"
+    },
     "vicuna-13b-hf": {
         "name": "vicuna-13b-hf",
         "pretrained_model_name": "vicuna-13b-hf",
@@ -148,6 +154,15 @@ llm_model_dict = {
         "provides": "FastChatOpenAILLMChain",  # 使用fastchat api时,需保证"provides"为"FastChatOpenAILLMChain"
         "api_base_url": "http://localhost:8000/v1",  # "name"修改为fastchat服务中的"api_base_url"
         "api_key": "EMPTY"
+    },
+    # 通过 fastchat 调用的模型请参考如下格式
+    "fastchat-chatglm-6b-int4": {
+        "name": "chatglm-6b-int4",  # "name"修改为fastchat服务中的"model_name"
+        "pretrained_model_name": "chatglm-6b-int4",
+        "local_model_path": None,
+        "provides": "FastChatOpenAILLMChain",  # 使用fastchat api时,需保证"provides"为"FastChatOpenAILLMChain"
+        "api_base_url": "http://localhost:8001/v1",  # "name"修改为fastchat服务中的"api_base_url"
+        "api_key": "EMPTY"
     },
     "fastchat-chatglm2-6b": {
         "name": "chatglm2-6b",  # "name"修改为fastchat服务中的"model_name"
@@ -173,7 +188,7 @@ llm_model_dict = {
     # 如果报出:raise NewConnectionError(
     # urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x000001FE4BDB85E0>:
     # Failed to establish a new connection: [WinError 10060]
-    # 则是因为内地和香港的IP都被OPENAI封了,需要挂切换为日本、新加坡等地
+    # 则是因为内地和香港的IP都被OPENAI封了,需要切换为日本、新加坡等地
     "openai-chatgpt-3.5": {
         "name": "gpt-3.5-turbo",
         "pretrained_model_name": "gpt-3.5-turbo",
@@ -186,7 +201,7 @@ llm_model_dict = {
 }

 # LLM 名称
-LLM_MODEL = "chatglm-6b"
+LLM_MODEL = "fastchat-chatglm-6b-int4"
 # 量化加载8bit 模型
 LOAD_IN_8BIT = False
 # Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.
@@ -203,7 +218,7 @@ STREAMING = True

 # Use p-tuning-v2 PrefixEncoder
 USE_PTUNING_V2 = False
+PTUNING_DIR='./ptuing-v2'
 # LLM running device
 LLM_DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

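For context (not part of the diff): the new `fastchat-chatglm-6b-int4` entry above assumes a FastChat OpenAI-compatible API server is already serving the model under the same model name and `api_base_url`. A minimal sketch of how such an entry is exercised with the pre-1.0 `openai` Python client; the port and model name simply mirror the config above, everything else is an assumption:

```python
# Minimal sketch, not part of the commit: query the FastChat OpenAI-compatible
# server that the "fastchat-chatglm-6b-int4" entry points at.
import openai

openai.api_key = "EMPTY"                       # FastChat does not check the key
openai.api_base = "http://localhost:8001/v1"   # must match "api_base_url"

resp = openai.ChatCompletion.create(
    model="chatglm-6b-int4",                   # must match fastchat's model_name
    messages=[{"role": "user", "content": "你好"}],
)
print(resp.choices[0].message["content"])
```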
docs/FAQ.md
@@ -177,3 +177,22 @@ download_with_progressbar(url, tmp_path)
 Q14 调用api中的 `bing_search_chat`接口时,报出 `Failed to establish a new connection: [Errno 110] Connection timed out`

 这是因为服务器加了防火墙,需要联系管理员加白名单,如果公司的服务器的话,就别想了GG--!
+
+---
+
+Q15 加载chatglm-6b-int8或chatglm-6b-int4抛出 `RuntimeError: Only Tensors of floating point andcomplex dtype can require gradients`
+
+疑为chatglm的quantization的问题或torch版本差异问题,针对已经变为Parameter的torch.zeros矩阵也执行Parameter操作,从而抛出 `RuntimeError: Only Tensors of floating point andcomplex dtype can require gradients`。解决办法是在chatglm-项目的原始文件中的quantization.py文件374行改为:
+
+```
+try:
+    self.weight =Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
+except Exception as e:
+    pass
+```
+
+如果上述方式不起作用,则在.cache/hugggingface/modules/目录下针对chatglm项目的原始文件中的quantization.py文件执行上述操作,若软链接不止一个,按照错误提示选择正确的路径。
+
+注:虽然模型可以顺利加载但在cpu上仍存在推理失败的可能:即针对每个问题,模型一直输出gugugugu。
+
+因此,最好不要试图用cpu加载量化模型,原因可能是目前python主流量化包的量化操作是在gpu上执行的,会天然地存在gap。
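A side note (not from the commit): the cached copy of `quantization.py` that the Q15 entry above tells you to patch normally lives under the Hugging Face modules cache. A small helper sketch for locating it, assuming the default cache location:

```python
# Hypothetical helper: remote model code such as chatglm's quantization.py is
# cached under ~/.cache/huggingface/modules/transformers_modules/<model>/.
from pathlib import Path

cache_root = Path.home() / ".cache" / "huggingface" / "modules" / "transformers_modules"
for path in cache_root.rglob("quantization.py"):
    print(path)  # patch the copy (or symlink target) that the traceback points at
```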
models/fastchat_openai_llm.py
@@ -245,7 +245,7 @@ if __name__ == "__main__":

     chain = FastChatOpenAILLMChain()

-    chain.set_api_key("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")
+    chain.set_api_key("EMPTY")
     # chain.set_api_base_url("https://api.openai.com/v1")
     # chain.call_model_name("gpt-3.5-turbo")

models/loader/args.py
@@ -1,3 +1,4 @@
+
 import argparse
 import os
 from configs.model_config import *
@@ -43,7 +44,8 @@ parser.add_argument('--no-remote-model', action='store_true', help='remote in th
 parser.add_argument('--model-name', type=str, default=LLM_MODEL, help='Name of the model to load by default.')
 parser.add_argument('--lora', type=str, help='Name of the LoRA to apply to the model by default.')
 parser.add_argument("--lora-dir", type=str, default=LORA_DIR, help="Path to directory with all the loras")
+parser.add_argument('--use-ptuning-v2',type=str,default=USE_PTUNING_V2,help="whether use ptuning-v2 checkpoint")
+parser.add_argument("--ptuning-dir",type=str,default=PTUNING_DIR,help="the dir of ptuning-v2 checkpoint")
 # Accelerate/transformers
 parser.add_argument('--load-in-8bit', action='store_true', default=LOAD_IN_8BIT,
                     help='Load the model with 8-bit precision.')
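A small standalone sketch (not from the repository; the defaults here are placeholders) of the two new arguments, illustrating one caveat of declaring a boolean-style flag with `type=str`: the parsed value is a string, so even "False" is truthy if used directly as a condition.

```python
# Standalone argparse sketch mirroring only the two new options added above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--use-ptuning-v2', type=str, default=False,
                    help="whether use ptuning-v2 checkpoint")
parser.add_argument("--ptuning-dir", type=str, default='./ptuning-v2',
                    help="the dir of ptuning-v2 checkpoint")

args = parser.parse_args(["--use-ptuning-v2", "False", "--ptuning-dir", "./my-ptuning"])
print(bool(args.use_ptuning_v2))  # True -- the string "False" is still truthy
print(args.ptuning_dir)           # ./my-ptuning
```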
|
|
@ -454,6 +454,7 @@ class LoaderCheckPoint:
|
||||||
self.model_config.pre_seq_len = prefix_encoder_config['pre_seq_len']
|
self.model_config.pre_seq_len = prefix_encoder_config['pre_seq_len']
|
||||||
self.model_config.prefix_projection = prefix_encoder_config['prefix_projection']
|
self.model_config.prefix_projection = prefix_encoder_config['prefix_projection']
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
print("加载PrefixEncoder config.json失败")
|
print("加载PrefixEncoder config.json失败")
|
||||||
|
|
||||||
self.model, self.tokenizer = self._load_model()
|
self.model, self.tokenizer = self._load_model()
|
||||||
|
|
@@ -471,6 +472,7 @@ class LoaderCheckPoint:
                 self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
                 self.model.transformer.prefix_encoder.float()
             except Exception as e:
+                print(e)
                 print("加载PrefixEncoder模型参数失败")
         # llama-cpp模型(至少vicuna-13b)的eval方法就是自身,其没有eval方法
         if not self.is_llamacpp:
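For context (a sketch of the standard ChatGLM p-tuning-v2 recipe, not a verbatim excerpt of loader.py): the `new_prefix_state_dict` referenced above is typically built by reading the checkpoint's config.json, loading its pytorch_model.bin, and stripping the `transformer.prefix_encoder.` prefix from the keys before loading them into the model. All paths and the model name below are assumptions.

```python
# Sketch of loading a p-tuning-v2 PrefixEncoder into ChatGLM (illustrative only;
# the committed loader.py wraps equivalent steps in try/except and prints errors).
import json
import os
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer

ptuning_dir = "./ptuning-v2"   # directory with the checkpoint's config.json + pytorch_model.bin

# 1. Read pre_seq_len / prefix_projection from the checkpoint's config.json,
#    mirroring what the hunk at line 454 does with self.model_config.
with open(os.path.join(ptuning_dir, "config.json")) as f:
    prefix_encoder_config = json.load(f)

config = AutoConfig.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
config.pre_seq_len = prefix_encoder_config["pre_seq_len"]
config.prefix_projection = prefix_encoder_config["prefix_projection"]

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", config=config,
                                  trust_remote_code=True).half().cuda()

# 2. Load only the PrefixEncoder weights, dropping the module prefix from the keys.
prefix_state_dict = torch.load(os.path.join(ptuning_dir, "pytorch_model.bin"),
                               map_location="cpu")
new_prefix_state_dict = {
    k[len("transformer.prefix_encoder."):]: v
    for k, v in prefix_state_dict.items()
    if k.startswith("transformer.prefix_encoder.")
}
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
model.transformer.prefix_encoder.float()   # keep the prefix encoder in fp32
```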