From ee7285cd932dc1322ca0be43a143b58cfa661ebf Mon Sep 17 00:00:00 2001
From: Zhi-guo Huang
Date: Wed, 19 Jul 2023 23:15:14 +0800
Subject: [PATCH] Add ptuning-related parameters to args.py (#838)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix a typo in bing_search.py; update model_config.py with how to apply for a Bing Subscription Key and the relevant caveats

* Update the FAQ: add the cause of and solution for [Errno 110] Connection timed out

* Document in loader.py why load_in_8bit fails, with a detailed solution

* update loader.py

* stream_chat_bing

* Change the stream_chat interface so that knowledge_base_id is selected in the request body; add a stream_chat_bing interface

* Improve the cli_demo.py logic: support input prompts, multiple inputs, and re-entering input

* update cli_demo.py

* Apply the changes suggested in review

* Change the default multi-GPU deployment scheme so that it essentially no longer fails on new models

* Tested the openai interface successfully

* add ptuning-v2 dir

* Support passing the ptuning path on the command line

* Add the cause of and solution for failing to load quantized models to the FAQ

* print error

* update

* Update args.py

* debug for fastchat_openai_llm

* temporarily save

* update faq for

---------

Co-authored-by: imClumsyPanda
Co-authored-by: zg h
---
 configs/model_config.py       | 21 ++++++++++++++++++---
 docs/FAQ.md                   | 19 +++++++++++++++++++
 models/fastchat_openai_llm.py |  2 +-
 models/loader/args.py         |  4 +++-
 models/loader/loader.py       |  2 ++
 5 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/configs/model_config.py b/configs/model_config.py
index 8aace42..bcc6a97 100644
--- a/configs/model_config.py
+++ b/configs/model_config.py
@@ -87,6 +87,12 @@ llm_model_dict = {
         "local_model_path": None,
         "provides": "MOSSLLMChain"
     },
+    "moss-int4": {
+        "name": "moss",
+        "pretrained_model_name": "fnlp/moss-moon-003-sft-int4",
+        "local_model_path": None,
+        "provides": "MOSSLLM"
+    },
     "vicuna-13b-hf": {
         "name": "vicuna-13b-hf",
         "pretrained_model_name": "vicuna-13b-hf",
@@ -148,6 +154,15 @@ llm_model_dict = {
         "provides": "FastChatOpenAILLMChain",  # when using the fastchat api, "provides" must be "FastChatOpenAILLMChain"
         "api_base_url": "http://localhost:8000/v1",  # set "api_base_url" to the "api_base_url" of the fastchat service
         "api_key": "EMPTY"
+    },
+    # models called through fastchat should follow the format below
+    "fastchat-chatglm-6b-int4": {
+        "name": "chatglm-6b-int4",  # set "name" to the "model_name" of the fastchat service
+        "pretrained_model_name": "chatglm-6b-int4",
+        "local_model_path": None,
+        "provides": "FastChatOpenAILLMChain",  # when using the fastchat api, "provides" must be "FastChatOpenAILLMChain"
+        "api_base_url": "http://localhost:8001/v1",  # set "api_base_url" to the "api_base_url" of the fastchat service
+        "api_key": "EMPTY"
     },
     "fastchat-chatglm2-6b": {
         "name": "chatglm2-6b",  # set "name" to the "model_name" of the fastchat service
@@ -173,7 +188,7 @@ llm_model_dict = {
     # If the following is raised: raise NewConnectionError(
     # urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x...>:
     # Failed to establish a new connection: [WinError 10060]
-    # it is because OPENAI has blocked both mainland China and Hong Kong IPs; you need to hang switch to Japan, Singapore, etc.
+    # it is because OPENAI has blocked both mainland China and Hong Kong IPs; you need to switch to Japan, Singapore, etc.
     "openai-chatgpt-3.5": {
         "name": "gpt-3.5-turbo",
         "pretrained_model_name": "gpt-3.5-turbo",
@@ -186,7 +201,7 @@
 }
 
 # LLM name
-LLM_MODEL = "chatglm-6b"
+LLM_MODEL = "fastchat-chatglm-6b-int4"
 # load the model quantized to 8-bit
 LOAD_IN_8BIT = False
 # Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.
@@ -203,7 +218,7 @@ STREAMING = True
 
 # Use p-tuning-v2 PrefixEncoder
 USE_PTUNING_V2 = False
-
+PTUNING_DIR = './ptuning-v2'
 # LLM running device
 LLM_DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
 
diff --git a/docs/FAQ.md b/docs/FAQ.md
index f712477..ccc0f25 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -177,3 +177,22 @@ download_with_progressbar(url, tmp_path)
 Q14 When calling the `bing_search_chat` interface of the api, `Failed to establish a new connection: [Errno 110] Connection timed out` is raised
 
 This is because the server is behind a firewall; you need to ask the administrator to add it to the whitelist. If it is a company server, don't count on it.
+
+---
+
+Q15 Loading chatglm-6b-int8 or chatglm-6b-int4 raises `RuntimeError: Only Tensors of floating point and complex dtype can require gradients`
+
+This is suspected to be a problem in chatglm's quantization code or a torch version difference: a torch.zeros matrix that has already been turned into a Parameter is wrapped in Parameter again, which raises `RuntimeError: Only Tensors of floating point and complex dtype can require gradients`. The fix is to change line 374 of quantization.py in the original chatglm project files to:
+
+```
+    try:
+        self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
+    except Exception as e:
+        pass
+```
+
+If the above does not work, apply the same change to the quantization.py of the original chatglm project files under the .cache/huggingface/modules/ directory; if there is more than one symlink, choose the correct path according to the error message.
+
+Note: even though the model may load successfully, inference can still fail on the CPU: for every question, the model keeps outputting gugugugu.
+
+Therefore, it is best not to try to load quantized models on the CPU. The likely reason is that the quantization operations of the mainstream Python quantization packages are executed on the GPU, so a gap naturally exists.
diff --git a/models/fastchat_openai_llm.py b/models/fastchat_openai_llm.py
index d0972f7..217910a 100644
--- a/models/fastchat_openai_llm.py
+++ b/models/fastchat_openai_llm.py
@@ -245,7 +245,7 @@ if __name__ == "__main__":
 
     chain = FastChatOpenAILLMChain()
 
-    chain.set_api_key("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")
+    chain.set_api_key("EMPTY")
     # chain.set_api_base_url("https://api.openai.com/v1")
     # chain.call_model_name("gpt-3.5-turbo")
diff --git a/models/loader/args.py b/models/loader/args.py
index b15ad5e..e7df76b 100644
--- a/models/loader/args.py
+++ b/models/loader/args.py
@@ -1,3 +1,4 @@
+
 import argparse
 import os
 from configs.model_config import *
@@ -43,7 +44,8 @@ parser.add_argument('--no-remote-model', action='store_true', help='remote in th
 parser.add_argument('--model-name', type=str, default=LLM_MODEL, help='Name of the model to load by default.')
 parser.add_argument('--lora', type=str, help='Name of the LoRA to apply to the model by default.')
 parser.add_argument("--lora-dir", type=str, default=LORA_DIR, help="Path to directory with all the loras")
-
+parser.add_argument('--use-ptuning-v2', action='store_true', default=USE_PTUNING_V2, help='Whether to use a p-tuning-v2 checkpoint.')
+parser.add_argument('--ptuning-dir', type=str, default=PTUNING_DIR, help='Path to the directory with the p-tuning-v2 checkpoint.')
 # Accelerate/transformers
 parser.add_argument('--load-in-8bit', action='store_true', default=LOAD_IN_8BIT, help='Load the model with 8-bit precision.')
diff --git a/models/loader/loader.py b/models/loader/loader.py
index 5056932..ae4b408 100644
--- a/models/loader/loader.py
+++ b/models/loader/loader.py
@@ -454,6 +454,7 @@ class LoaderCheckPoint:
             self.model_config.pre_seq_len = prefix_encoder_config['pre_seq_len']
             self.model_config.prefix_projection = prefix_encoder_config['prefix_projection']
         except Exception as e:
+            print(e)
             print("Failed to load the PrefixEncoder config.json")
 
         self.model, self.tokenizer = self._load_model()
@@ -471,6 +472,7 @@ class LoaderCheckPoint:
             self.model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
             self.model.transformer.prefix_encoder.float()
         except Exception as e:
+            print(e)
             print("Failed to load the PrefixEncoder model parameters")
         # llama-cpp models (at least vicuna-13b) have no eval method; their eval is the model itself
         if not self.is_llamacpp:
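
Note for reviewers: as a companion to the loader.py hunks above, here is a minimal sketch of the load path that the new --use-ptuning-v2/--ptuning-dir flags feed into. The function name, the pytorch_model.bin filename, and the "transformer.prefix_encoder." key prefix follow the usual ChatGLM p-tuning-v2 checkpoint convention and are assumptions for illustration, not part of this diff:

```
import json
import os

import torch


def load_prefix_encoder(model, model_config, ptuning_dir):
    """Apply a p-tuning-v2 PrefixEncoder checkpoint to an already-loaded ChatGLM model."""
    # config.json in the checkpoint dir carries pre_seq_len / prefix_projection,
    # mirroring the model_config fields set in the first loader.py hunk above
    with open(os.path.join(ptuning_dir, "config.json")) as f:
        prefix_encoder_config = json.load(f)
    model_config.pre_seq_len = prefix_encoder_config['pre_seq_len']
    model_config.prefix_projection = prefix_encoder_config['prefix_projection']

    # pytorch_model.bin (assumed filename) holds the trained prefix weights; strip
    # the "transformer.prefix_encoder." prefix so the keys match the submodule
    prefix_state_dict = torch.load(os.path.join(ptuning_dir, "pytorch_model.bin"))
    new_prefix_state_dict = {}
    for k, v in prefix_state_dict.items():
        if k.startswith("transformer.prefix_encoder."):
            new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
    model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
    model.transformer.prefix_encoder.float()
    return model
```

With the argparse change above, this path would be reached via e.g. --use-ptuning-v2 --ptuning-dir ./ptuning-v2.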