def auto_configure_device_map(num_gpus: int,
                              num_trans_layers: int = 28) -> Dict[str, int]:
    """Build a module-name -> GPU-index map that spreads ChatGLM over GPUs.

    The model is treated as ``num_trans_layers + 2`` equally sized slots:

    * ``transformer.word_embeddings`` counts as one slot,
    * ``transformer.final_layernorm`` together with ``lm_head`` counts as
      one slot,
    * every ``transformer.layers.{i}`` counts as one slot.

    The slots are handed out to the GPUs in order, filling one GPU up to
    its share before moving on to the next.

    Args:
        num_gpus: Number of GPUs to distribute the model over (>= 1).
        num_trans_layers: Number of ``transformer.layers.{i}`` blocks.
            Defaults to 28, the value this function was written for.

    Returns:
        A dict mapping each module name to the index of the GPU that
        should hold it.

    Raises:
        ValueError: If ``num_gpus`` is smaller than 1 or
            ``num_trans_layers`` is negative.
    """
    # Validate explicitly instead of relying on ``assert`` (which is removed
    # under ``python -O``) or on an unexplained ZeroDivisionError.
    if num_gpus < 1:
        raise ValueError(f"num_gpus must be at least 1, got {num_gpus}")
    if num_trans_layers < 0:
        raise ValueError(
            f"num_trans_layers must be non-negative, got {num_trans_layers}"
        )

    # The two extra slots are the embedding and the layernorm/lm_head pair.
    per_gpu_layers = (num_trans_layers + 2) / num_gpus

    # Bugfix: on Linux, torch.embedding could be called with its weight and
    # its input on different devices, raising a RuntimeError.
    # On Windows, model.device is set to transformer.word_embeddings.device;
    # on Linux, model.device is set to lm_head.device.
    # chat / stream_chat move input_ids to model.device, so if
    # transformer.word_embeddings.device differs from model.device the call
    # fails. Therefore word_embeddings, final_layernorm and lm_head are all
    # pinned to the first GPU.
    device_map = {
        'transformer.word_embeddings': 0,
        'transformer.final_layernorm': 0,
        'lm_head': 0,
    }

    # GPU 0 already holds the two non-layer slots.
    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            # The current GPU has its share; continue on the next one.
            # Each later GPU takes at least ceil(per_gpu_layers) layers, so
            # gpu_target always stays below num_gpus.
            gpu_target += 1
            used = 0
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map