diff --git a/README.md b/README.md
index b583eee..288e3bc 100644
--- a/README.md
+++ b/README.md
@@ -60,13 +60,14 @@ docker run -d --gpus all -p 80:8501 registry.cn-beijing.aliyuncs.com/chatchat/ch
 ## 环境最低要求
 想顺利运行本代码,请按照以下的最低要求进行配置:
+
+ Python版本: >= 3.8.5, < 3.11
+ Cuda版本: >= 11.7, 且能顺利安装Python
 如果想要顺利在GPU运行本地模型(int4版本),你至少需要以下的硬件配置:
+ chatglm2-6b & LLaMA-7B 最低显存要求: 7GB 推荐显卡: RTX 3060, RTX 2060
-+ LLaMA-13B 最低显存要求: 11GB 推荐显卡: RTX 2060 12GB, RTX3060 12GB, RTX3080, RTXA2000
++ LLaMA-13B 最低显存要求: 11GB 推荐显卡: RTX 2060 12GB, RTX3060 12GB, RTX3080, RTXA2000
+ Qwen-14B-Chat 最低显存要求: 13GB 推荐显卡: RTX 3090
+ LLaMA-30B 最低显存要求: 22GB 推荐显卡:RTX A5000,RTX 3090,RTX 4090,RTX 6000,Tesla V100,RTX Tesla P40
+ LLaMA-65B 最低显存要求: 40GB 推荐显卡:A100,A40,A6000
@@ -215,8 +216,11 @@ docker run -d --gpus all -p 80:8501 registry.cn-beijing.aliyuncs.com/chatchat/ch
 关于如何使用自定义分词器和贡献自己的分词器,可以参考[Text Splitter 贡献说明](docs/splitter.md)。
 ## Agent生态
+
 ### 基础的Agent
+
 在本版本中,我们实现了一个简单的基于OpenAI的React的Agent模型,目前,经过我们测试,仅有以下两个模型支持:
+
 + OpenAI GPT4
 + ChatGLM2-130B
@@ -278,6 +282,7 @@ $ git clone https://huggingface.co/moka-ai/m3e-base
 在开始执行 Web UI 或命令行交互前,请先检查 [configs/model_config.py](configs/model_config.py) 和 [configs/server_config.py](configs/server_config.py) 中的各项模型参数设计是否符合需求:
 - 请确认已下载至本地的 LLM 模型本地存储路径写在 `llm_model_dict` 对应模型的 `local_model_path` 属性中,如:
+
 ```
 "chatglm2-6b": "/Users/xxx/Downloads/chatglm2-6b",
@@ -374,9 +379,13 @@ CUDA_VISIBLE_DEVICES=0,1 python startup.py -a
 #### 5.4 PEFT 加载(包括lora,p-tuning,prefix tuning, prompt tuning,ia3等)
-本项目基于 FastChat 加载 LLM 服务,故需以 FastChat 加载 PEFT 路径,即保证路径名称里必须有 peft 这个词,配置文件的名字为 adapter_config.json,peft 路径下包含.bin 格式的 PEFT 权重,peft路径在startup.py中create_model_worker_app函数的args.model_names中指定,并开启环境变量PEFT_SHARE_BASE_WEIGHTS=true参数。
+本项目基于 FastChat 加载 LLM 服务,故需以 FastChat 加载 PEFT 路径,针对chatglm,falcon,codet5p以外的模型,以及p-tuning以外的peft方法,步骤如下:
-注:如果上述方式启动失败,则需要以标准的fastchat服务启动方式分步启动,分步启动步骤参考第六节,PEFT加载详细步骤参考[加载lora微调后模型失效](https://github.com/chatchat-space/Langchain-Chatchat/issues/1130#issuecomment-1685291822),
+1. 将训练peft生成的config.json文件命名为adapter_config.json;
+2. 重命名文件夹,保证文件夹中包含'peft'一词;
+3. 开启 `PEFT_SHARE_BASE_WEIGHTS=true` 环境变量,再执行python startup.py -a
+
+针对p-tuning和chatglm模型,需要对fastchat进行较大幅度的修改,详细步骤参考[chatchat加载p-tuning](docs/chatchat加载ptuning.md)。
 #### **5.5 注意事项:**
@@ -454,10 +463,7 @@ CUDA_VISIBLE_DEVICES=0,1 python startup.py -a
 🎉 langchain-Chatchat 项目微信交流群,如果你也对本项目感兴趣,欢迎加入群聊参与讨论交流。
-
 ## 关注我们
 图片
 🎉 langchain-Chatchat 项目官方公众号,欢迎扫码关注。
-
-
diff --git a/docs/chatchat加载ptuning.md b/docs/chatchat加载ptuning.md
new file mode 100644
index 0000000..03ef4a9
--- /dev/null
+++ b/docs/chatchat加载ptuning.md
@@ -0,0 +1,674 @@
# chatchat加载ptuning指南

P-tuning虽然是一种peft方法,但并不能与huggingface的peft python包兼容,而fastchat在多处以字符串匹配的方式硬编码模型加载逻辑,导致fastchat和chatchat无法直接兼容p-tuning。经langchain-chatchat开发组多次尝试,给出如下p-tuning加载指南。

# 1. peft文件夹修改

1. 将训练生成的config.json文件重命名为adapter_config.json;
2. 保证文件夹包含pytorch_model.bin文件;
3. 修改文件夹名称,保证文件夹名称中包含'peft'一词;
4. 在adapter_config.json文件中增加如下字段:

   ```json
   "base_model_name_or_path": "/root/model/chatglm2-6b/",
   "task_type": "CAUSAL_LM",
   "peft_type": "PREFIX_TUNING",
   "inference_mode": true,
   "revision": "main",
   "num_virtual_tokens": 16
   ```

   **其中,"base_model_name_or_path"为基础模型的存放路径**;
5. 将文件夹移入项目文件夹中,如Langchain-Chatchat项目文件夹目录下;

# 2. fastchat包代码修改

## 2.1 fastchat.model.model_adapter文件修改

1. 
将fastchat.model.model_adapter.py文件的load_model函数修改为: + + ```python + def load_model( + model_path: str, + device: str = "cuda", + num_gpus: int = 1, + max_gpu_memory: Optional[str] = None, + dtype: Optional[torch.dtype] = None, + load_8bit: bool = False, + cpu_offloading: bool = False, + gptq_config: Optional[GptqConfig] = None, + awq_config: Optional[AWQConfig] = None, + revision: str = "main", + debug: bool = False, + load_kwargs = {} + ): + """Load a model from Hugging Face.""" + # get model adapter + adapter = get_model_adapter(model_path) + kwargs = load_kwargs + # Handle device mapping + cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration( + device, load_8bit, cpu_offloading + ) + if device == "cpu": + kwargs["torch_dtype"]= torch.float32 + if CPU_ISA in ["avx512_bf16", "amx"]: + try: + import intel_extension_for_pytorch as ipex + + kwargs ["torch_dtype"]= torch.bfloat16 + except ImportError: + warnings.warn( + "Intel Extension for PyTorch is not installed, it can be installed to accelerate cpu inference" + ) + elif device == "cuda": + kwargs["torch_dtype"] = torch.float16 + if num_gpus != 1: + kwargs["device_map"] = "auto" + if max_gpu_memory is None: + kwargs[ + "device_map" + ] = "sequential" # This is important for not the same VRAM sizes + available_gpu_memory = get_gpu_memory(num_gpus) + kwargs["max_memory"] = { + i: str(int(available_gpu_memory[i] * 0.85)) + "GiB" + for i in range(num_gpus) + } + else: + kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)} + elif device == "mps": + kwargs["torch_dtype"] = torch.float16 + # Avoid bugs in mps backend by not using in-place operations. + replace_llama_attn_with_non_inplace_operations() + elif device == "xpu": + kwargs["torch_dtype"] = torch.bfloat16 + # Try to load ipex, while it looks unused, it links into torch for xpu support + try: + import intel_extension_for_pytorch as ipex + except ImportError: + warnings.warn( + "Intel Extension for PyTorch is not installed, but is required for xpu inference." + ) + elif device == "npu": + kwargs["torch_dtype"]= torch.float16 + # Try to load ipex, while it looks unused, it links into torch for xpu support + try: + import torch_npu + except ImportError: + warnings.warn("Ascend Extension for PyTorch is not installed.") + else: + raise ValueError(f"Invalid device: {device}") + + if cpu_offloading: + # raises an error on incompatible platforms + from transformers import BitsAndBytesConfig + + if "max_memory" in kwargs: + kwargs["max_memory"]["cpu"] = ( + str(math.floor(psutil.virtual_memory().available / 2**20)) + "Mib" + ) + kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_8bit_fp32_cpu_offload=cpu_offloading + ) + kwargs["load_in_8bit"] = load_8bit + elif load_8bit: + if num_gpus != 1: + warnings.warn( + "8-bit quantization is not supported for multi-gpu inference." + ) + else: + model, tokenizer = adapter.load_compress_model( + model_path=model_path, + device=device, + torch_dtype=kwargs["torch_dtype"], + revision=revision, + ) + if debug: + print(model) + return model, tokenizer + elif awq_config and awq_config.wbits < 16: + assert ( + awq_config.wbits == 4 + ), "Currently we only support 4-bit inference for AWQ." 
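        # AWQ 4-bit分支:先加载AWQ量化权重;多卡时用accelerate推断device_map并分发到各卡,
        # 单卡时直接.to(device),随后提前return,不再走后面常规的adapter.load_model流程。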
+ model, tokenizer = load_awq_quantized(model_path, awq_config, device) + if num_gpus != 1: + device_map = accelerate.infer_auto_device_map( + model, + max_memory=kwargs["max_memory"], + no_split_module_classes=[ + "OPTDecoderLayer", + "LlamaDecoderLayer", + "BloomBlock", + "MPTBlock", + "DecoderLayer", + ], + ) + model = accelerate.dispatch_model( + model, device_map=device_map, offload_buffers=True + ) + else: + model.to(device) + return model, tokenizer + elif gptq_config and gptq_config.wbits < 16: + model, tokenizer = load_gptq_quantized(model_path, gptq_config) + if num_gpus != 1: + device_map = accelerate.infer_auto_device_map( + model, + max_memory=kwargs["max_memory"], + no_split_module_classes=["LlamaDecoderLayer"], + ) + model = accelerate.dispatch_model( + model, device_map=device_map, offload_buffers=True + ) + else: + model.to(device) + return model, tokenizer + kwargs["revision"] = revision + + if dtype is not None: # Overwrite dtype if it is provided in the arguments. + kwargs["torch_dtype"] = dtype + + # Load model + model, tokenizer = adapter.load_model(model_path, kwargs) + + if ( + device == "cpu" + and kwargs["torch_dtype"] is torch.bfloat16 + and CPU_ISA is not None + ): + model = ipex.optimize(model, dtype=kwargs["torch_dtype"]) + + if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in ( + "mps", + "xpu", + "npu", + ): + model.to(device) + + if device == "xpu": + model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True) + + if debug: + print(model) + + return model, tokenizer + ``` +2. 将fastchat.model.model_adapter.py的函数修改为: + + ```python + def get_generate_stream_function(model: torch.nn.Module, model_path: str): + """Get the generate_stream function for inference.""" + from fastchat.serve.inference import generate_stream + + model_type = str(type(model)).lower() + + is_chatglm = "chatglm" in model_type + is_falcon = "rwforcausallm" in model_type + is_codet5p = "codet5p" in model_type + is_peft = "peft" in model_type + + if is_chatglm: + return generate_stream_chatglm + elif is_falcon: + return generate_stream_falcon + elif is_codet5p: + return generate_stream_codet5p + elif peft_share_base_weights and is_peft: + # Return a curried stream function that loads the right adapter + # according to the model_name available in this context. This ensures + # the right weights are available. 
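        # 与fastchat原版不同,这里为p-tuning增加了专门的流式函数:chatglm底座在每次推理前
        # 先disable_adapter,再从peft文件夹下的pytorch_model.bin中读取prefix_encoder权重
        # 加载进底座模型,然后复用generate_stream_chatglm;falcon/codet5p底座走各自的stream函数,
        # 其余底座回退到通用的generate_stream。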
+ @torch.inference_mode() + def generate_stream_peft( + model, + tokenizer, + params: Dict, + device: str, + context_len: int, + stream_interval: int = 2, + judge_sent_end: bool = False, + ): + + model.set_adapter(model_path) + if "chatglm" in str(type(model.base_model)).lower(): + model.disable_adapter() + prefix_state_dict = torch.load(os.path.join(model_path, "pytorch_model.bin")) + new_prefix_state_dict = {} + + for k, v in prefix_state_dict.items(): + if k.startswith("transformer.prefix_encoder."): + new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v + elif k.startswith("transformer.prompt_encoder."): + new_prefix_state_dict[k[len("transformer.prompt_encoder."):]] = v + model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict) + for x in generate_stream_chatglm( + model, + tokenizer, + params, + device, + context_len, + stream_interval, + judge_sent_end, + ): + yield x + elif "rwforcausallm" in str(type(model.base_model)).lower(): + + for x in generate_stream_falcon( + model, + tokenizer, + params, + device, + context_len, + stream_interval, + judge_sent_end, + ): + yield x + elif "codet5p" in str(type(model.base_model)).lower(): + + for x in generate_stream_codet5p( + model, + tokenizer, + params, + device, + context_len, + stream_interval, + judge_sent_end, + ): + yield x + else: + + for x in generate_stream( + model, + tokenizer, + params, + device, + context_len, + stream_interval, + judge_sent_end, + ): + yield x + + return generate_stream_peft + else: + return generate_stream + ``` +3. 将fastchat.model.model_adapter.py的PeftModelAdapter类的load_model方法修改为: + + ```python + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + """Loads the base model then the (peft) adapter weights""" + from peft import PeftConfig, PeftModel + + config = PeftConfig.from_pretrained(model_path) + base_model_path = config.base_model_name_or_path + if "peft" in base_model_path: + raise ValueError( + f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}" + ) + + # Basic proof of concept for loading peft adapters that share the base + # weights. This is pretty messy because Peft re-writes the underlying + # base model and internally stores a map of adapter layers. + # So, to make this work we: + # 1. Cache the first peft model loaded for a given base models. + # 2. Call `load_model` for any follow on Peft models. + # 3. Make sure we load the adapters by the model_path. Why? This is + # what's accessible during inference time. + # 4. In get_generate_stream_function, make sure we load the right + # adapter before doing inference. This *should* be safe when calls + # are blocked the same semaphore. + if peft_share_base_weights: + if base_model_path in peft_model_cache: + model, tokenizer = peft_model_cache[base_model_path] + # Super important: make sure we use model_path as the + # `adapter_name`. + model.load_adapter(model_path, adapter_name=model_path) + else: + base_adapter = get_model_adapter(base_model_path) + base_model, tokenizer = base_adapter.load_model( + base_model_path, from_pretrained_kwargs + ) + # Super important: make sure we use model_path as the + # `adapter_name`. + from peft import get_peft_model + model = get_peft_model(base_model,config,adapter_name=model_path) + peft_model_cache[base_model_path] = (model, tokenizer) + return model, tokenizer + + # In the normal case, load up the base model weights again. 
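        # 按本指南加载p-tuning时需开启PEFT_SHARE_BASE_WEIGHTS=true,走上面共享底座权重的分支;
        # 下面是未开启该环境变量时的常规路径:重新加载底座模型,并用get_peft_model
        # 以model_path作为adapter_name进行包装后返回。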
+ base_adapter = get_model_adapter(base_model_path) + base_model, tokenizer = base_adapter.load_model( + base_model_path, from_pretrained_kwargs + ) + from peft import get_peft_model + model = get_peft_model(base_model,config,adapter_name=model_path) + return model, tokenizer + + ``` +4. 将fastchat.model.model_adapter.py的ChatglmAdapter类的load_model方法修改为: + + ```python + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + revision = from_pretrained_kwargs.get("revision", "main") + tokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=True, revision=revision + ) + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True,**from_pretrained_kwargs) + model = AutoModel.from_pretrained( + model_path, trust_remote_code=True, config=config + ) + return model, tokenizer + ``` + +## 2.2 fastchat.serve.model_worker文件修改 + +1. 将fastchat.serve.model_worker文件的ModelWorker的__init__方法修改如下: + + ```python + class ModelWorker(BaseModelWorker): + def __init__( + self, + controller_addr: str, + worker_addr: str, + worker_id: str, + model_path: str, + model_names: List[str], + limit_worker_concurrency: int, + no_register: bool, + device: str, + num_gpus: int, + max_gpu_memory: str, + dtype: Optional[torch.dtype] = None, + load_8bit: bool = False, + cpu_offloading: bool = False, + gptq_config: Optional[GptqConfig] = None, + awq_config: Optional[AWQConfig] = None, + stream_interval: int = 2, + conv_template: Optional[str] = None, + embed_in_truncate: bool = False, + seed: Optional[int] = None, + load_kwargs = {}, #修改点 + **kwargs, + ): + super().__init__( + controller_addr, + worker_addr, + worker_id, + model_path, + model_names, + limit_worker_concurrency, + conv_template=conv_template, + ) + + logger.info(f"Loading the model {self.model_names} on worker {worker_id} ...") + self.model, self.tokenizer = load_model( + model_path, + device=device, + num_gpus=num_gpus, + max_gpu_memory=max_gpu_memory, + dtype=dtype, + load_8bit=load_8bit, + cpu_offloading=cpu_offloading, + gptq_config=gptq_config, + awq_config=awq_config, + load_kwargs=load_kwargs #修改点 + ) + self.device = device + if self.tokenizer.pad_token == None: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.context_len = get_context_length(self.model.config) + print("**"*100) + self.generate_stream_func = get_generate_stream_function(self.model, model_path) + print(f"self.generate_stream_func{self.generate_stream_func}") + print("*"*100) + self.stream_interval = stream_interval + self.embed_in_truncate = embed_in_truncate + self.seed = seed + + if not no_register: + self.init_heart_beat() + ``` +2. 
在fastchat.serve.model_worker文件的create_model_worker增加如下args参数: + + ```python + parser.add_argument("--load_kwargs",type=dict,default={}) + ``` + + 并将如下语句: + +```python + worker = ModelWorker( + args.controller_address, + args.worker_address, + worker_id, + args.model_path, + args.model_names, + args.limit_worker_concurrency, + no_register=args.no_register, + device=args.device, + num_gpus=args.num_gpus, + max_gpu_memory=args.max_gpu_memory, + dtype=str_to_torch_dtype(args.dtype), + load_8bit=args.load_8bit, + cpu_offloading=args.cpu_offloading, + gptq_config=gptq_config, + awq_config=awq_config, + stream_interval=args.stream_interval, + conv_template=args.conv_template, + embed_in_truncate=args.embed_in_truncate, + seed=args.seed, + ) +``` + +修改为: + +```python + worker = ModelWorker( + args.controller_address, + args.worker_address, + worker_id, + args.model_path, + args.model_names, + args.limit_worker_concurrency, + no_register=args.no_register, + device=args.device, + num_gpus=args.num_gpus, + max_gpu_memory=args.max_gpu_memory, + dtype=str_to_torch_dtype(args.dtype), + load_8bit=args.load_8bit, + cpu_offloading=args.cpu_offloading, + gptq_config=gptq_config, + awq_config=awq_config, + stream_interval=args.stream_interval, + conv_template=args.conv_template, + embed_in_truncate=args.embed_in_truncate, + seed=args.seed, + load_kwargs=args.load_kwargs + ) +``` + +至此,我们完成了fastchat加载ptuning的所有修改,在调用fastchat加载p-tuning时,可以通过加入 `PEFT_SHARE_BASE_WEIGHTS=true`,并以字典的形式添加--load_kwargs参数为训练ptuning时的pre_seq_len值即可,例如将2.2.2步骤中的 `parser.add_argument("--load_kwargs",type=dict,default={})`修改为: + +`parser.add_argument("--load_kwargs",type=dict,default={"pre_seq_len":16})` + +# 3 langchain-chatchat代码修改: + +1. 在configs/serve_config.py中的FSCHAT_MODEL_WORKERS字典中增加如下字段: + + ``` + "load_kwargs": {"pre_seq_len": 16} #值修改为adapter_config.json中的pre_seq_len值 + ``` +2. 
将startup.py中的create_model_worker_app修改为: + + ```python + def create_model_worker_app(log_level: str = "INFO", **kwargs) -> FastAPI: + """ + kwargs包含的字段如下: + host: + port: + model_names:[`model_name`] + controller_address: + worker_address: + + + 对于online_api: + online_api:True + worker_class: `provider` + 对于离线模型: + model_path: `model_name_or_path`,huggingface的repo-id或本地路径 + device:`LLM_DEVICE` + """ + import fastchat.constants + fastchat.constants.LOGDIR = LOG_PATH + from fastchat.serve.model_worker import worker_id, logger + import argparse + logger.setLevel(log_level) + + parser = argparse.ArgumentParser() + args = parser.parse_args([]) + + for k, v in kwargs.items(): + setattr(args, k, v) + + # 在线模型API + if worker_class := kwargs.get("worker_class"): + from fastchat.serve.model_worker import app + worker = worker_class(model_names=args.model_names, + controller_addr=args.controller_address, + worker_addr=args.worker_address) + sys.modules["fastchat.serve.model_worker"].worker = worker + # 本地模型 + else: + from configs.model_config import VLLM_MODEL_DICT + if kwargs["model_names"][0] in VLLM_MODEL_DICT and args.infer_turbo == "vllm": + import fastchat.serve.vllm_worker + from fastchat.serve.vllm_worker import VLLMWorker,app + from vllm import AsyncLLMEngine + from vllm.engine.arg_utils import AsyncEngineArgs,EngineArgs + args.tokenizer = args.model_path # 如果tokenizer与model_path不一致在此处添加 + args.tokenizer_mode = 'auto' + args.trust_remote_code= True + args.download_dir= None + args.load_format = 'auto' + args.dtype = 'auto' + args.seed = 0 + args.worker_use_ray = False + args.pipeline_parallel_size = 1 + args.tensor_parallel_size = 1 + args.block_size = 16 + args.swap_space = 4 # GiB + args.gpu_memory_utilization = 0.90 + args.max_num_batched_tokens = 2560 + args.max_num_seqs = 256 + args.disable_log_stats = False + args.conv_template = None + args.limit_worker_concurrency = 5 + args.no_register = False + args.num_gpus = 1 # vllm worker的切分是tensor并行,这里填写显卡的数量 + args.engine_use_ray = False + args.disable_log_requests = False + if args.model_path: + args.model = args.model_path + if args.num_gpus > 1: + args.tensor_parallel_size = args.num_gpus + + for k, v in kwargs.items(): + setattr(args, k, v) + + engine_args = AsyncEngineArgs.from_cli_args(args) + engine = AsyncLLMEngine.from_engine_args(engine_args) + + worker = VLLMWorker( + controller_addr = args.controller_address, + worker_addr = args.worker_address, + worker_id = worker_id, + model_path = args.model_path, + model_names = args.model_names, + limit_worker_concurrency = args.limit_worker_concurrency, + no_register = args.no_register, + llm_engine = engine, + conv_template = args.conv_template, + ) + sys.modules["fastchat.serve.vllm_worker"].engine = engine + sys.modules["fastchat.serve.vllm_worker"].worker = worker + + else: + from fastchat.serve.model_worker import app, GptqConfig, AWQConfig, ModelWorker + args.gpus = "0" # GPU的编号,如果有多个GPU,可以设置为"0,1,2,3" + args.max_gpu_memory = "20GiB" + args.num_gpus = 1 # model worker的切分是model并行,这里填写显卡的数量 + + args.load_8bit = False + args.cpu_offloading = None + args.gptq_ckpt = None + args.gptq_wbits = 16 + args.gptq_groupsize = -1 + args.gptq_act_order = False + args.awq_ckpt = None + args.awq_wbits = 16 + args.awq_groupsize = -1 + args.model_names = [] + args.conv_template = None + args.limit_worker_concurrency = 5 + args.stream_interval = 2 + args.no_register = False + args.embed_in_truncate = False + args.load_kwargs = {"pre_seq_len": 16} # 改************************* + for k, v in kwargs.items(): 
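                # kwargs(一般由startup.py根据configs中FSCHAT_MODEL_WORKERS等配置组装)在此覆盖上面的默认值,
                # 因此第3节第1步在配置文件中写入的load_kwargs(pre_seq_len)会在这里生效。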
+ setattr(args, k, v) + if args.gpus: + if args.num_gpus is None: + args.num_gpus = len(args.gpus.split(',')) + if len(args.gpus.split(",")) < args.num_gpus: + raise ValueError( + f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!" + ) + os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus + gptq_config = GptqConfig( + ckpt=args.gptq_ckpt or args.model_path, + wbits=args.gptq_wbits, + groupsize=args.gptq_groupsize, + act_order=args.gptq_act_order, + ) + awq_config = AWQConfig( + ckpt=args.awq_ckpt or args.model_path, + wbits=args.awq_wbits, + groupsize=args.awq_groupsize, + ) + + worker = ModelWorker( + controller_addr=args.controller_address, + worker_addr=args.worker_address, + worker_id=worker_id, + model_path=args.model_path, + model_names=args.model_names, + limit_worker_concurrency=args.limit_worker_concurrency, + no_register=args.no_register, + device=args.device, + num_gpus=args.num_gpus, + max_gpu_memory=args.max_gpu_memory, + load_8bit=args.load_8bit, + cpu_offloading=args.cpu_offloading, + gptq_config=gptq_config, + awq_config=awq_config, + stream_interval=args.stream_interval, + conv_template=args.conv_template, + embed_in_truncate=args.embed_in_truncate, + load_kwargs=args.load_kwargs #改************************* + ) + sys.modules["fastchat.serve.model_worker"].args = args + sys.modules["fastchat.serve.model_worker"].gptq_config = gptq_config + + sys.modules["fastchat.serve.model_worker"].worker = worker + + MakeFastAPIOffline(app) + app.title = f"FastChat LLM Server ({args.model_names[0]})" + app._worker = worker + return app + ``` + +至此,我们完成了langchain-chatchat加载p-tuning的全部操作,可以如下方式加载p-tuning: + +```shell +PEFT_SHARE_BASE_WEIGHTS=true python startup.py -a + +```
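
为方便排查启动失败的问题,下面给出一个可选的自检脚本示意(非本项目自带代码,仅按上文第1节与第3节的要求做简单检查;其中默认路径 `./chatglm2-6b-peft` 与脚本名 `check_peft_dir.py` 均为示例假设,请按实际情况替换):检查peft文件夹名称是否含'peft'、adapter_config.json与pytorch_model.bin是否存在、必要字段是否齐全,并打印pre_seq_len,供配置FSCHAT_MODEL_WORKERS中的load_kwargs参考。

```python
import json
import os
import sys

# 示例路径,仅作演示,请替换为实际的peft文件夹路径
peft_dir = sys.argv[1] if len(sys.argv) > 1 else "./chatglm2-6b-peft"

# 第1节第4步要求adapter_config.json中包含的字段
required_fields = [
    "base_model_name_or_path",
    "task_type",
    "peft_type",
    "inference_mode",
    "revision",
    "num_virtual_tokens",
]

problems = []

# 第1节第3步:文件夹名称必须包含'peft'
if "peft" not in os.path.basename(os.path.normpath(peft_dir)).lower():
    problems.append("文件夹名称中不包含'peft'")

# 第1节第1、2步:必须存在adapter_config.json和pytorch_model.bin
config_path = os.path.join(peft_dir, "adapter_config.json")
if not os.path.isfile(config_path):
    problems.append("缺少adapter_config.json(训练生成的config.json需重命名)")
if not os.path.isfile(os.path.join(peft_dir, "pytorch_model.bin")):
    problems.append("缺少pytorch_model.bin")

pre_seq_len = None
if os.path.isfile(config_path):
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)
    # 检查必要字段是否齐全
    for field in required_fields:
        if field not in config:
            problems.append(f"adapter_config.json缺少字段: {field}")
    pre_seq_len = config.get("pre_seq_len")

if problems:
    print("peft文件夹检查未通过:")
    for p in problems:
        print(" -", p)
else:
    print("peft文件夹检查通过。")

# pre_seq_len用于configs/server_config.py中FSCHAT_MODEL_WORKERS的load_kwargs
print("pre_seq_len =", pre_seq_len)
```

用法示例:`python check_peft_dir.py /path/to/xxx-peft`;检查通过后,再执行 `PEFT_SHARE_BASE_WEIGHTS=true python startup.py -a` 启动。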