From 8ed1f8efca439c4895654c5fb7f4cc2d8f2cd9b2 Mon Sep 17 00:00:00 2001
From: liunux4odoo
Date: Wed, 16 Aug 2023 16:15:45 +0800
Subject: [PATCH] update fastchat to 0.2.24

---
 requirements.txt     |  4 ++--
 requirements_api.txt |  4 ++--
 server/llm_api.py    | 36 +++++++++++++++++++++++++-----------
 3 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 6c013e5..646a5c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
 langchain==0.0.257
 openai
 sentence_transformers
-fschat==0.2.20
-transformers
+fschat==0.2.24
+transformers>=4.31.0
 torch~=2.0.0
 fastapi~=0.99.1
 nltk~=3.8.1
diff --git a/requirements_api.txt b/requirements_api.txt
index 9b45aac..1e13587 100644
--- a/requirements_api.txt
+++ b/requirements_api.txt
@@ -1,8 +1,8 @@
 langchain==0.0.257
 openai
 sentence_transformers
-fschat==0.2.20
-transformers
+fschat==0.2.24
+transformers>=4.31.0
 torch~=2.0.0
 fastapi~=0.99.1
 nltk~=3.8.1
diff --git a/server/llm_api.py b/server/llm_api.py
index e1013ed..49d8292 100644
--- a/server/llm_api.py
+++ b/server/llm_api.py
@@ -38,27 +38,31 @@ def create_controller_app(
 
 
 def create_model_worker_app(
+        worker_address=base_url.format(model_worker_port),
+        controller_address=base_url.format(controller_port),
         model_path=llm_model_dict[LLM_MODEL].get("local_model_path"),
-        model_names=[LLM_MODEL],
         device=LLM_DEVICE,
+        gpus=None,
+        max_gpu_memory="20GiB",
         load_8bit=False,
+        cpu_offloading=None,
         gptq_ckpt=None,
         gptq_wbits=16,
         gptq_groupsize=-1,
-        gptq_act_order=None,
-        gpus=None,
-        num_gpus=1,
-        max_gpu_memory="20GiB",
-        cpu_offloading=None,
-        worker_address=base_url.format(model_worker_port),
-        controller_address=base_url.format(controller_port),
+        gptq_act_order=False,
+        awq_ckpt=None,
+        awq_wbits=16,
+        awq_groupsize=-1,
+        model_names=[LLM_MODEL],
+        num_gpus=1, # not in fastchat
+        conv_template=None,
         limit_worker_concurrency=5,
         stream_interval=2,
         no_register=False,
 ):
     import fastchat.constants
     fastchat.constants.LOGDIR = LOG_PATH
-    from fastchat.serve.model_worker import app, GptqConfig, ModelWorker, worker_id
+    from fastchat.serve.model_worker import app, GptqConfig, AWQConfig, ModelWorker, worker_id
     import argparse
     parser = argparse.ArgumentParser()
@@ -71,12 +75,16 @@ def create_model_worker_app(
     args.gptq_wbits = gptq_wbits
     args.gptq_groupsize = gptq_groupsize
     args.gptq_act_order = gptq_act_order
+    args.awq_ckpt = awq_ckpt
+    args.awq_wbits = awq_wbits
+    args.awq_groupsize = awq_groupsize
     args.gpus = gpus
     args.num_gpus = num_gpus
     args.max_gpu_memory = max_gpu_memory
     args.cpu_offloading = cpu_offloading
     args.worker_address = worker_address
     args.controller_address = controller_address
+    args.conv_template = conv_template
     args.limit_worker_concurrency = limit_worker_concurrency
     args.stream_interval = stream_interval
     args.no_register = no_register
@@ -98,6 +106,12 @@ def create_model_worker_app(
         groupsize=args.gptq_groupsize,
         act_order=args.gptq_act_order,
     )
+    awq_config = AWQConfig(
+        ckpt=args.awq_ckpt or args.model_path,
+        wbits=args.awq_wbits,
+        groupsize=args.awq_groupsize,
+    )
+
     # torch.multiprocessing.set_start_method('spawn')
     worker = ModelWorker(
         controller_addr=args.controller_address,
@@ -113,7 +127,9 @@ def create_model_worker_app(
         load_8bit=args.load_8bit,
         cpu_offloading=args.cpu_offloading,
         gptq_config=gptq_config,
+        awq_config=awq_config,
         stream_interval=args.stream_interval,
+        conv_template=args.conv_template,
     )
 
     sys.modules["fastchat.serve.model_worker"].worker = worker
@@ -126,8 +142,6 @@ def create_model_worker_app(
 
 
 def create_openai_api_app(
-        host=host_ip,
-        port=openai_api_port,
         controller_address=base_url.format(controller_port),
         api_keys=[],
 ):
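
Note (not part of the patch): a minimal sketch of how the patched create_model_worker_app might be driven with the parameters this change introduces (awq_ckpt/awq_wbits/awq_groupsize and conv_template). The uvicorn call, the port, and the checkpoint path below are assumptions for illustration, not taken from this patch.

# Hypothetical launch script, assuming create_model_worker_app returns the
# FastAPI app imported from fastchat.serve.model_worker.
import uvicorn

from server.llm_api import create_model_worker_app

app = create_model_worker_app(
    awq_ckpt="/models/my-model-awq",  # hypothetical AWQ-quantized checkpoint path
    awq_wbits=4,                      # typical AWQ bit width (assumption)
    awq_groupsize=128,                # typical AWQ group size (assumption)
    conv_template=None,               # new in fschat 0.2.24: optionally force a conversation template
)

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=20002)  # host/port are assumptions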