update fastchat to 0.2.24

parent f64affc930
commit 8ed1f8efca
@@ -1,8 +1,8 @@
 langchain==0.0.257
 openai
 sentence_transformers
-fschat==0.2.20
-transformers
+fschat==0.2.24
+transformers>=4.31.0
 torch~=2.0.0
 fastapi~=0.99.1
 nltk~=3.8.1
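The two pins likely move together: fschat 0.2.24 presumably needs a newer transformers than an unpinned install would guarantee, and 4.31.0 is the first transformers release with Llama 2 support. A quick post-install sanity check, as a minimal sketch that only assumes both packages are importable in the current environment:

    # Check the new pins after installation (sketch; package names are the
    # ones from the requirements above, the >= floor is the pinned 4.31.0).
    from importlib.metadata import version

    assert version("fschat") == "0.2.24", version("fschat")

    major, minor, *_ = (int(x) for x in version("transformers").split(".")[:2])
    assert (major, minor) >= (4, 31), version("transformers")
    print("pinned versions satisfied")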
@@ -38,27 +38,31 @@ def create_controller_app(
 
 
 def create_model_worker_app(
+        worker_address=base_url.format(model_worker_port),
+        controller_address=base_url.format(controller_port),
         model_path=llm_model_dict[LLM_MODEL].get("local_model_path"),
-        model_names=[LLM_MODEL],
         device=LLM_DEVICE,
+        gpus=None,
+        max_gpu_memory="20GiB",
         load_8bit=False,
+        cpu_offloading=None,
         gptq_ckpt=None,
         gptq_wbits=16,
         gptq_groupsize=-1,
-        gptq_act_order=None,
-        gpus=None,
-        num_gpus=1,
-        max_gpu_memory="20GiB",
-        cpu_offloading=None,
-        worker_address=base_url.format(model_worker_port),
-        controller_address=base_url.format(controller_port),
+        gptq_act_order=False,
+        awq_ckpt=None,
+        awq_wbits=16,
+        awq_groupsize=-1,
+        model_names=[LLM_MODEL],
+        num_gpus=1, # not in fastchat
+        conv_template=None,
         limit_worker_concurrency=5,
         stream_interval=2,
         no_register=False,
 ):
     import fastchat.constants
     fastchat.constants.LOGDIR = LOG_PATH
-    from fastchat.serve.model_worker import app, GptqConfig, ModelWorker, worker_id
+    from fastchat.serve.model_worker import app, GptqConfig, AWQConfig, ModelWorker, worker_id
     import argparse
 
     parser = argparse.ArgumentParser()
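For reference, a hypothetical call with the reworked signature; every concrete value below is a placeholder chosen for illustration, not something taken from this commit:

    # Hypothetical invocation of the reworked factory (placeholder values).
    app = create_model_worker_app(
        worker_address="http://127.0.0.1:20002",
        controller_address="http://127.0.0.1:20001",
        model_path="/models/chatglm2-6b",
        device="cuda",
        awq_ckpt=None,        # stay None unless serving an AWQ-quantized checkpoint
        conv_template=None,   # None should let fastchat pick a template from the model path
    )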
@@ -71,12 +75,16 @@ def create_model_worker_app(
     args.gptq_wbits = gptq_wbits
     args.gptq_groupsize = gptq_groupsize
     args.gptq_act_order = gptq_act_order
+    args.awq_ckpt = awq_ckpt
+    args.awq_wbits = awq_wbits
+    args.awq_groupsize = awq_groupsize
     args.gpus = gpus
     args.num_gpus = num_gpus
     args.max_gpu_memory = max_gpu_memory
     args.cpu_offloading = cpu_offloading
     args.worker_address = worker_address
     args.controller_address = controller_address
+    args.conv_template = conv_template
     args.limit_worker_concurrency = limit_worker_concurrency
     args.stream_interval = stream_interval
     args.no_register = no_register
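Setting attributes on `args` works because the function builds an empty argparse namespace (presumably via `parser.parse_args([])` in the elided lines) and then overrides each field from its keyword arguments, so downstream fastchat code that expects CLI-parsed args keeps working. The pattern in isolation, with a hypothetical option name:

    # Parse an empty argv, then override the namespace programmatically.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--stream-interval", type=int, default=2)  # hypothetical option
    args = parser.parse_args([])   # empty argv: every field takes its default
    args.stream_interval = 5       # override from function kwargs, as above
    print(args.stream_interval)    # -> 5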
@@ -98,6 +106,12 @@ def create_model_worker_app(
         groupsize=args.gptq_groupsize,
         act_order=args.gptq_act_order,
     )
+    awq_config = AWQConfig(
+        ckpt=args.awq_ckpt or args.model_path,
+        wbits=args.awq_wbits,
+        groupsize=args.awq_groupsize,
+    )
+
     # torch.multiprocessing.set_start_method('spawn')
     worker = ModelWorker(
         controller_addr=args.controller_address,
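AWQConfig appears to be the AWQ counterpart of GptqConfig, and the `or` makes the AWQ checkpoint default to the model path when no separate checkpoint is supplied. A sketch of the shape being constructed (an approximation, not fastchat's source):

    # Approximate shape of the config object (sketch, not fastchat's source).
    import dataclasses

    @dataclasses.dataclass
    class AWQConfig:
        ckpt: str = None     # AWQ checkpoint path; callers fall back to model_path
        wbits: int = 16      # 16 effectively disables AWQ quantization
        groupsize: int = -1

    print(AWQConfig(ckpt=None or "/models/demo"))  # ckpt="/models/demo"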
@@ -113,7 +127,9 @@ def create_model_worker_app(
         load_8bit=args.load_8bit,
         cpu_offloading=args.cpu_offloading,
         gptq_config=gptq_config,
+        awq_config=awq_config,
         stream_interval=args.stream_interval,
+        conv_template=args.conv_template,
     )
 
     sys.modules["fastchat.serve.model_worker"].worker = worker
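The sys.modules line swaps the module-global `worker` inside fastchat.serve.model_worker, so the imported `app`'s request handlers, which read that global at call time, use this locally constructed worker instead of one built from fastchat's own CLI. The mechanism in a self-contained sketch with a stand-in module:

    # Demonstrate patching a module global via sys.modules (stand-in module).
    import sys
    import types

    mod = types.ModuleType("demo_worker_module")
    mod.worker = "module's own worker"
    sys.modules["demo_worker_module"] = mod

    sys.modules["demo_worker_module"].worker = "locally constructed worker"

    import demo_worker_module
    print(demo_worker_module.worker)  # -> locally constructed worker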
@@ -126,8 +142,6 @@ def create_model_worker_app(
 
 
 def create_openai_api_app(
-        host=host_ip,
-        port=openai_api_port,
         controller_address=base_url.format(controller_port),
         api_keys=[],
 ):
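Dropping host and port from create_openai_api_app fits a factory that only configures the FastAPI app, leaving the bind address to whoever runs the server. Under that assumption, a hypothetical launch (addresses are placeholders):

    # Hypothetical launch: the factory configures the app, the caller binds it.
    import uvicorn

    app = create_openai_api_app(
        controller_address="http://127.0.0.1:20001",
        api_keys=[],
    )
    uvicorn.run(app, host="127.0.0.1", port=8888)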