diff --git a/configs/server_config.py.example b/configs/server_config.py.example
index 6197052..5496dad 100644
--- a/configs/server_config.py.example
+++ b/configs/server_config.py.example
@@ -38,7 +38,8 @@ FSCHAT_MODEL_WORKERS = {
         "host": DEFAULT_BIND_HOST,
         "port": 20002,
         "device": LLM_DEVICE,
-        "infer_turbo": False  # one of [False, 'vllm']: the inference acceleration framework to use; if vllm hits HuggingFace connectivity issues, see doc/FAQ
+        # one of [False, 'vllm']: the inference acceleration framework to use; if vllm hits HuggingFace connectivity issues, see doc/FAQ
+        "infer_turbo": "vllm" if sys.platform.startswith("linux") else False,

         # parameters needed for multi-GPU loading by model_worker
         # "gpus": None,  # GPUs to use, given as a str such as "0,1"; if this has no effect, set CUDA_VISIBLE_DEVICES="0,1" instead
diff --git a/startup.py b/startup.py
index c4f9e7b..d886227 100644
--- a/startup.py
+++ b/startup.py
@@ -78,7 +78,6 @@ def create_model_worker_app(log_level: str = "INFO", **kwargs) -> FastAPI:
     from fastchat.serve.model_worker import worker_id, logger
     import argparse
     import fastchat.serve.model_worker
-    import fastchat.serve.vllm_worker

     logger.setLevel(log_level)
     parser = argparse.ArgumentParser()
@@ -98,6 +97,7 @@ def create_model_worker_app(log_level: str = "INFO", **kwargs) -> FastAPI:
     else:
         from configs.model_config import VLLM_MODEL_DICT
         if kwargs["model_names"][0] in VLLM_MODEL_DICT and args.infer_turbo == "vllm":
+            import fastchat.serve.vllm_worker
             from fastchat.serve.vllm_worker import VLLMWorker,app
             from vllm import AsyncLLMEngine
             from vllm.engine.arg_utils import AsyncEngineArgs,EngineArgs
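
The startup.py hunk turns the vllm_worker import into a lazy, branch-local import, so vllm only needs to be installed when infer_turbo == "vllm" is actually selected, which the config change now enables on Linux only. Below is a minimal sketch of that lazy-import pattern, assuming fastchat is installed; load_worker is a hypothetical helper named here for illustration, not part of this PR.

    import sys

    def load_worker(infer_turbo=False):
        # Deferred import: vllm (pulled in by fastchat.serve.vllm_worker) is
        # only required on the code path that uses it, so platforms where
        # infer_turbo stays False never need it installed.
        if infer_turbo == "vllm":
            from fastchat.serve.vllm_worker import VLLMWorker, app
            return VLLMWorker, app
        # Default path: the plain model_worker is available everywhere.
        from fastchat.serve.model_worker import ModelWorker, app
        return ModelWorker, app

    # Mirrors the config change: prefer vllm only on Linux.
    worker_cls, app = load_worker("vllm" if sys.platform.startswith("linux") else False)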