Add an implementation for accessing Baichuan-13B-Chat (#1005)

Vincent chen 2023-08-10 21:55:35 +08:00 committed by GitHub
parent 22c6192561
commit 62047c880e
5 changed files with 134 additions and 19 deletions

@@ -241,17 +241,28 @@ class LocalDocQA:
        else:
            prompt = query

        # Code branch for the Baichuan integration
        if LLM_MODEL == "Baichuan-13B-Chat":
            for answer_result in self.llm_model_chain._generate_answer(prompt=prompt, history=chat_history,
                                                                       streaming=streaming):
                resp = answer_result.llm_output["answer"]
                history = answer_result.history
                response = {"query": query,
                            "result": resp,
                            "source_documents": related_docs_with_score}
                yield response, history
        else:  # Original logic branch
            answer_result_stream_result = self.llm_model_chain(
                {"prompt": prompt, "history": chat_history, "streaming": streaming})
            for answer_result in answer_result_stream_result['answer_result_stream']:
                resp = answer_result.llm_output["answer"]
                history = answer_result.history
                history[-1][0] = query
                response = {"query": query,
                            "result": resp,
                            "source_documents": related_docs_with_score}
                yield response, history

    # query: the query text
    # vs_path: path to the knowledge base (vector store)
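The Baichuan branch above bypasses the chain call and iterates `_generate_answer` directly, but callers still receive `(response, history)` pairs from the generator. A minimal consumption sketch, assuming the surrounding method is `LocalDocQA.get_knowledge_based_answer(query, vs_path, chat_history, streaming)` as elsewhere in the project; `local_doc_qa` and the vector-store path below are placeholders:

    # Sketch only; `local_doc_qa` is a hypothetical, already-initialized LocalDocQA
    # instance and the vs_path is a placeholder.
    for response, history in local_doc_qa.get_knowledge_based_answer(
            query="How do I configure the knowledge base?",
            vs_path="/path/to/vector_store",
            chat_history=[],
            streaming=True):
        print(response["result"])                 # partial answer while streaming
        print(len(response["source_documents"]))  # retrieved context chunks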

@@ -156,6 +156,12 @@ llm_model_dict = {
        "local_model_path": None,
        "provides": "MOSSLLMChain"
    },
    "Baichuan-13b-Chat": {
        "name": "Baichuan-13b-Chat",
        "pretrained_model_name": "baichuan-inc/Baichuan-13b-Chat",
        "local_model_path": None,
        "provides": "BaichuanLLMChain"
    },
    # For llama-cpp model compatibility issues, see https://github.com/abetlen/llama-cpp-python/issues/204
    "ggml-vicuna-13b-1.1-q5": {
        "name": "ggml-vicuna-13b-1.1-q5",
@@ -244,6 +250,9 @@ USE_LORA = True if LORA_NAME else False
# LLM streaming response
STREAMING = True

# Set the full path to the Baichuan LoRA here
LORA_MODEL_PATH_BAICHUAN = ""

# Use p-tuning-v2 PrefixEncoder
USE_PTUNING_V2 = False
PTUNING_DIR = './ptuning-v2'
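Taken together, the two config additions register the model in `llm_model_dict` and expose an optional LoRA path (an empty string disables it). Note the new entry is spelled "Baichuan-13b-Chat" (lowercase b) while the branches in the other files check for "Baichuan-13B-Chat", so the two spellings must be kept consistent. A small resolution sketch, assuming `LLM_MODEL` is set to the new key:

    # Resolution sketch (illustration only); assumes LLM_MODEL points at the entry
    # added above and that its spelling matches the branch checks.
    from configs.model_config import llm_model_dict, LLM_MODEL, LORA_MODEL_PATH_BAICHUAN

    params = llm_model_dict[LLM_MODEL]
    checkpoint = params["local_model_path"] or params["pretrained_model_name"]
    provides = params["provides"]                       # "BaichuanLLMChain"
    use_baichuan_lora = bool(LORA_MODEL_PATH_BAICHUAN)  # "" -> no LoRA adapter
    print(checkpoint, provides, use_baichuan_lora)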

@@ -3,3 +3,5 @@ from .llama_llm import LLamaLLMChain
from .chatglmcpp_llm import ChatGLMCppLLMChain
from .fastchat_openai_llm import FastChatOpenAILLMChain
from .moss_llm import MOSSLLMChain
from .baichuan_llm import BaichuanLLMChain
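Re-exporting `BaichuanLLMChain` from the models package matters if, as with the other `provides` entries, the project resolves the configured class name via attribute lookup on that package (an assumption about the surrounding code, not shown in this commit):

    # Sketch only: resolving the "provides" name to the class re-exported above.
    import models
    chain_cls = getattr(models, "BaichuanLLMChain")  # available thanks to the new import
    print(chain_cls.__name__)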

models/baichuan_llm.py (new file, 71 lines)

@@ -0,0 +1,71 @@
from abc import ABC
from langchain.llms.base import LLM
from typing import Optional, List
from models.loader import LoaderCheckPoint
from models.base import (BaseAnswer,
                         AnswerResult)


class BaichuanLLMChain(BaseAnswer, LLM, ABC):
    max_token: int = 10000
    temperature: float = 0.01
    top_p = 0.9
    checkPoint: LoaderCheckPoint = None
    # history = []
    history_len: int = 10

    def __init__(self, checkPoint: LoaderCheckPoint = None):
        super().__init__()
        self.checkPoint = checkPoint

    @property
    def _llm_type(self) -> str:
        return "BaichuanLLMChain"

    @property
    def _check_point(self) -> LoaderCheckPoint:
        return self.checkPoint

    @property
    def _history_len(self) -> int:
        return self.history_len

    def set_history_len(self, history_len: int = 10) -> None:
        self.history_len = history_len

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        print(f"__call:{prompt}")
        response, _ = self.checkPoint.model.chat(
            self.checkPoint.tokenizer,
            prompt,
            # history=[],
            # max_length=self.max_token,
            # temperature=self.temperature
        )
        print(f"response:{response}")
        print(f"+++++++++++++++++++++++++++++++++++")
        return response

    def _generate_answer(self, prompt: str,
                         history: List[List[str]] = [],
                         streaming: bool = False):
        messages = []
        messages.append({"role": "user", "content": prompt})
        if streaming:
            for inum, stream_resp in enumerate(self.checkPoint.model.chat(
                    self.checkPoint.tokenizer,
                    messages,
                    stream=True
            )):
                self.checkPoint.clear_torch_cache()
                answer_result = AnswerResult()
                answer_result.llm_output = {"answer": stream_resp}
                yield answer_result
        else:
            response = self.checkPoint.model.chat(
                self.checkPoint.tokenizer,
                messages
            )
            self.checkPoint.clear_torch_cache()
            answer_result = AnswerResult()
            answer_result.llm_output = {"answer": response}
            yield answer_result
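A usage sketch for the new chain, assuming a `LoaderCheckPoint` whose `model` and `tokenizer` have already been loaded by the loader shown in the next hunk; the `loader_checkpoint` variable below is hypothetical:

    # Sketch only; `loader_checkpoint` stands in for a LoaderCheckPoint that has
    # already loaded the Baichuan weights and tokenizer.
    from models.baichuan_llm import BaichuanLLMChain

    llm_chain = BaichuanLLMChain(checkPoint=loader_checkpoint)
    llm_chain.set_history_len(5)

    # Streaming: each AnswerResult carries the text generated so far.
    for answer_result in llm_chain._generate_answer(prompt="Introduce yourself.",
                                                    history=[], streaming=True):
        print(answer_result.llm_output["answer"])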

@@ -9,7 +9,9 @@ import torch
import transformers
from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM,
                          AutoTokenizer, LlamaTokenizer)
from configs.model_config import LLM_DEVICE, LLM_MODEL, LORA_MODEL_PATH_BAICHUAN
from peft import PeftModel
from transformers.generation.utils import GenerationConfig


class LoaderCheckPoint:
    """
@@ -126,14 +128,34 @@ class LoaderCheckPoint:
            # Decide whether to use multi-GPU deployment based on the number of GPUs on this device
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and self.device_map is None:
                # if LORA_MODEL_PATH_BAICHUAN is not None:
                if LORA_MODEL_PATH_BAICHUAN:
                    if LLM_MODEL == "Baichuan-13B-Chat":
                        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16,
                                                                     device_map="auto", trust_remote_code=True)
                        model.generation_config = GenerationConfig.from_pretrained(checkpoint)
                        from configs.model_config import LLM_DEVICE, LORA_MODEL_PATH_BAICHUAN
                        # if LORA_MODEL_PATH_BAICHUAN is not None:
                        if LORA_MODEL_PATH_BAICHUAN:
                            print("loading lora:{path}".format(path=LORA_MODEL_PATH_BAICHUAN))
                            model = PeftModel.from_pretrained(
                                model,
                                LORA_MODEL_PATH_BAICHUAN,
                                torch_dtype=torch.float16,
                                device_map={"": LLM_DEVICE}
                            )
                        tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False,
                                                                  trust_remote_code=True)
                        model.half().cuda()
                else:
                    model = (
                        LoaderClass.from_pretrained(checkpoint,
                                                    config=self.model_config,
                                                    torch_dtype=torch.bfloat16 if self.bf16 else torch.float16,
                                                    trust_remote_code=True)
                        .half()
                        .cuda()
                    )
            # Support custom CUDA devices
            elif ":" in self.llm_device:
                model = LoaderClass.from_pretrained(checkpoint,
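The loader hunk follows a load-then-wrap pattern: load the base Baichuan checkpoint, restore its generation config, then attach the LoRA adapter with `peft` when a path is configured. A standalone sketch of that pattern, using placeholder paths and only documented `transformers`/`peft` calls rather than the project's loader internals:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation.utils import GenerationConfig
    from peft import PeftModel

    checkpoint = "baichuan-inc/Baichuan-13B-Chat"  # or a local model directory
    lora_path = "/path/to/baichuan-lora"           # stands in for LORA_MODEL_PATH_BAICHUAN

    # Base model in fp16; trust_remote_code pulls in Baichuan's custom modeling/chat code.
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False, trust_remote_code=True)

    if lora_path:
        # Wrap the frozen base weights with the LoRA adapter.
        model = PeftModel.from_pretrained(model, lora_path, torch_dtype=torch.float16)

    model.eval()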