From 62047c880eb58b9efd2a3d8265776747ff079b22 Mon Sep 17 00:00:00 2001
From: Vincent chen <34907633+Vincent-ch99@users.noreply.github.com>
Date: Thu, 10 Aug 2023 21:55:35 +0800
Subject: [PATCH] Add an implementation for accessing Baichuan-13B-Chat (#1005)

---
 chains/local_doc_qa.py  | 31 ++++++++++++------
 configs/model_config.py |  9 ++++++
 models/__init__.py      |  2 ++
 models/baichuan_llm.py  | 71 +++++++++++++++++++++++++++++++++++++++++
 models/loader/loader.py | 40 +++++++++++++++++------
 5 files changed, 134 insertions(+), 19 deletions(-)
 create mode 100644 models/baichuan_llm.py

diff --git a/chains/local_doc_qa.py b/chains/local_doc_qa.py
index 6085dfc..c1f1aa3 100644
--- a/chains/local_doc_qa.py
+++ b/chains/local_doc_qa.py
@@ -241,17 +241,28 @@ class LocalDocQA:
         else:
             prompt = query
 
-        answer_result_stream_result = self.llm_model_chain(
-            {"prompt": prompt, "history": chat_history, "streaming": streaming})
+        # Branch for the Baichuan-13B-Chat integration:
+        if LLM_MODEL == "Baichuan-13B-Chat":
+            for answer_result in self.llm_model_chain._generate_answer(prompt=prompt, history=chat_history,
+                                                                       streaming=streaming):
+                resp = answer_result.llm_output["answer"]
+                history = answer_result.history
+                history[-1][0] = query
+                response = {"query": query,
+                            "result": resp,
+                            "source_documents": related_docs_with_score}
+                yield response, history
+        else:  # original logic branch:
+            answer_result_stream_result = self.llm_model_chain(
+                {"prompt": prompt, "history": chat_history, "streaming": streaming})
 
-        for answer_result in answer_result_stream_result['answer_result_stream']:
-            resp = answer_result.llm_output["answer"]
-            history = answer_result.history
-            history[-1][0] = query
-            response = {"query": query,
-                        "result": resp,
-                        "source_documents": related_docs_with_score}
-            yield response, history
+            for answer_result in answer_result_stream_result['answer_result_stream']:
+                resp = answer_result.llm_output["answer"]
+                history = answer_result.history
+                history[-1][0] = query
+                response = {"query": query,
+                            "result": resp,
+                            "source_documents": related_docs_with_score}
+                yield response, history
 
     # query: the query text
     # vs_path: path to the knowledge base (vector store)
diff --git a/configs/model_config.py b/configs/model_config.py
index f721c89..2ca08d6 100644
--- a/configs/model_config.py
+++ b/configs/model_config.py
@@ -156,6 +156,12 @@ llm_model_dict = {
         "local_model_path": None,
         "provides": "MOSSLLMChain"
     },
+    "Baichuan-13B-Chat": {
+        "name": "Baichuan-13B-Chat",
+        "pretrained_model_name": "baichuan-inc/Baichuan-13B-Chat",
+        "local_model_path": None,
+        "provides": "BaichuanLLMChain"
+    },
     # For llama-cpp model compatibility issues, see https://github.com/abetlen/llama-cpp-python/issues/204
     "ggml-vicuna-13b-1.1-q5": {
         "name": "ggml-vicuna-13b-1.1-q5",
@@ -244,6 +250,9 @@ USE_LORA = True if LORA_NAME else False
 # LLM streaming response
 STREAMING = True
 
+# Full local path to the Baichuan LoRA weights; leave empty to load the base model only
+LORA_MODEL_PATH_BAICHUAN = ""
+
 # Use p-tuning-v2 PrefixEncoder
 USE_PTUNING_V2 = False
 PTUNING_DIR='./ptuning-v2'
diff --git a/models/__init__.py b/models/__init__.py
index d7ae7d5..0975fde 100644
--- a/models/__init__.py
+++ b/models/__init__.py
@@ -3,3 +3,5 @@ from .llama_llm import LLamaLLMChain
 from .chatglmcpp_llm import ChatGLMCppLLMChain
 from .fastchat_openai_llm import FastChatOpenAILLMChain
 from .moss_llm import MOSSLLMChain
+from .baichuan_llm import BaichuanLLMChain
+
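For context, a minimal standalone sketch (not part of the patch) of how the new Baichuan branch above is driven: get_knowledge_based_answer iterates the generator returned by BaichuanLLMChain._generate_answer and yields each partial answer together with the updated history. The helper name stream_knowledge_answer and its parameters are hypothetical; the sketch assumes a chain already constructed around a loaded LoaderCheckPoint.

# Hypothetical driver mirroring the new branch in get_knowledge_based_answer.
from models.baichuan_llm import BaichuanLLMChain


def stream_knowledge_answer(chain: BaichuanLLMChain, query: str, prompt: str,
                            chat_history=None, related_docs=None):
    chat_history = chat_history or []
    related_docs = related_docs or []
    for answer_result in chain._generate_answer(prompt=prompt,
                                                history=chat_history,
                                                streaming=True):
        resp = answer_result.llm_output["answer"]
        history = answer_result.history
        yield {"query": query,
               "result": resp,
               "source_documents": related_docs}, history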
diff --git a/models/baichuan_llm.py b/models/baichuan_llm.py
new file mode 100644
index 0000000..1a8596c
--- /dev/null
+++ b/models/baichuan_llm.py
@@ -0,0 +1,71 @@
+from abc import ABC
+from langchain.llms.base import LLM
+from typing import Optional, List
+from models.loader import LoaderCheckPoint
+from models.base import (BaseAnswer,
+                         AnswerResult)
+
+class BaichuanLLMChain(BaseAnswer, LLM, ABC):
+    max_token: int = 10000
+    temperature: float = 0.01
+    top_p = 0.9
+    checkPoint: LoaderCheckPoint = None
+    history_len: int = 10
+
+    def __init__(self, checkPoint: LoaderCheckPoint = None):
+        super().__init__()
+        self.checkPoint = checkPoint
+
+    @property
+    def _llm_type(self) -> str:
+        return "BaichuanLLMChain"
+
+    @property
+    def _check_point(self) -> LoaderCheckPoint:
+        return self.checkPoint
+
+    @property
+    def _history_len(self) -> int:
+        return self.history_len
+
+    def set_history_len(self, history_len: int = 10) -> None:
+        self.history_len = history_len
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        print(f"__call:{prompt}")
+        # Baichuan-13B-Chat's chat() expects a list of {"role", "content"} messages
+        # and returns the generated reply as a plain string.
+        messages = [{"role": "user", "content": prompt}]
+        response = self.checkPoint.model.chat(
+            self.checkPoint.tokenizer,
+            messages
+        )
+        print(f"response:{response}")
+        return response
+
+    def _generate_answer(self, prompt: str,
+                         history: List[List[str]] = [],
+                         streaming: bool = False):
+        # Only the current prompt is sent to the model; the running history is
+        # updated and returned to the caller, mirroring the other chains.
+        messages = [{"role": "user", "content": prompt}]
+        history = history + [[prompt, ""]]
+        if streaming:
+            # With stream=True, chat() yields the partial response generated so far.
+            for stream_resp in self.checkPoint.model.chat(
+                    self.checkPoint.tokenizer,
+                    messages,
+                    stream=True
+            ):
+                self.checkPoint.clear_torch_cache()
+                history[-1] = [prompt, stream_resp]
+                answer_result = AnswerResult()
+                answer_result.history = history
+                answer_result.llm_output = {"answer": stream_resp}
+                yield answer_result
+        else:
+            response = self.checkPoint.model.chat(
+                self.checkPoint.tokenizer,
+                messages
+            )
+            self.checkPoint.clear_torch_cache()
+            history[-1] = [prompt, response]
+            answer_result = AnswerResult()
+            answer_result.history = history
+            answer_result.llm_output = {"answer": response}
+            yield answer_result
\ No newline at end of file
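The chain above wraps the chat() helper exposed by Baichuan-13B-Chat's remote code, following the usage documented on the baichuan-inc/Baichuan-13B-Chat model card, which is also what the loader change below relies on. A minimal standalone sketch; the prompt text and device placement are illustrative only:

# Standalone sketch of the underlying Baichuan-13B-Chat API (illustrative values).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

checkpoint = "baichuan-inc/Baichuan-13B-Chat"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16,
                                             device_map="auto", trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained(checkpoint)

messages = [{"role": "user", "content": "What is the second highest mountain in the world?"}]

# Non-streaming: chat() returns the full reply as a string.
response = model.chat(tokenizer, messages)
print(response)

# Streaming: with stream=True, chat() yields the partial reply generated so far,
# which is exactly what _generate_answer above iterates over.
for partial in model.chat(tokenizer, messages, stream=True):
    print(partial)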
diff --git a/models/loader/loader.py b/models/loader/loader.py
index 6950673..6ba46cf 100644
--- a/models/loader/loader.py
+++ b/models/loader/loader.py
@@ -9,7 +9,9 @@ import torch
 import transformers
 from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer)
-from configs.model_config import LLM_DEVICE
+from configs.model_config import LLM_DEVICE, LLM_MODEL, LORA_MODEL_PATH_BAICHUAN
+from peft import PeftModel
+from transformers.generation.utils import GenerationConfig
 
 
 class LoaderCheckPoint:
     """
@@ -126,14 +128,34 @@ class LoaderCheckPoint:
                 # Decide whether to deploy across multiple GPUs based on how many are available
                 num_gpus = torch.cuda.device_count()
                 if num_gpus < 2 and self.device_map is None:
-                    model = (
-                        LoaderClass.from_pretrained(checkpoint,
-                                                    config=self.model_config,
-                                                    torch_dtype=torch.bfloat16 if self.bf16 else torch.float16,
-                                                    trust_remote_code=True)
-                        .half()
-                        .cuda()
-                    )
+                    # Dedicated branch for Baichuan-13B-Chat, optionally wrapped with LoRA weights:
+                    if LLM_MODEL == "Baichuan-13B-Chat":
+                        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16,
+                                                                     device_map="auto", trust_remote_code=True)
+                        model.generation_config = GenerationConfig.from_pretrained(checkpoint)
+                        if LORA_MODEL_PATH_BAICHUAN:
+                            print("loading lora:{path}".format(path=LORA_MODEL_PATH_BAICHUAN))
+                            model = PeftModel.from_pretrained(
+                                model,
+                                LORA_MODEL_PATH_BAICHUAN,
+                                torch_dtype=torch.float16,
+                                device_map={"": LLM_DEVICE}
+                            )
+                        tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False,
+                                                                  trust_remote_code=True)
+                        # float16 weights are already cast and placed by device_map="auto"
+                    else:
+                        model = (
+                            LoaderClass.from_pretrained(checkpoint,
+                                                        config=self.model_config,
+                                                        torch_dtype=torch.bfloat16 if self.bf16 else torch.float16,
+                                                        trust_remote_code=True)
+                            .half()
+                            .cuda()
+                        )
                 # Support custom CUDA devices
                 elif ":" in self.llm_device:
                     model = LoaderClass.from_pretrained(checkpoint,
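To enable the new loader path, the idea is to select the new llm_model_dict entry and, optionally, point LORA_MODEL_PATH_BAICHUAN at local LoRA weights. A minimal configuration sketch; the LoRA path below is a placeholder, and leaving it empty skips the PeftModel wrapper:

# configs/model_config.py (illustrative values, not part of the patch)
LLM_MODEL = "Baichuan-13B-Chat"   # selects the "BaichuanLLMChain" entry in llm_model_dict
LORA_MODEL_PATH_BAICHUAN = "/path/to/baichuan-13b-chat-lora"   # placeholder; "" disables LoRA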