Add an implementation for accessing Baichuan-13B-Chat (#1005)

Vincent chen 2023-08-10 21:55:35 +08:00 committed by GitHub
parent 22c6192561
commit 62047c880e
5 changed files with 134 additions and 19 deletions

@@ -241,17 +241,28 @@ class LocalDocQA:
        else:
            prompt = query

        # Code branch for the Baichuan integration
        if LLM_MODEL == "Baichuan-13B-Chat":
            for answer_result in self.llm_model_chain._generate_answer(prompt=prompt, history=chat_history,
                                                                       streaming=streaming):
                resp = answer_result.llm_output["answer"]
                history = answer_result.history
                response = {"query": query,
                            "result": resp,
                            "source_documents": related_docs_with_score}
                yield response, history
        else:  # Original logic branch
            answer_result_stream_result = self.llm_model_chain(
                {"prompt": prompt, "history": chat_history, "streaming": streaming})
            for answer_result in answer_result_stream_result['answer_result_stream']:
                resp = answer_result.llm_output["answer"]
                history = answer_result.history
                history[-1][0] = query
                response = {"query": query,
                            "result": resp,
                            "source_documents": related_docs_with_score}
                yield response, history

    # query: the query text
    # vs_path: path to the knowledge base (vector store)
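The Baichuan branch above bypasses the chain call and iterates `_generate_answer` directly, but callers still receive `(response, history)` pairs from the generator. A minimal consumption sketch, assuming the surrounding method is `LocalDocQA.get_knowledge_based_answer(query, vs_path, chat_history, streaming)` as elsewhere in the project; `local_doc_qa` and the vector-store path below are placeholders:

    # Sketch only; `local_doc_qa` is a hypothetical, already-initialized LocalDocQA
    # instance and the vs_path is a placeholder.
    for response, history in local_doc_qa.get_knowledge_based_answer(
            query="How do I configure the knowledge base?",
            vs_path="/path/to/vector_store",
            chat_history=[],
            streaming=True):
        print(response["result"])                 # partial answer while streaming
        print(len(response["source_documents"]))  # retrieved context chunks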

@@ -156,6 +156,12 @@ llm_model_dict = {
        "local_model_path": None,
        "provides": "MOSSLLMChain"
    },
    "Baichuan-13b-Chat": {
        "name": "Baichuan-13b-Chat",
        "pretrained_model_name": "baichuan-inc/Baichuan-13b-Chat",
        "local_model_path": None,
        "provides": "BaichuanLLMChain"
    },
    # For llama-cpp model compatibility issues, see https://github.com/abetlen/llama-cpp-python/issues/204
    "ggml-vicuna-13b-1.1-q5": {
        "name": "ggml-vicuna-13b-1.1-q5",
@@ -244,6 +250,9 @@ USE_LORA = True if LORA_NAME else False
# LLM streaming response
STREAMING = True

# Set the full path to the Baichuan LoRA here
LORA_MODEL_PATH_BAICHUAN = ""

# Use p-tuning-v2 PrefixEncoder
USE_PTUNING_V2 = False
PTUNING_DIR = './ptuning-v2'
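Taken together, the two config additions register the model in `llm_model_dict` and expose an optional LoRA path (an empty string disables it). Note the new entry is spelled "Baichuan-13b-Chat" (lowercase b) while the branches in the other files check for "Baichuan-13B-Chat", so the two spellings must be kept consistent. A small resolution sketch, assuming `LLM_MODEL` is set to the new key:

    # Resolution sketch (illustration only); assumes LLM_MODEL points at the entry
    # added above and that its spelling matches the branch checks.
    from configs.model_config import llm_model_dict, LLM_MODEL, LORA_MODEL_PATH_BAICHUAN

    params = llm_model_dict[LLM_MODEL]
    checkpoint = params["local_model_path"] or params["pretrained_model_name"]
    provides = params["provides"]                       # "BaichuanLLMChain"
    use_baichuan_lora = bool(LORA_MODEL_PATH_BAICHUAN)  # "" -> no LoRA adapter
    print(checkpoint, provides, use_baichuan_lora)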

@@ -3,3 +3,5 @@ from .llama_llm import LLamaLLMChain
from .chatglmcpp_llm import ChatGLMCppLLMChain
from .fastchat_openai_llm import FastChatOpenAILLMChain
from .moss_llm import MOSSLLMChain
from .baichuan_llm import BaichuanLLMChain
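Re-exporting `BaichuanLLMChain` from the models package matters if, as with the other `provides` entries, the project resolves the configured class name via attribute lookup on that package (an assumption about the surrounding code, not shown in this commit):

    # Sketch only: resolving the "provides" name to the class re-exported above.
    import models
    chain_cls = getattr(models, "BaichuanLLMChain")  # available thanks to the new import
    print(chain_cls.__name__)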

models/baichuan_llm.py (new file, 71 lines)

@@ -0,0 +1,71 @@
from abc import ABC
from langchain.llms.base import LLM
from typing import Optional, List
from models.loader import LoaderCheckPoint
from models.base import (BaseAnswer,
                         AnswerResult)


class BaichuanLLMChain(BaseAnswer, LLM, ABC):
    max_token: int = 10000
    temperature: float = 0.01
    top_p = 0.9
    checkPoint: LoaderCheckPoint = None
    # history = []
    history_len: int = 10

    def __init__(self, checkPoint: LoaderCheckPoint = None):
        super().__init__()
        self.checkPoint = checkPoint

    @property
    def _llm_type(self) -> str:
        return "BaichuanLLMChain"

    @property
    def _check_point(self) -> LoaderCheckPoint:
        return self.checkPoint

    @property
    def _history_len(self) -> int:
        return self.history_len

    def set_history_len(self, history_len: int = 10) -> None:
        self.history_len = history_len

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        print(f"__call:{prompt}")
        response, _ = self.checkPoint.model.chat(
            self.checkPoint.tokenizer,
            prompt,
            # history=[],
            # max_length=self.max_token,
            # temperature=self.temperature
        )
        print(f"response:{response}")
        print(f"+++++++++++++++++++++++++++++++++++")
        return response

    def _generate_answer(self, prompt: str,
                         history: List[List[str]] = [],
                         streaming: bool = False):
        messages = []
        messages.append({"role": "user", "content": prompt})
        if streaming:
            for inum, stream_resp in enumerate(self.checkPoint.model.chat(
                    self.checkPoint.tokenizer,
                    messages,
                    stream=True
            )):
                self.checkPoint.clear_torch_cache()
                answer_result = AnswerResult()
                answer_result.llm_output = {"answer": stream_resp}
                yield answer_result
        else:
            response = self.checkPoint.model.chat(
                self.checkPoint.tokenizer,
                messages
            )
            self.checkPoint.clear_torch_cache()
            answer_result = AnswerResult()
            answer_result.llm_output = {"answer": response}
            yield answer_result
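A usage sketch for the new chain, assuming a `LoaderCheckPoint` whose `model` and `tokenizer` have already been loaded by the loader shown in the next hunk; the `loader_checkpoint` variable below is hypothetical:

    # Sketch only; `loader_checkpoint` stands in for a LoaderCheckPoint that has
    # already loaded the Baichuan weights and tokenizer.
    from models.baichuan_llm import BaichuanLLMChain

    llm_chain = BaichuanLLMChain(checkPoint=loader_checkpoint)
    llm_chain.set_history_len(5)

    # Streaming: each AnswerResult carries the text generated so far.
    for answer_result in llm_chain._generate_answer(prompt="Introduce yourself.",
                                                    history=[], streaming=True):
        print(answer_result.llm_output["answer"])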

@@ -9,7 +9,9 @@ import torch
import transformers
from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM,
                          AutoTokenizer, LlamaTokenizer)
from configs.model_config import LLM_DEVICE, LLM_MODEL, LORA_MODEL_PATH_BAICHUAN
from peft import PeftModel
from transformers.generation.utils import GenerationConfig


class LoaderCheckPoint:
    """
@@ -126,14 +128,34 @@ class LoaderCheckPoint:
            # Decide whether to use multi-GPU deployment based on the number of GPUs on this device
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and self.device_map is None:
                # if LORA_MODEL_PATH_BAICHUAN is not None:
                if LORA_MODEL_PATH_BAICHUAN:
                    if LLM_MODEL == "Baichuan-13B-Chat":
                        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16,
                                                                     device_map="auto", trust_remote_code=True)
                        model.generation_config = GenerationConfig.from_pretrained(checkpoint)
                        from configs.model_config import LLM_DEVICE, LORA_MODEL_PATH_BAICHUAN
                        # if LORA_MODEL_PATH_BAICHUAN is not None:
                        if LORA_MODEL_PATH_BAICHUAN:
                            print("loading lora:{path}".format(path=LORA_MODEL_PATH_BAICHUAN))
                            model = PeftModel.from_pretrained(
                                model,
                                LORA_MODEL_PATH_BAICHUAN,
                                torch_dtype=torch.float16,
                                device_map={"": LLM_DEVICE}
                            )
                        tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False,
                                                                  trust_remote_code=True)
                        model.half().cuda()
                else:
                    model = (
                        LoaderClass.from_pretrained(checkpoint,
                                                    config=self.model_config,
                                                    torch_dtype=torch.bfloat16 if self.bf16 else torch.float16,
                                                    trust_remote_code=True)
                        .half()
                        .cuda()
                    )
            # Support custom CUDA devices
            elif ":" in self.llm_device:
                model = LoaderClass.from_pretrained(checkpoint,
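The loader hunk follows a load-then-wrap pattern: load the base Baichuan checkpoint, restore its generation config, then attach the LoRA adapter with `peft` when a path is configured. A standalone sketch of that pattern, using placeholder paths and only documented `transformers`/`peft` calls rather than the project's loader internals:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from transformers.generation.utils import GenerationConfig
    from peft import PeftModel

    checkpoint = "baichuan-inc/Baichuan-13B-Chat"  # or a local model directory
    lora_path = "/path/to/baichuan-lora"           # stands in for LORA_MODEL_PATH_BAICHUAN

    # Base model in fp16; trust_remote_code pulls in Baichuan's custom modeling/chat code.
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False, trust_remote_code=True)

    if lora_path:
        # Wrap the frozen base weights with the LoRA adapter.
        model = PeftModel.from_pretrained(model, lora_path, torch_dtype=torch.float16)

    model.eval()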