From 62047c880eb58b9efd2a3d8265776747ff079b22 Mon Sep 17 00:00:00 2001
From: Vincent chen <34907633+Vincent-ch99@users.noreply.github.com>
Date: Thu, 10 Aug 2023 21:55:35 +0800
Subject: [PATCH] Add an implementation for accessing Baichuan-13B-Chat (#1005)

---
 chains/local_doc_qa.py  | 31 ++++++++++++------
 configs/model_config.py |  9 ++++++
 models/__init__.py      |  2 ++
 models/baichuan_llm.py  | 71 +++++++++++++++++++++++++++++++++++++++++
 models/loader/loader.py | 40 +++++++++++++++++------
 5 files changed, 134 insertions(+), 19 deletions(-)
 create mode 100644 models/baichuan_llm.py

diff --git a/chains/local_doc_qa.py b/chains/local_doc_qa.py
index 6085dfc..c1f1aa3 100644
--- a/chains/local_doc_qa.py
+++ b/chains/local_doc_qa.py
@@ -241,17 +241,28 @@ class LocalDocQA:
         else:
             prompt = query
 
-        answer_result_stream_result = self.llm_model_chain(
-            {"prompt": prompt, "history": chat_history, "streaming": streaming})
+        # Branch for the Baichuan-13B-Chat integration:
+        if LLM_MODEL == "Baichuan-13B-Chat":
+            for answer_result in self.llm_model_chain._generate_answer(prompt=prompt, history=chat_history,
+                                                                       streaming=streaming):
+                resp = answer_result.llm_output["answer"]
+                history = answer_result.history
+                history[-1][0] = query
+                response = {"query": query,
+                            "result": resp,
+                            "source_documents": related_docs_with_score}
+                yield response, history
+        else:  # original logic branch:
+            answer_result_stream_result = self.llm_model_chain(
+                {"prompt": prompt, "history": chat_history, "streaming": streaming})
 
-        for answer_result in answer_result_stream_result['answer_result_stream']:
-            resp = answer_result.llm_output["answer"]
-            history = answer_result.history
-            history[-1][0] = query
-            response = {"query": query,
-                        "result": resp,
-                        "source_documents": related_docs_with_score}
-            yield response, history
+            for answer_result in answer_result_stream_result['answer_result_stream']:
+                resp = answer_result.llm_output["answer"]
+                history = answer_result.history
+                history[-1][0] = query
+                response = {"query": query,
+                            "result": resp,
+                            "source_documents": related_docs_with_score}
+                yield response, history
 
     # query: the query text
     # vs_path: path to the knowledge base (vector store)
diff --git a/configs/model_config.py b/configs/model_config.py
index f721c89..2ca08d6 100644
--- a/configs/model_config.py
+++ b/configs/model_config.py
@@ -156,6 +156,12 @@ llm_model_dict = {
         "local_model_path": None,
         "provides": "MOSSLLMChain"
     },
+    "Baichuan-13B-Chat": {
+        "name": "Baichuan-13B-Chat",
+        "pretrained_model_name": "baichuan-inc/Baichuan-13B-Chat",
+        "local_model_path": None,
+        "provides": "BaichuanLLMChain"
+    },
     # For llama-cpp model compatibility issues, see https://github.com/abetlen/llama-cpp-python/issues/204
     "ggml-vicuna-13b-1.1-q5": {
         "name": "ggml-vicuna-13b-1.1-q5",
@@ -244,6 +250,9 @@ USE_LORA = True if LORA_NAME else False
 # LLM streaming response
 STREAMING = True
 
+# Full local path to the Baichuan LoRA weights; leave empty to load the base model only
+LORA_MODEL_PATH_BAICHUAN = ""
+
 # Use p-tuning-v2 PrefixEncoder
 USE_PTUNING_V2 = False
 PTUNING_DIR='./ptuning-v2'
diff --git a/models/__init__.py b/models/__init__.py
index d7ae7d5..0975fde 100644
--- a/models/__init__.py
+++ b/models/__init__.py
@@ -3,3 +3,5 @@ from .llama_llm import LLamaLLMChain
 from .chatglmcpp_llm import ChatGLMCppLLMChain
 from .fastchat_openai_llm import FastChatOpenAILLMChain
 from .moss_llm import MOSSLLMChain
+from .baichuan_llm import BaichuanLLMChain
+
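For context, a minimal standalone sketch (not part of the patch) of how the new Baichuan branch above is driven: get_knowledge_based_answer iterates the generator returned by BaichuanLLMChain._generate_answer and yields each partial answer together with the updated history. The helper name stream_knowledge_answer and its parameters are hypothetical; the sketch assumes a chain already constructed around a loaded LoaderCheckPoint.

# Hypothetical driver mirroring the new branch in get_knowledge_based_answer.
from models.baichuan_llm import BaichuanLLMChain


def stream_knowledge_answer(chain: BaichuanLLMChain, query: str, prompt: str,
                            chat_history=None, related_docs=None):
    chat_history = chat_history or []
    related_docs = related_docs or []
    for answer_result in chain._generate_answer(prompt=prompt,
                                                history=chat_history,
                                                streaming=True):
        resp = answer_result.llm_output["answer"]
        history = answer_result.history
        yield {"query": query,
               "result": resp,
               "source_documents": related_docs}, history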
diff --git a/models/baichuan_llm.py b/models/baichuan_llm.py
new file mode 100644
index 0000000..1a8596c
--- /dev/null
+++ b/models/baichuan_llm.py
@@ -0,0 +1,71 @@
+from abc import ABC
+from langchain.llms.base import LLM
+from typing import Optional, List
+from models.loader import LoaderCheckPoint
+from models.base import (BaseAnswer,
+                         AnswerResult)
+
+class BaichuanLLMChain(BaseAnswer, LLM, ABC):
+    max_token: int = 10000
+    temperature: float = 0.01
+    top_p = 0.9
+    checkPoint: LoaderCheckPoint = None
+    history_len: int = 10
+
+    def __init__(self, checkPoint: LoaderCheckPoint = None):
+        super().__init__()
+        self.checkPoint = checkPoint
+
+    @property
+    def _llm_type(self) -> str:
+        return "BaichuanLLMChain"
+
+    @property
+    def _check_point(self) -> LoaderCheckPoint:
+        return self.checkPoint
+
+    @property
+    def _history_len(self) -> int:
+        return self.history_len
+
+    def set_history_len(self, history_len: int = 10) -> None:
+        self.history_len = history_len
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        print(f"__call:{prompt}")
+        # Baichuan-13B-Chat's chat() expects a list of {"role", "content"} messages
+        # and returns the generated reply as a plain string.
+        messages = [{"role": "user", "content": prompt}]
+        response = self.checkPoint.model.chat(
+            self.checkPoint.tokenizer,
+            messages
+        )
+        print(f"response:{response}")
+        return response
+
+    def _generate_answer(self, prompt: str,
+                         history: List[List[str]] = [],
+                         streaming: bool = False):
+        # Only the current prompt is sent to the model; the running history is
+        # updated and returned to the caller, mirroring the other chains.
+        messages = [{"role": "user", "content": prompt}]
+        history = history + [[prompt, ""]]
+        if streaming:
+            # With stream=True, chat() yields the partial response generated so far.
+            for stream_resp in self.checkPoint.model.chat(
+                    self.checkPoint.tokenizer,
+                    messages,
+                    stream=True
+            ):
+                self.checkPoint.clear_torch_cache()
+                history[-1] = [prompt, stream_resp]
+                answer_result = AnswerResult()
+                answer_result.history = history
+                answer_result.llm_output = {"answer": stream_resp}
+                yield answer_result
+        else:
+            response = self.checkPoint.model.chat(
+                self.checkPoint.tokenizer,
+                messages
+            )
+            self.checkPoint.clear_torch_cache()
+            history[-1] = [prompt, response]
+            answer_result = AnswerResult()
+            answer_result.history = history
+            answer_result.llm_output = {"answer": response}
+            yield answer_result
\ No newline at end of file
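The chain above wraps the chat() helper exposed by Baichuan-13B-Chat's remote code, following the usage documented on the baichuan-inc/Baichuan-13B-Chat model card, which is also what the loader change below relies on. A minimal standalone sketch; the prompt text and device placement are illustrative only:

# Standalone sketch of the underlying Baichuan-13B-Chat API (illustrative values).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

checkpoint = "baichuan-inc/Baichuan-13B-Chat"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16,
                                             device_map="auto", trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained(checkpoint)

messages = [{"role": "user", "content": "What is the second highest mountain in the world?"}]

# Non-streaming: chat() returns the full reply as a string.
response = model.chat(tokenizer, messages)
print(response)

# Streaming: with stream=True, chat() yields the partial reply generated so far,
# which is exactly what _generate_answer above iterates over.
for partial in model.chat(tokenizer, messages, stream=True):
    print(partial)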
diff --git a/models/loader/loader.py b/models/loader/loader.py
index 6950673..6ba46cf 100644
--- a/models/loader/loader.py
+++ b/models/loader/loader.py
@@ -9,7 +9,9 @@ import torch
 import transformers
 from transformers import (AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer)
-from configs.model_config import LLM_DEVICE
+from configs.model_config import LLM_DEVICE, LLM_MODEL, LORA_MODEL_PATH_BAICHUAN
+from peft import PeftModel
+from transformers.generation.utils import GenerationConfig
 
 
 class LoaderCheckPoint:
     """
@@ -126,14 +128,34 @@ class LoaderCheckPoint:
                 # Decide whether to deploy across multiple GPUs based on how many are available
                 num_gpus = torch.cuda.device_count()
                 if num_gpus < 2 and self.device_map is None:
-                    model = (
-                        LoaderClass.from_pretrained(checkpoint,
-                                                    config=self.model_config,
-                                                    torch_dtype=torch.bfloat16 if self.bf16 else torch.float16,
-                                                    trust_remote_code=True)
-                        .half()
-                        .cuda()
-                    )
+                    # Dedicated branch for Baichuan-13B-Chat, optionally wrapped with LoRA weights:
+                    if LLM_MODEL == "Baichuan-13B-Chat":
+                        model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16,
+                                                                     device_map="auto", trust_remote_code=True)
+                        model.generation_config = GenerationConfig.from_pretrained(checkpoint)
+                        if LORA_MODEL_PATH_BAICHUAN:
+                            print("loading lora:{path}".format(path=LORA_MODEL_PATH_BAICHUAN))
+                            model = PeftModel.from_pretrained(
+                                model,
+                                LORA_MODEL_PATH_BAICHUAN,
+                                torch_dtype=torch.float16,
+                                device_map={"": LLM_DEVICE}
+                            )
+                        tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False,
+                                                                  trust_remote_code=True)
+                        # float16 weights are already cast and placed by device_map="auto"
+                    else:
+                        model = (
+                            LoaderClass.from_pretrained(checkpoint,
+                                                        config=self.model_config,
+                                                        torch_dtype=torch.bfloat16 if self.bf16 else torch.float16,
+                                                        trust_remote_code=True)
+                            .half()
+                            .cuda()
+                        )
                 # Support custom CUDA devices
                 elif ":" in self.llm_device:
                     model = LoaderClass.from_pretrained(checkpoint,
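To enable the new loader path, the idea is to select the new llm_model_dict entry and, optionally, point LORA_MODEL_PATH_BAICHUAN at local LoRA weights. A minimal configuration sketch; the LoRA path below is a placeholder, and leaving it empty skips the PeftModel wrapper:

# configs/model_config.py (illustrative values, not part of the patch)
LLM_MODEL = "Baichuan-13B-Chat"   # selects the "BaichuanLLMChain" entry in llm_model_dict
LORA_MODEL_PATH_BAICHUAN = "/path/to/baichuan-13b-chat-lora"   # placeholder; "" disables LoRA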