diff --git a/README.md b/README.md
index d38e471..e883f68 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,53 @@
 ![实现原理图2](img/langchain+chatglm2.png)
 
+Project directory structure
+```text
+.
+├── agent
+│   └── agent implementations
+├── chains
+│   ├── modules
+│   └── chain implementations
+├── configs
+│   └── system initialisation configuration
+├── content
+│   └── temporary location for uploaded attachments
+├── docs
+│   └── project documentation
+├── fastchat
+│   ├── api
+│   └── a FastChat-based remote-call extension for the langchain LLM
+├── img
+│   └── project image assets
+├── loras
+│   └── default local path for LoRA files
+├── model
+│   └── default local path for checkpoints
+├── models
+│   ├── extensions
+│   │   └── internal LLM extension packages
+│   ├── loader
+│   │   └── project checkpoint loader; supports chatglm (AutoModel), quantized models (llama.cpp) and other models (AutoModelForCausalLM), and can load LoRA and P-Tuning v2 fine-tuned weights
+│   └── langchain LLM wrapper integrations for ChatGLM, LLaMA and other models
+├── nltk_data
+│   ├── corpora
+│   │   └── cmudict
+│   ├── taggers
+│   │   └── averaged_perceptron_tagger
+│   └── tokenizers
+│       └── punkt
+├── ptuning-v2
+├── textsplitter
+│   └── Chinese semantic text splitting
+├── utils
+│   └── system utilities
+└── vector_store
+    └── local FAISS vector-store index files
+```
+
 🚩 The project itself does not include any fine-tuning or training step, but fine-tuning or training can be used to improve its results.
 
 🌐 [AutoDL image](https://www.codewithgpu.com/i/imClumsyPanda/langchain-ChatGLM/langchain-ChatGLM)
@@ -33,7 +80,7 @@ - ChatGLM-6B model hardware requirements
 
     Note: if the model has not been downloaded locally, check the free space under `$HOME/.cache/huggingface/` before running; downloading the model files requires 15 GB of local storage.
-
+    Note: additional optional startup flags are described in [Startup options](docs/StartOption.md).
 
     The model download procedure is covered by Q8 of the [FAQ](docs/FAQ.md).
 
 | **Quantization level** | **Minimum GPU memory** (inference) | **Minimum GPU memory** (parameter-efficient fine-tuning) |
diff --git a/api.py b/api.py
index 558baac..681b815 100644
--- a/api.py
+++ b/api.py
@@ -19,6 +19,9 @@ from configs.model_config import (VS_ROOT_PATH, UPLOAD_ROOT_PATH, EMBEDDING_DEVI
                                   EMBEDDING_MODEL, LLM_MODEL, NLTK_DATA_PATH,
                                   VECTOR_SEARCH_TOP_K, LLM_HISTORY_LEN, OPEN_CROSS_DOMAIN)
 from agent import bing_search as agent_bing_search
+import models.shared as shared
+from models.loader.args import parser
+from models.loader import LoaderCheckPoint
 
 nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
@@ -173,8 +176,8 @@ async def list_docs(
 async def delete_docs(
         knowledge_base_id: str = Query(...,
-                                    description="Knowledge Base Name(注意此方法仅删除上传的文件并不会删除知识库(FAISS)内数据)",
-                                    example="kb1"),
+                                       description="Knowledge Base Name(注意此方法仅删除上传的文件并不会删除知识库(FAISS)内数据)",
+                                       example="kb1"),
         doc_name: Optional[str] = Query(
             None, description="doc name", example="doc_name_1.pdf"
         ),
@@ -258,9 +261,12 @@ async def chat(
             ],
         ),
 ):
-    for resp, history in local_doc_qa.llm._call(
-        prompt=question, history=history, streaming=True
-    ):
+
+    for answer_result in local_doc_qa.llm.generatorAnswer(prompt=question, history=history,
+                                                           streaming=True):
+
+        resp = answer_result.llm_output["answer"]
+        history = answer_result.history
         pass
 
     return ChatMessage(
@@ -312,6 +318,7 @@ async def stream_chat(websocket: WebSocket, knowledge_base_id: str):
             )
             turn += 1
 
+
 async def document():
     return RedirectResponse(url="/docs")
@@ -333,10 +340,14 @@ async def bing_search(
         source_documents=[],
     )
 
+
 def api_start(host, port):
     global app
     global local_doc_qa
+
+    llm_model_ins = shared.loaderLLM()
+    llm_model_ins.set_history_len(LLM_HISTORY_LEN)
+
     app = FastAPI()
     # Add CORS middleware to allow all origins
     # Set OPEN_DOMAIN=True in config.py to allow cross-origin requests
@@ -365,18 +376,22 @@ def api_start(host, port):
 
     local_doc_qa = LocalDocQA()
     local_doc_qa.init_cfg(
-        llm_model=LLM_MODEL,
+        llm_model=llm_model_ins,
         embedding_model=EMBEDDING_MODEL,
         embedding_device=EMBEDDING_DEVICE,
-        llm_history_len=LLM_HISTORY_LEN,
         top_k=VECTOR_SEARCH_TOP_K,
     )
 
     uvicorn.run(app, host=host, port=port)
 
+
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+
     parser.add_argument("--host", type=str, default="0.0.0.0")
     parser.add_argument("--port", type=int, default=7861)
     args = parser.parse_args()
+    # initialise the shared checkpoint loader from the parsed startup arguments
+    args_dict = vars(args)
+    shared.loaderCheckPoint = LoaderCheckPoint(args_dict)
     api_start(args.host, args.port)
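Taken together, the api.py changes replace the `LLM_MODEL` name string with a pre-built LLM wrapper obtained from the new loader, and swap the old `llm._call(...)` streaming loop for `generatorAnswer(...)`, whose results expose the answer text and the updated history. The sketch below pulls those scattered hunks into one place; it assumes exactly the interfaces shown in this diff (`LoaderCheckPoint`, `shared.loaderLLM`, `generatorAnswer`), and the model path and prompt are placeholders only.

```python
# Sketch of the loader + generatorAnswer flow introduced above (not part of the patch).
from configs.model_config import (EMBEDDING_MODEL, EMBEDDING_DEVICE,
                                  LLM_HISTORY_LEN, VECTOR_SEARCH_TOP_K)
from chains.local_doc_qa import LocalDocQA
from models.loader.args import parser
from models.loader import LoaderCheckPoint
import models.shared as shared

# Build the shared checkpoint loader from CLI-style arguments (placeholder paths).
args = parser.parse_args(args=["--model-dir", "/media/checkpoint/",
                               "--model", "chatglm-6b", "--no-remote-model"])
shared.loaderCheckPoint = LoaderCheckPoint(vars(args))

llm_model_ins = shared.loaderLLM()              # langchain LLM wrapper around the checkpoint
llm_model_ins.set_history_len(LLM_HISTORY_LEN)

local_doc_qa = LocalDocQA()
local_doc_qa.init_cfg(llm_model=llm_model_ins,
                      embedding_model=EMBEDDING_MODEL,
                      embedding_device=EMBEDDING_DEVICE,
                      top_k=VECTOR_SEARCH_TOP_K)

# Same consumption pattern as the /chat endpoint: each result carries the answer
# produced so far and the history updated with this turn.
resp = ""
history = []
for answer_result in local_doc_qa.llm.generatorAnswer(prompt="Hello", history=history,
                                                       streaming=True):
    resp = answer_result.llm_output["answer"]
    history = answer_result.history
print(resp)
```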
diff --git a/cli_demo.py b/cli_demo.py
index 9961faf..485f4fc 100644
--- a/cli_demo.py
+++ b/cli_demo.py
@@ -2,7 +2,9 @@ from configs.model_config import *
 from chains.local_doc_qa import LocalDocQA
 import os
 import nltk
-
+from models.loader.args import parser
+import models.shared as shared
+from models.loader import LoaderCheckPoint
 nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
 
 # Show reply with source text from input document
@@ -10,11 +12,17 @@ REPLY_WITH_SOURCE = True
 
 def main():
+    args = parser.parse_args()
+    args_dict = vars(args)
+    shared.loaderCheckPoint = LoaderCheckPoint(args_dict)
+    llm_model_ins = shared.loaderLLM()
+    llm_model_ins.history_len = LLM_HISTORY_LEN
+
     local_doc_qa = LocalDocQA()
-    local_doc_qa.init_cfg(llm_model=LLM_MODEL,
+    local_doc_qa.init_cfg(llm_model=llm_model_ins,
                           embedding_model=EMBEDDING_MODEL,
                           embedding_device=EMBEDDING_DEVICE,
-                          llm_history_len=LLM_HISTORY_LEN,
                           top_k=VECTOR_SEARCH_TOP_K)
     vs_path = None
     while not vs_path:
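cli_demo.py and api.py now share one argument parser from `models.loader.args`, and api.py shows that a script may append its own options (`--host`, `--port`) to that parser before parsing. A small sketch of the same pattern follows; the `--top-k` flag is purely hypothetical and only illustrates how a script-specific option coexists with the loader flags.

```python
# Sketch: reuse the shared startup parser and extend it with a script-specific option.
# The --top-k flag is hypothetical; everything else follows the pattern in this diff.
from models.loader.args import parser
from models.loader import LoaderCheckPoint
import models.shared as shared

parser.add_argument("--top-k", type=int, default=6,
                    help="hypothetical script-specific option")
args = parser.parse_args()

# Loader-related flags (--model, --model-dir, --lora, --cpu, ...) all land in this dict.
shared.loaderCheckPoint = LoaderCheckPoint(vars(args))
llm_model_ins = shared.loaderLLM()
print(args.top_k)
```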
diff --git a/docs/StartOption.md b/docs/StartOption.md
new file mode 100644
index 0000000..7564fd3
--- /dev/null
+++ b/docs/StartOption.md
@@ -0,0 +1,76 @@
+
+#### Startup options
+```text
+usage: langchain-ChatGLM [-h] [--no-remote-model] [--model MODEL] [--lora LORA] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--cpu] [--auto-devices] [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] [--cpu-memory CPU_MEMORY]
+                         [--load-in-8bit] [--bf16]
+
+An LLM document reader based on langchain and ChatGLM
+
+options:
+  -h, --help            show this help message and exit
+  --no-remote-model     do not download the model from a remote hub; add `--no-remote-model` when loading a locally stored checkpoint
+  --model MODEL         Name of the model to load by default.
+  --lora LORA           Name of the LoRA to apply to the model by default.
+  --model-dir MODEL_DIR
+                        Path to directory with all the models
+  --lora-dir LORA_DIR   Path to directory with all the loras
+  --cpu                 Use the CPU to generate text. Warning: Training on CPU is extremely slow.
+  --auto-devices        Automatically split the model across the available GPU(s) and CPU.
+  --gpu-memory GPU_MEMORY [GPU_MEMORY ...]
+                        Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.
+  --cpu-memory CPU_MEMORY
+                        Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.
+  --load-in-8bit        Load the model with 8-bit precision.
+  --bf16                Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.
+```
+
+#### Examples
+
+- 1. Load a local model
+
+```text
+--model-dir          local folder holding the checkpoints
+--model              model name
+--no-remote-model    do not load the model from a remote hub
+```
+```shell
+$ python cli_demo.py --model-dir /media/mnt/ --model chatglm-6b --no-remote-model
+```
+
+- 2. Load the model at reduced precision
+
+```text
+--model-dir          local folder holding the checkpoints
+--model              model name
+--no-remote-model    do not load the model from a remote hub
+--load-in-8bit       load the model with 8-bit precision
+```
+```shell
+$ python cli_demo.py --model-dir /media/mnt/ --model chatglm-6b --no-remote-model --load-in-8bit
+```
+
+- 3. Run inference on the CPU
+
+```text
+--model-dir          local folder holding the checkpoints
+--model              model name
+--no-remote-model    do not load the model from a remote hub
+--cpu                use the CPU to generate text (warning: extremely slow)
+```
+```shell
+$ python cli_demo.py --model-dir /media/mnt/ --model chatglm-6b --no-remote-model --cpu
+```
+
+- 4. Load a LoRA fine-tuning file
+
+```text
+--model-dir          local folder holding the checkpoints
+--model              model name
+--no-remote-model    do not load the model from a remote hub
+--lora-dir           local folder holding the LoRA files
+--lora               LoRA name
+```
+```shell
+$ python cli_demo.py --model-dir /media/mnt/ --model chatglm-6b --no-remote-model --lora-dir /media/mnt/loras --lora chatglm-step100
+```
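The flags documented above are not limited to the shell: they map one-to-one onto the shared parser used by cli_demo.py and api.py, so the same configurations can be built programmatically (for tests or notebooks) by passing an explicit argument list. The sketch below mirrors example 4 with placeholder paths and names taken from the examples; treat it as illustrative rather than an officially supported entry point.

```python
# Sketch: drive the documented startup options from Python instead of the shell.
from models.loader.args import parser
from models.loader import LoaderCheckPoint
import models.shared as shared

argv = ["--model-dir", "/media/mnt/",
        "--model", "chatglm-6b",
        "--no-remote-model",
        "--lora-dir", "/media/mnt/loras",
        "--lora", "chatglm-step100"]
# Flags such as --cpu or --load-in-8bit can be appended to argv in the same way.
args = parser.parse_args(argv)

shared.loaderCheckPoint = LoaderCheckPoint(vars(args))
llm_model_ins = shared.loaderLLM()   # ready to pass to LocalDocQA.init_cfg(llm_model=...)
```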