diff --git a/README.md b/README.md
index d38e471..e883f68 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,53 @@
 ![实现原理图2](img/langchain+chatglm2.png)
 
+Project directory structure
+```text
+.
+├── agent
+│   └── agent implementations
+├── chains
+│   ├── modules
+│   └── chain implementations
+├── configs
+│   └── system initialisation configuration
+├── content
+│   └── temporary location for uploaded attachments
+├── docs
+│   └── project documentation
+├── fastchat
+│   ├── api
+│   └── a FastChat-based remote-call extension for the langchain LLM
+├── img
+│   └── project image assets
+├── loras
+│   └── default local path for LoRA files
+├── model
+│   └── default local path for checkpoints
+├── models
+│   ├── extensions
+│   │   └── internal LLM extension packages
+│   ├── loader
+│   │   └── project checkpoint loader; supports chatglm (AutoModel), quantized models (llama.cpp) and other models (AutoModelForCausalLM), and can load LoRA and P-Tuning v2 fine-tuned weights
+│   └── langchain LLM wrapper integrations for ChatGLM, LLaMA and other models
+├── nltk_data
+│   ├── corpora
+│   │   └── cmudict
+│   ├── taggers
+│   │   └── averaged_perceptron_tagger
+│   └── tokenizers
+│       └── punkt
+├── ptuning-v2
+├── textsplitter
+│   └── Chinese semantic text splitting
+├── utils
+│   └── system utilities
+└── vector_store
+    └── local FAISS vector-store index files
+```
+
 🚩 The project itself does not include any fine-tuning or training step, but fine-tuning or training can be used to improve its results.
 
 🌐 [AutoDL image](https://www.codewithgpu.com/i/imClumsyPanda/langchain-ChatGLM/langchain-ChatGLM)
@@ -33,7 +80,7 @@ - ChatGLM-6B model hardware requirements
 
     Note: if the model has not been downloaded locally, check the free space under `$HOME/.cache/huggingface/` before running; downloading the model files requires 15 GB of local storage.
-
+    Note: additional optional startup flags are described in [Startup options](docs/StartOption.md).
 
     The model download procedure is covered by Q8 of the [FAQ](docs/FAQ.md).
 
 | **Quantization level** | **Minimum GPU memory** (inference) | **Minimum GPU memory** (parameter-efficient fine-tuning) |
diff --git a/api.py b/api.py
index 558baac..681b815 100644
--- a/api.py
+++ b/api.py
@@ -19,6 +19,9 @@ from configs.model_config import (VS_ROOT_PATH, UPLOAD_ROOT_PATH, EMBEDDING_DEVI
                                   EMBEDDING_MODEL, LLM_MODEL, NLTK_DATA_PATH,
                                   VECTOR_SEARCH_TOP_K, LLM_HISTORY_LEN, OPEN_CROSS_DOMAIN)
 from agent import bing_search as agent_bing_search
+import models.shared as shared
+from models.loader.args import parser
+from models.loader import LoaderCheckPoint
 
 nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
@@ -173,8 +176,8 @@ async def list_docs(
 async def delete_docs(
         knowledge_base_id: str = Query(...,
-                                    description="Knowledge Base Name(注意此方法仅删除上传的文件并不会删除知识库(FAISS)内数据)",
-                                    example="kb1"),
+                                       description="Knowledge Base Name(注意此方法仅删除上传的文件并不会删除知识库(FAISS)内数据)",
+                                       example="kb1"),
         doc_name: Optional[str] = Query(
             None, description="doc name", example="doc_name_1.pdf"
         ),
@@ -258,9 +261,12 @@ async def chat(
             ],
         ),
 ):
-    for resp, history in local_doc_qa.llm._call(
-        prompt=question, history=history, streaming=True
-    ):
+
+    for answer_result in local_doc_qa.llm.generatorAnswer(prompt=question, history=history,
+                                                           streaming=True):
+
+        resp = answer_result.llm_output["answer"]
+        history = answer_result.history
         pass
 
     return ChatMessage(
@@ -312,6 +318,7 @@ async def stream_chat(websocket: WebSocket, knowledge_base_id: str):
             )
             turn += 1
 
+
 async def document():
     return RedirectResponse(url="/docs")
@@ -333,10 +340,14 @@ async def bing_search(
         source_documents=[],
     )
 
+
 def api_start(host, port):
     global app
     global local_doc_qa
+
+    llm_model_ins = shared.loaderLLM()
+    llm_model_ins.set_history_len(LLM_HISTORY_LEN)
+
     app = FastAPI()
     # Add CORS middleware to allow all origins
     # Set OPEN_DOMAIN=True in config.py to allow cross-origin requests
@@ -365,18 +376,22 @@ def api_start(host, port):
 
     local_doc_qa = LocalDocQA()
     local_doc_qa.init_cfg(
-        llm_model=LLM_MODEL,
+        llm_model=llm_model_ins,
         embedding_model=EMBEDDING_MODEL,
         embedding_device=EMBEDDING_DEVICE,
-        llm_history_len=LLM_HISTORY_LEN,
         top_k=VECTOR_SEARCH_TOP_K,
     )
 
     uvicorn.run(app, host=host, port=port)
 
+
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+
     parser.add_argument("--host", type=str, default="0.0.0.0")
     parser.add_argument("--port", type=int, default=7861)
     args = parser.parse_args()
+    # initialise the shared checkpoint loader from the parsed startup arguments
+    args_dict = vars(args)
+    shared.loaderCheckPoint = LoaderCheckPoint(args_dict)
     api_start(args.host, args.port)
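Taken together, the api.py changes replace the `LLM_MODEL` name string with a pre-built LLM wrapper obtained from the new loader, and swap the old `llm._call(...)` streaming loop for `generatorAnswer(...)`, whose results expose the answer text and the updated history. The sketch below pulls those scattered hunks into one place; it assumes exactly the interfaces shown in this diff (`LoaderCheckPoint`, `shared.loaderLLM`, `generatorAnswer`), and the model path and prompt are placeholders only.

```python
# Sketch of the loader + generatorAnswer flow introduced above (not part of the patch).
from configs.model_config import (EMBEDDING_MODEL, EMBEDDING_DEVICE,
                                  LLM_HISTORY_LEN, VECTOR_SEARCH_TOP_K)
from chains.local_doc_qa import LocalDocQA
from models.loader.args import parser
from models.loader import LoaderCheckPoint
import models.shared as shared

# Build the shared checkpoint loader from CLI-style arguments (placeholder paths).
args = parser.parse_args(args=["--model-dir", "/media/checkpoint/",
                               "--model", "chatglm-6b", "--no-remote-model"])
shared.loaderCheckPoint = LoaderCheckPoint(vars(args))

llm_model_ins = shared.loaderLLM()              # langchain LLM wrapper around the checkpoint
llm_model_ins.set_history_len(LLM_HISTORY_LEN)

local_doc_qa = LocalDocQA()
local_doc_qa.init_cfg(llm_model=llm_model_ins,
                      embedding_model=EMBEDDING_MODEL,
                      embedding_device=EMBEDDING_DEVICE,
                      top_k=VECTOR_SEARCH_TOP_K)

# Same consumption pattern as the /chat endpoint: each result carries the answer
# produced so far and the history updated with this turn.
resp = ""
history = []
for answer_result in local_doc_qa.llm.generatorAnswer(prompt="Hello", history=history,
                                                       streaming=True):
    resp = answer_result.llm_output["answer"]
    history = answer_result.history
print(resp)
```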
diff --git a/cli_demo.py b/cli_demo.py
index 9961faf..485f4fc 100644
--- a/cli_demo.py
+++ b/cli_demo.py
@@ -2,7 +2,9 @@ from configs.model_config import *
 from chains.local_doc_qa import LocalDocQA
 import os
 import nltk
-
+from models.loader.args import parser
+import models.shared as shared
+from models.loader import LoaderCheckPoint
 nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
 
 # Show reply with source text from input document
@@ -10,11 +12,17 @@ REPLY_WITH_SOURCE = True
 
 def main():
+    args = parser.parse_args()
+    args_dict = vars(args)
+    shared.loaderCheckPoint = LoaderCheckPoint(args_dict)
+    llm_model_ins = shared.loaderLLM()
+    llm_model_ins.history_len = LLM_HISTORY_LEN
+
     local_doc_qa = LocalDocQA()
-    local_doc_qa.init_cfg(llm_model=LLM_MODEL,
+    local_doc_qa.init_cfg(llm_model=llm_model_ins,
                           embedding_model=EMBEDDING_MODEL,
                           embedding_device=EMBEDDING_DEVICE,
-                          llm_history_len=LLM_HISTORY_LEN,
                           top_k=VECTOR_SEARCH_TOP_K)
     vs_path = None
     while not vs_path:
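cli_demo.py and api.py now share one argument parser from `models.loader.args`, and api.py shows that a script may append its own options (`--host`, `--port`) to that parser before parsing. A small sketch of the same pattern follows; the `--top-k` flag is purely hypothetical and only illustrates how a script-specific option coexists with the loader flags.

```python
# Sketch: reuse the shared startup parser and extend it with a script-specific option.
# The --top-k flag is hypothetical; everything else follows the pattern in this diff.
from models.loader.args import parser
from models.loader import LoaderCheckPoint
import models.shared as shared

parser.add_argument("--top-k", type=int, default=6,
                    help="hypothetical script-specific option")
args = parser.parse_args()

# Loader-related flags (--model, --model-dir, --lora, --cpu, ...) all land in this dict.
shared.loaderCheckPoint = LoaderCheckPoint(vars(args))
llm_model_ins = shared.loaderLLM()
print(args.top_k)
```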
diff --git a/docs/StartOption.md b/docs/StartOption.md
new file mode 100644
index 0000000..7564fd3
--- /dev/null
+++ b/docs/StartOption.md
@@ -0,0 +1,76 @@
+
+#### Startup options
+```text
+usage: langchain-ChatGLM [-h] [--no-remote-model] [--model MODEL] [--lora LORA] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--cpu] [--auto-devices] [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]] [--cpu-memory CPU_MEMORY]
+                         [--load-in-8bit] [--bf16]
+
+An LLM document reader based on langchain and ChatGLM
+
+options:
+  -h, --help            show this help message and exit
+  --no-remote-model     do not download the model from a remote hub; add `--no-remote-model` when loading a locally stored checkpoint
+  --model MODEL         Name of the model to load by default.
+  --lora LORA           Name of the LoRA to apply to the model by default.
+  --model-dir MODEL_DIR
+                        Path to directory with all the models
+  --lora-dir LORA_DIR   Path to directory with all the loras
+  --cpu                 Use the CPU to generate text. Warning: Training on CPU is extremely slow.
+  --auto-devices        Automatically split the model across the available GPU(s) and CPU.
+  --gpu-memory GPU_MEMORY [GPU_MEMORY ...]
+                        Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.
+  --cpu-memory CPU_MEMORY
+                        Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.
+  --load-in-8bit        Load the model with 8-bit precision.
+  --bf16                Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.
+```
+
+#### Examples
+
+- 1. Load a local model
+
+```text
+--model-dir          local folder holding the checkpoints
+--model              model name
+--no-remote-model    do not load the model from a remote hub
+```
+```shell
+$ python cli_demo.py --model-dir /media/mnt/ --model chatglm-6b --no-remote-model
+```
+
+- 2. Load the model at reduced precision
+
+```text
+--model-dir          local folder holding the checkpoints
+--model              model name
+--no-remote-model    do not load the model from a remote hub
+--load-in-8bit       load the model with 8-bit precision
+```
+```shell
+$ python cli_demo.py --model-dir /media/mnt/ --model chatglm-6b --no-remote-model --load-in-8bit
+```
+
+- 3. Run inference on the CPU
+
+```text
+--model-dir          local folder holding the checkpoints
+--model              model name
+--no-remote-model    do not load the model from a remote hub
+--cpu                use the CPU to generate text (warning: extremely slow)
+```
+```shell
+$ python cli_demo.py --model-dir /media/mnt/ --model chatglm-6b --no-remote-model --cpu
+```
+
+- 4. Load a LoRA fine-tuning file
+
+```text
+--model-dir          local folder holding the checkpoints
+--model              model name
+--no-remote-model    do not load the model from a remote hub
+--lora-dir           local folder holding the LoRA files
+--lora               LoRA name
+```
+```shell
+$ python cli_demo.py --model-dir /media/mnt/ --model chatglm-6b --no-remote-model --lora-dir /media/mnt/loras --lora chatglm-step100
+```
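The flags documented above are not limited to the shell: they map one-to-one onto the shared parser used by cli_demo.py and api.py, so the same configurations can be built programmatically (for tests or notebooks) by passing an explicit argument list. The sketch below mirrors example 4 with placeholder paths and names taken from the examples; treat it as illustrative rather than an officially supported entry point.

```python
# Sketch: drive the documented startup options from Python instead of the shell.
from models.loader.args import parser
from models.loader import LoaderCheckPoint
import models.shared as shared

argv = ["--model-dir", "/media/mnt/",
        "--model", "chatglm-6b",
        "--no-remote-model",
        "--lora-dir", "/media/mnt/loras",
        "--lora", "chatglm-step100"]
# Flags such as --cpu or --load-in-8bit can be appended to argv in the same way.
args = parser.parse_args(argv)

shared.loaderCheckPoint = LoaderCheckPoint(vars(args))
llm_model_ins = shared.loaderLLM()   # ready to pass to LocalDocQA.init_cfg(llm_model=...)
```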