diff --git a/README.md b/README.md
index d61faaa..7d0f74c 100644
--- a/README.md
+++ b/README.md
@@ -178,6 +178,6 @@ The Web UI supports the following features:
 - [ ] Implement a Web UI demo that calls the API
 
 ## Project discussion group
-![QR code](img/qr_code_11.jpg)
+![QR code](img/qr_code_12.jpg)
 
 🎉 langchain-ChatGLM project discussion group. If you are also interested in this project, you are welcome to join the group chat and take part in the discussion.
diff --git a/api.py b/api.py
index 90ba1dc..6f09ae5 100644
--- a/api.py
+++ b/api.py
@@ -170,36 +170,32 @@ async def delete_docs(
 
 
 async def chat(
-    knowledge_base_id: str = Body(..., description="知识库名字", example="kb1"),
-    question: str = Body(..., description="问题", example="工伤保险是什么?"),
+    knowledge_base_id: str = Body(..., description="Knowledge Base Name", example="kb1"),
+    question: str = Body(..., description="Question", example="工伤保险是什么?"),
     history: List[List[str]] = Body(
         [],
-        description="问题及答案的历史记录",
+        description="History of previous questions and answers",
         example=[
             [
-                "这里是问题,如:工伤保险是什么?",
-                "答案:工伤保险是指用人单位按照国家规定,为本单位的职工和用人单位的其他人员,缴纳工伤保险费,由保险机构按照国家规定的标准,给予工伤保险待遇的社会保险制度。",
+                "工伤保险是什么?",
+                "工伤保险是指用人单位按照国家规定,为本单位的职工和用人单位的其他人员,缴纳工伤保险费,由保险机构按照国家规定的标准,给予工伤保险待遇的社会保险制度。",
             ]
         ],
     ),
 ):
     vs_path = os.path.join(VS_ROOT_PATH, knowledge_base_id)
-    resp = {}
-    if os.path.exists(vs_path) and knowledge_base_id:
-        for resp, history in local_doc_qa.get_knowledge_based_answer(
-            query=question, vs_path=vs_path, chat_history=history, streaming=False
-        ):
-            pass
-        source_documents = [
-            f"""出处 [{inum + 1}] {os.path.split(doc.metadata['source'])[-1]}:\n\n{doc.page_content}\n\n"""
-            f"""相关度:{doc.metadata['score']}\n\n"""
-            for inum, doc in enumerate(resp["source_documents"])
-        ]
-    else:
-        for resp_s, history in local_doc_qa.llm._call(prompt=question, history=history, streaming=False):
-            pass
-        resp["result"] = resp_s
-        source_documents =[("当前知识库为空,如需基于知识库进行问答,请先加载知识库后,再进行提问。")]
+    if not os.path.exists(vs_path):
+        raise ValueError(f"Knowledge base {knowledge_base_id} not found")
+
+    for resp, history in local_doc_qa.get_knowledge_based_answer(
+        query=question, vs_path=vs_path, chat_history=history, streaming=True
+    ):
+        pass
+    source_documents = [
+        f"""出处 [{inum + 1}] {os.path.split(doc.metadata['source'])[-1]}:\n\n{doc.page_content}\n\n"""
+        f"""相关度:{doc.metadata['score']}\n\n"""
+        for inum, doc in enumerate(resp["source_documents"])
+    ]
 
     return ChatMessage(
         question=question,
diff --git a/img/qr_code_11.jpg b/img/qr_code_11.jpg
deleted file mode 100644
index a52d511..0000000
Binary files a/img/qr_code_11.jpg and /dev/null differ
diff --git a/img/qr_code_12.jpg b/img/qr_code_12.jpg
new file mode 100644
index 0000000..e5e4262
Binary files /dev/null and b/img/qr_code_12.jpg differ
diff --git a/textsplitter/chinese_text_splitter.py b/textsplitter/chinese_text_splitter.py
index 2b18579..6b6c136 100644
--- a/textsplitter/chinese_text_splitter.py
+++ b/textsplitter/chinese_text_splitter.py
@@ -3,56 +3,77 @@ import re
 from typing import List
 from configs.model_config import SENTENCE_SIZE
 
+
 class ChineseTextSplitter(CharacterTextSplitter):
     def __init__(self, pdf: bool = False, **kwargs):
         super().__init__(**kwargs)
         self.pdf = pdf
 
-    def split_text1(self, text: str) -> List[str]:
+    def split_text1(self, text: str, use_document_segmentation: bool = False) -> List[str]:
+        # use_document_segmentation selects semantic document segmentation; the model used here is DAMO Academy's open-source nlp_bert_document-segmentation_chinese-base (paper: https://arxiv.org/abs/2107.09278)
+        # semantic segmentation requires modelscope[nlp]: pip install "modelscope[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+        # since three models are already in use, this may be unfriendly to low-spec GPUs, so the model is loaded on the CPU here; replace device with your own GPU id if needed
         if self.pdf:
re.sub(r"\n{3,}", "\n", text) text = re.sub('\s', ' ', text) text = text.replace("\n\n", "") - sent_sep_pattern = re.compile('([﹒﹔;﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; - sent_list = [] - for ele in sent_sep_pattern.split(text): - if sent_sep_pattern.match(ele) and sent_list: - sent_list[-1] += ele - elif ele: - sent_list.append(ele) + if use_document_segmentation: + result = p(documents=text) + sent_list = [i for i in result["text"].split("\n\t") if i] + else: + sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; + sent_list = [] + for ele in sent_sep_pattern.split(text): + if sent_sep_pattern.match(ele) and sent_list: + sent_list[-1] += ele + elif ele: + sent_list.append(ele) return sent_list - def split_text(self, text: str) -> List[str]: + def split_text(self, text: str, use_document_segmentation: bool = False) -> List[str]: if self.pdf: text = re.sub(r"\n{3,}", r"\n", text) text = re.sub('\s', " ", text) text = re.sub("\n\n", "", text) - text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text) # 单字符断句符 - text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) # 英文省略号 - text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) # 中文省略号 - text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text) - # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号 - text = text.rstrip() # 段尾如果有多余的\n就去掉它 - # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。 - ls = [i for i in text.split("\n") if i] - for ele in ls: - if len(ele) > SENTENCE_SIZE: - ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele) - ele1_ls = ele1.split("\n") - for ele_ele1 in ele1_ls: - if len(ele_ele1) > SENTENCE_SIZE: - ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1) - ele2_ls = ele_ele2.split("\n") - for ele_ele2 in ele2_ls: - if len(ele_ele2) > SENTENCE_SIZE: - ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2) - ele2_id = ele2_ls.index(ele_ele2) - ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[ele2_id + 1:] - ele_id = ele1_ls.index(ele_ele1) - ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:] - - id = ls.index(ele) - ls = ls[:id] + [i for i in ele1_ls if i] + ls[id+1:] - return ls - + if use_document_segmentation: + from modelscope.pipelines import pipeline + p = pipeline( + task="document-segmentation", + model='damo/nlp_bert_document-segmentation_chinese-base', + device="cpu") + result = p(documents=text) + sent_list = [i for i in result["text"].split("\n\t") if i] + return sent_list + else: + text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text) # 单字符断句符 + text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) # 英文省略号 + text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) # 中文省略号 + text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text) + # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号 + text = text.rstrip() # 段尾如果有多余的\n就去掉它 + # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。 + ls = [i for i in text.split("\n") if i] + for ele in ls: + if len(ele) > SENTENCE_SIZE: + ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele) + ele1_ls = ele1.split("\n") + for ele_ele1 in ele1_ls: + if len(ele_ele1) > SENTENCE_SIZE: + ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1) + ele2_ls = ele_ele2.split("\n") + for ele_ele2 in ele2_ls: + if len(ele_ele2) > SENTENCE_SIZE: + ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2) + ele2_id = ele2_ls.index(ele_ele2) + ele2_ls = ele2_ls[:ele2_id] 
+                                    ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[ele2_id + 1:]
+                            ele_id = ele1_ls.index(ele_ele1)
+                            ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
+                    id = ls.index(ele)
+                    ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
+            return ls
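
A note on the api.py change: with the LLM-only fallback branch removed, a request that names a missing knowledge base now raises ValueError inside the handler instead of answering from the bare model, so it surfaces as a server error unless an exception handler is added. Below is a minimal client sketch against the revised endpoint; the route path "/chat" and port 7861 are assumptions, since the diff shows only the handler itself.

import requests

# Hypothetical client call; the route path and port are assumptions, adjust them
# to however the FastAPI app actually mounts this handler.
resp = requests.post(
    "http://localhost:7861/chat",
    json={
        "knowledge_base_id": "kb1",   # must name an existing knowledge base, or the server now errors out
        "question": "工伤保险是什么?",
        "history": [],                # prior [question, answer] pairs, as in the Body() example above
    },
)
resp.raise_for_status()
print(resp.json())  # ChatMessage fields as defined in api.py, including question and source_documents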
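
And a short sketch of exercising the new use_document_segmentation flag on ChineseTextSplitter. The import path follows this repository's textsplitter package; the semantic branch assumes modelscope[nlp] is installed as the comments in the patch describe, and it downloads the DAMO model on first use.

from textsplitter.chinese_text_splitter import ChineseTextSplitter

sample = "工伤保险是什么?工伤保险是一种社会保险制度,由用人单位按照国家规定缴纳工伤保险费。"

splitter = ChineseTextSplitter(pdf=False)

# Default regex-based sentence splitting; no extra dependencies.
print(splitter.split_text(sample))

# Semantic segmentation via damo/nlp_bert_document-segmentation_chinese-base, run on CPU.
print(splitter.split_text(sample, use_document_segmentation=True))

Note that the semantic branch returns whole segments split on "\n\t" and skips the SENTENCE_SIZE recursion, so its chunks may exceed SENTENCE_SIZE.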