From e2d7452c7b21eaff37d6f34b91165c24d88f00e0 Mon Sep 17 00:00:00 2001 From: imClumsyPanda Date: Sat, 6 May 2023 23:26:49 +0800 Subject: [PATCH] update chinese_text_splitter.py --- chains/local_doc_qa.py | 3 ++- configs/model_config.py | 3 +++ textsplitter/chinese_text_splitter.py | 39 ++++++++++++++++++++++++--- webui.py | 4 +-- 4 files changed, 43 insertions(+), 6 deletions(-) diff --git a/chains/local_doc_qa.py b/chains/local_doc_qa.py index f2e3be6..da83da7 100644 --- a/chains/local_doc_qa.py +++ b/chains/local_doc_qa.py @@ -103,7 +103,8 @@ def similarity_search_with_score_by_vector( doc.page_content += doc0.page_content if not isinstance(doc, Document): raise ValueError(f"Could not find document for id {_id}, got {doc}") - docs.append((doc, scores[0][j])) + doc_score = min([scores[0][id] for id in [indices[0].tolist().index(i) for i in id_seq if i in indices[0]]]) + docs.append((doc, doc_score)) torch_gc() return docs diff --git a/configs/model_config.py b/configs/model_config.py index 859273b..8de2919 100644 --- a/configs/model_config.py +++ b/configs/model_config.py @@ -50,6 +50,9 @@ PROMPT_TEMPLATE = """已知信息: 根据上述已知信息,简洁和专业的来回答用户的问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题” 或 “没有提供足够的相关信息”,不允许在答案中添加编造成分,答案请使用中文。 问题是:{question}""" +# 文本分句长度 +SENTENCE_SIZE = 100 + # 匹配后单段上下文长度 CHUNK_SIZE = 250 diff --git a/textsplitter/chinese_text_splitter.py b/textsplitter/chinese_text_splitter.py index 72b1903..2b18579 100644 --- a/textsplitter/chinese_text_splitter.py +++ b/textsplitter/chinese_text_splitter.py @@ -1,19 +1,19 @@ from langchain.text_splitter import CharacterTextSplitter import re from typing import List - +from configs.model_config import SENTENCE_SIZE class ChineseTextSplitter(CharacterTextSplitter): def __init__(self, pdf: bool = False, **kwargs): super().__init__(**kwargs) self.pdf = pdf - def split_text(self, text: str) -> List[str]: + def split_text1(self, text: str) -> List[str]: if self.pdf: text = re.sub(r"\n{3,}", "\n", text) text = re.sub('\s', ' ', text) text = text.replace("\n\n", "") - sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; + sent_sep_pattern = re.compile('([﹒﹔;﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; sent_list = [] for ele in sent_sep_pattern.split(text): if sent_sep_pattern.match(ele) and sent_list: @@ -22,4 +22,37 @@ class ChineseTextSplitter(CharacterTextSplitter): sent_list.append(ele) return sent_list + def split_text(self, text: str) -> List[str]: + if self.pdf: + text = re.sub(r"\n{3,}", r"\n", text) + text = re.sub('\s', " ", text) + text = re.sub("\n\n", "", text) + text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text) # 单字符断句符 + text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) # 英文省略号 + text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) # 中文省略号 + text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text) + # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号 + text = text.rstrip() # 段尾如果有多余的\n就去掉它 + # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。 + ls = [i for i in text.split("\n") if i] + for ele in ls: + if len(ele) > SENTENCE_SIZE: + ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele) + ele1_ls = ele1.split("\n") + for ele_ele1 in ele1_ls: + if len(ele_ele1) > SENTENCE_SIZE: + ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1) + ele2_ls = ele_ele2.split("\n") + for ele_ele2 in ele2_ls: + if len(ele_ele2) > SENTENCE_SIZE: + ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2) + ele2_id = ele2_ls.index(ele_ele2) + ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[ele2_id + 1:] + ele_id = ele1_ls.index(ele_ele1) + ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:] + + id = ls.index(ele) + ls = ls[:id] + [i for i in ele1_ls if i] + ls[id+1:] + return ls + diff --git a/webui.py b/webui.py index 571987d..1936ebf 100644 --- a/webui.py +++ b/webui.py @@ -15,8 +15,8 @@ def get_vs_list(): lst = os.listdir(VS_ROOT_PATH) if not lst: return lst_default - lst.sort(reverse=True) - return lst + lst_default + lst.sort() + return lst_default + lst vs_list = get_vs_list()