update textsplitter

This commit is contained in:
imClumsyPanda 2023-05-07 12:06:09 +08:00
parent 4150af003a
commit d898c7dd6c
2 changed files with 7 additions and 2 deletions

View File

@@ -1,2 +1 @@
from .chinese_text_splitter import ChineseTextSplitter
from .chinese_text_splitter import *

View File

@@ -18,8 +18,14 @@ class ChineseTextSplitter(CharacterTextSplitter):
text = re.sub('\s', ' ', text)
text = text.replace("\n\n", "")
if use_document_segmentation:
from modelscope.pipelines import pipeline
p = pipeline(
task="document-segmentation",
model='damo/nlp_bert_document-segmentation_chinese-base',
device="cpu")
result = p(documents=text)
sent_list = [i for i in result["text"].split("\n\t") if i]
return sent_list
else:
sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del
sent_list = []