From d898c7dd6c464a27e7c7d53cbb7646a7ceee2b13 Mon Sep 17 00:00:00 2001 From: imClumsyPanda Date: Sun, 7 May 2023 12:06:09 +0800 Subject: [PATCH] update textsplitter --- textsplitter/__init__.py | 3 +-- textsplitter/chinese_text_splitter.py | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/textsplitter/__init__.py b/textsplitter/__init__.py index 38a0587..b56fdfe 100644 --- a/textsplitter/__init__.py +++ b/textsplitter/__init__.py @@ -1,2 +1 @@ - -from .chinese_text_splitter import * \ No newline at end of file +from .chinese_text_splitter import ChineseTextSplitter diff --git a/textsplitter/chinese_text_splitter.py b/textsplitter/chinese_text_splitter.py index 6b6c136..4795699 100644 --- a/textsplitter/chinese_text_splitter.py +++ b/textsplitter/chinese_text_splitter.py @@ -18,8 +18,14 @@ class ChineseTextSplitter(CharacterTextSplitter): text = re.sub('\s', ' ', text) text = text.replace("\n\n", "") if use_document_segmentation: + from modelscope.pipelines import pipeline + p = pipeline( + task="document-segmentation", + model='damo/nlp_bert_document-segmentation_chinese-base', + device="cpu") result = p(documents=text) sent_list = [i for i in result["text"].split("\n\t") if i] + return sent_list else: sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; sent_list = []