diff --git a/textsplitter/ali_text_splitter.py b/textsplitter/ali_text_splitter.py index c5f2896..1e62eb6 100644 --- a/textsplitter/ali_text_splitter.py +++ b/textsplitter/ali_text_splitter.py @@ -1,12 +1,7 @@ from langchain.text_splitter import CharacterTextSplitter import re from typing import List -from modelscope.pipelines import pipeline -p = pipeline( - task="document-segmentation", - model='damo/nlp_bert_document-segmentation_chinese-base', - device="cpu") class AliTextSplitter(CharacterTextSplitter): def __init__(self, pdf: bool = False, **kwargs): @@ -21,6 +16,12 @@ class AliTextSplitter(CharacterTextSplitter): text = re.sub(r"\n{3,}", r"\n", text) text = re.sub('\s', " ", text) text = re.sub("\n\n", "", text) + from modelscope.pipelines import pipeline + + p = pipeline( + task="document-segmentation", + model='damo/nlp_bert_document-segmentation_chinese-base', + device="cpu") result = p(documents=text) sent_list = [i for i in result["text"].split("\n\t") if i] return sent_list