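Update text splitter: build the modelscope document-segmentation pipeline inside the AliTextSplitter method instead of at module import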

This commit is contained in:
imClumsyPanda 2023-05-08 19:23:06 +08:00
parent 47e9bdb122
commit 64275cb703
1 changed file with 6 additions and 5 deletions

View File

@@ -1,12 +1,7 @@
from langchain.text_splitter import CharacterTextSplitter from langchain.text_splitter import CharacterTextSplitter
import re import re
from typing import List from typing import List
from modelscope.pipelines import pipeline
p = pipeline(
task="document-segmentation",
model='damo/nlp_bert_document-segmentation_chinese-base',
device="cpu")
class AliTextSplitter(CharacterTextSplitter): class AliTextSplitter(CharacterTextSplitter):
def __init__(self, pdf: bool = False, **kwargs): def __init__(self, pdf: bool = False, **kwargs):
@@ -21,6 +16,12 @@ class AliTextSplitter(CharacterTextSplitter):
text = re.sub(r"\n{3,}", r"\n", text) text = re.sub(r"\n{3,}", r"\n", text)
text = re.sub('\s', " ", text) text = re.sub('\s', " ", text)
text = re.sub("\n\n", "", text) text = re.sub("\n\n", "", text)
from modelscope.pipelines import pipeline
p = pipeline(
task="document-segmentation",
model='damo/nlp_bert_document-segmentation_chinese-base',
device="cpu")
result = p(documents=text) result = p(documents=text)
sent_list = [i for i in result["text"].split("\n\t") if i] sent_list = [i for i in result["text"].split("\n\t") if i]
return sent_list return sent_list