Update package imports and formatting

This commit is contained in:
imClumsyPanda 2023-08-10 21:50:38 +08:00
parent 8a4d9168fa
commit 8d463a31fd
3 changed files with 11 additions and 4 deletions

View File

@ -1,3 +1,3 @@
from .chinese_text_splitter import ChineseTextSplitter from .chinese_text_splitter import ChineseTextSplitter
from .ali_text_splitter import AliTextSplitter from .ali_text_splitter import AliTextSplitter
from .zh_title_enhance import zh_title_enhance from .zh_title_enhance import zh_title_enhance

View File

@ -16,7 +16,14 @@ class AliTextSplitter(CharacterTextSplitter):
text = re.sub(r"\n{3,}", r"\n", text) text = re.sub(r"\n{3,}", r"\n", text)
text = re.sub('\s', " ", text) text = re.sub('\s', " ", text)
text = re.sub("\n\n", "", text) text = re.sub("\n\n", "", text)
from modelscope.pipelines import pipeline try:
from modelscope.pipelines import pipeline
except ImportError:
raise ImportError(
"Could not import modelscope python package. "
"Please install modelscope with `pip install modelscope`. "
)
p = pipeline( p = pipeline(
task="document-segmentation", task="document-segmentation",

View File

@ -1,11 +1,11 @@
from langchain.text_splitter import CharacterTextSplitter from langchain.text_splitter import CharacterTextSplitter
import re import re
from typing import List from typing import List
from configs.model_config import SENTENCE_SIZE from configs.model_config import CHUNK_SIZE
class ChineseTextSplitter(CharacterTextSplitter): class ChineseTextSplitter(CharacterTextSplitter):
def __init__(self, pdf: bool = False, sentence_size: int = SENTENCE_SIZE, **kwargs): def __init__(self, pdf: bool = False, sentence_size: int = CHUNK_SIZE, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.pdf = pdf self.pdf = pdf
self.sentence_size = sentence_size self.sentence_size = sentence_size