diff --git a/chains/local_doc_qa.py b/chains/local_doc_qa.py index 2f7d8da..6085dfc 100644 --- a/chains/local_doc_qa.py +++ b/chains/local_doc_qa.py @@ -8,7 +8,6 @@ from typing import List from utils import torch_gc from tqdm import tqdm from pypinyin import lazy_pinyin -from loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader from models.base import (BaseAnswer, AnswerResult) from models.loader.args import parser @@ -59,6 +58,7 @@ def tree(filepath, ignore_dir_names=None, ignore_file_names=None): def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE): + if filepath.lower().endswith(".md"): loader = UnstructuredFileLoader(filepath, mode="elements") docs = loader.load() @@ -67,10 +67,14 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_T textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size) docs = loader.load_and_split(textsplitter) elif filepath.lower().endswith(".pdf"): + # 暂且将paddle相关的loader改为动态加载,可以在不上传pdf/image知识文件的前提下使用protobuf=4.x + from loader import UnstructuredPaddlePDFLoader loader = UnstructuredPaddlePDFLoader(filepath) textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size) docs = loader.load_and_split(textsplitter) elif filepath.lower().endswith(".jpg") or filepath.lower().endswith(".png"): + # 暂且将paddle相关的loader改为动态加载,可以在不上传pdf/image知识文件的前提下使用protobuf=4.x + from loader import UnstructuredPaddleImageLoader loader = UnstructuredPaddleImageLoader(filepath, mode="elements") textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size) docs = loader.load_and_split(text_splitter=textsplitter)