From dd3617fcdf9cdc791eafa9b7b32a47550aee168e Mon Sep 17 00:00:00 2001 From: "liunux@home" Date: Thu, 20 Jul 2023 12:32:37 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B0=86paddle=E7=9B=B8=E5=85=B3loader?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=E5=8A=A8=E6=80=81=E5=BC=95=E5=85=A5=EF=BC=8C?= =?UTF-8?q?=E5=8F=AF=E4=BB=A5=E5=9C=A8=E4=B8=8D=E4=B8=8A=E4=BC=A0pdf/image?= =?UTF-8?q?=E7=9F=A5=E8=AF=86=E6=96=87=E4=BB=B6=E7=9A=84=E5=89=8D=E6=8F=90?= =?UTF-8?q?=E4=B8=8B=E4=BD=BF=E7=94=A8protobuf=3D4.x=E3=80=82=20=E8=BF=99?= =?UTF-8?q?=E6=A0=B7=E5=8F=AF=E4=BB=A5=E4=BD=BF=E7=94=A8=E6=9C=80=E6=96=B0?= =?UTF-8?q?=E7=89=88streamlit=E5=92=8Cchainlit=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chains/local_doc_qa.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/chains/local_doc_qa.py b/chains/local_doc_qa.py index 2f7d8da..6085dfc 100644 --- a/chains/local_doc_qa.py +++ b/chains/local_doc_qa.py @@ -8,7 +8,6 @@ from typing import List from utils import torch_gc from tqdm import tqdm from pypinyin import lazy_pinyin -from loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader from models.base import (BaseAnswer, AnswerResult) from models.loader.args import parser @@ -59,6 +58,7 @@ def tree(filepath, ignore_dir_names=None, ignore_file_names=None): def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE): + if filepath.lower().endswith(".md"): loader = UnstructuredFileLoader(filepath, mode="elements") docs = loader.load() @@ -67,10 +67,14 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_T textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size) docs = loader.load_and_split(textsplitter) elif filepath.lower().endswith(".pdf"): + # 暂且将paddle相关的loader改为动态加载,可以在不上传pdf/image知识文件的前提下使用protobuf=4.x + from loader import UnstructuredPaddlePDFLoader loader = UnstructuredPaddlePDFLoader(filepath) textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size) docs = loader.load_and_split(textsplitter) elif filepath.lower().endswith(".jpg") or filepath.lower().endswith(".png"): + # 暂且将paddle相关的loader改为动态加载,可以在不上传pdf/image知识文件的前提下使用protobuf=4.x + from loader import UnstructuredPaddleImageLoader loader = UnstructuredPaddleImageLoader(filepath, mode="elements") textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size) docs = loader.load_and_split(text_splitter=textsplitter)