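Update loaders: create tmp_files next to the input file (via os.path.dirname) and rename the temporary PNG to .tmp.png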

This commit is contained in:
imClumsyPanda 2023-05-13 11:16:51 +08:00
parent d2716addd6
commit 6a273501ee
2 changed files with 3 additions and 3 deletions

View File

@@ -11,7 +11,7 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader):
     def _get_elements(self) -> List:
         def image_ocr_txt(filepath, dir_path="tmp_files"):
-            full_dir_path = os.path.join(filepath, dir_path)
+            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
             if not os.path.exists(full_dir_path):
                 os.makedirs(full_dir_path)
             filename = os.path.split(filepath)[-1]

View File

@@ -12,14 +12,14 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
     def _get_elements(self) -> List:
         def pdf_ocr_txt(filepath, dir_path="tmp_files"):
-            full_dir_path = os.path.join(filepath, dir_path)
+            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
             if not os.path.exists(full_dir_path):
                 os.makedirs(full_dir_path)
             filename = os.path.split(filepath)[-1]
             ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
             doc = fitz.open(filepath)
             txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
-            img_name = os.path.join(full_dir_path, 'tmp.png')
+            img_name = os.path.join(full_dir_path, '.tmp.png')
             with open(txt_file_path, 'w', encoding='utf-8') as fout:
                 for i in range(doc.page_count):