diff --git a/loader/image_loader.py b/loader/image_loader.py index d8e2047..1013e82 100644 --- a/loader/image_loader.py +++ b/loader/image_loader.py @@ -11,7 +11,7 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader): def _get_elements(self) -> List: def image_ocr_txt(filepath, dir_path="tmp_files"): - full_dir_path = os.path.join(filepath, dir_path) + full_dir_path = os.path.join(os.path.dirname(filepath), dir_path) if not os.path.exists(full_dir_path): os.makedirs(full_dir_path) filename = os.path.split(filepath)[-1] diff --git a/loader/pdf_loader.py b/loader/pdf_loader.py index ff886d6..a27eec1 100644 --- a/loader/pdf_loader.py +++ b/loader/pdf_loader.py @@ -12,14 +12,14 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader): def _get_elements(self) -> List: def pdf_ocr_txt(filepath, dir_path="tmp_files"): - full_dir_path = os.path.join(filepath, dir_path) + full_dir_path = os.path.join(os.path.dirname(filepath), dir_path) if not os.path.exists(full_dir_path): os.makedirs(full_dir_path) filename = os.path.split(filepath)[-1] ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False) doc = fitz.open(filepath) txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename)) - img_name = os.path.join(full_dir_path, 'tmp.png') + img_name = os.path.join(full_dir_path, '.tmp.png') with open(txt_file_path, 'w', encoding='utf-8') as fout: for i in range(doc.page_count):