diff --git a/loader/pdf_loader.py b/loader/pdf_loader.py index 3414121..2981d7a 100644 --- a/loader/pdf_loader.py +++ b/loader/pdf_loader.py @@ -15,13 +15,11 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader): full_dir_path = os.path.join(os.path.dirname(filepath), dir_path) if not os.path.exists(full_dir_path): os.makedirs(full_dir_path) - filename = os.path.split(filepath)[-1] ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False) doc = fitz.open(filepath) - txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename)) + txt_file_path = os.path.join(full_dir_path, f"{os.path.split(filepath)[-1]}.txt") img_name = os.path.join(full_dir_path, 'tmp.png') with open(txt_file_path, 'w', encoding='utf-8') as fout: - for i in range(doc.page_count): page = doc[i] text = page.get_text("")