diff --git a/loader/image_loader.py b/loader/image_loader.py index b14899a..d9e468e 100644 --- a/loader/image_loader.py +++ b/loader/image_loader.py @@ -15,7 +15,7 @@ class UnstructuredPaddleImageLoader(UnstructuredFileLoader): if not os.path.exists(full_dir_path): os.makedirs(full_dir_path) filename = os.path.split(filepath)[-1] - ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False) + ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False) result = ocr.ocr(img=filepath) ocr_result = [i[1][0] for line in result for i in line] diff --git a/loader/pdf_loader.py b/loader/pdf_loader.py index 58dd7a5..67eb826 100644 --- a/loader/pdf_loader.py +++ b/loader/pdf_loader.py @@ -15,7 +15,7 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader): full_dir_path = os.path.join(os.path.dirname(filepath), dir_path) if not os.path.exists(full_dir_path): os.makedirs(full_dir_path) - ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False) + ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False) doc = fitz.open(filepath) txt_file_path = os.path.join(full_dir_path, f"{os.path.split(filepath)[-1]}.txt") img_name = os.path.join(full_dir_path, 'tmp.png')