update pdf_loader.py

This commit is contained in:
imClumsyPanda 2023-05-21 15:05:35 +08:00
parent 94b4599cda
commit 3712eec6a9
1 changed file with 2 additions and 1 deletion

View File

@@ -29,7 +29,8 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
img_list = page.get_images() img_list = page.get_images()
for img in img_list: for img in img_list:
pix = fitz.Pixmap(doc, img[0]) pix = fitz.Pixmap(doc, img[0])
if pix.n - pix.alpha >= 4:
pix = fitz.Pixmap(fitz.csRGB, pix)
pix.save(img_name) pix.save(img_name)
result = ocr.ocr(img_name) result = ocr.ocr(img_name)