update pdf_loader.py
This commit is contained in:
parent
94b4599cda
commit
3712eec6a9
|
|
@ -29,7 +29,8 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
|
||||||
img_list = page.get_images()
|
img_list = page.get_images()
|
||||||
for img in img_list:
|
for img in img_list:
|
||||||
pix = fitz.Pixmap(doc, img[0])
|
pix = fitz.Pixmap(doc, img[0])
|
||||||
|
if pix.n - pix.alpha >= 4:
|
||||||
|
pix = fitz.Pixmap(fitz.csRGB, pix)
|
||||||
pix.save(img_name)
|
pix.save(img_name)
|
||||||
|
|
||||||
result = ocr.ocr(img_name)
|
result = ocr.ocr(img_name)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue