59 lines
2.4 KiB
Python
59 lines
2.4 KiB
Python
"""Loader that loads image files."""
|
|
from typing import List
|
|
|
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|
from paddleocr import PaddleOCR
|
|
import os
|
|
import fitz
|
|
import nltk
|
|
from configs.model_config import NLTK_DATA_PATH
|
|
|
|
# Put the configured NLTK data directory first so it is searched before the
# locations NLTK already has on its path.
nltk.data.path = [NLTK_DATA_PATH, *nltk.data.path]
|
|
|
|
class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
    """Loader that extracts PDF text and OCRs the images embedded in each page."""

    def _get_elements(self) -> List:
        """Convert the PDF to a text file and partition that file.

        For every page, the page's text followed by the PaddleOCR output for
        the images referenced by that page is written to
        ``<pdf dir>/tmp_files/<pdf file name>.txt``. That file is then handed
        to ``unstructured``'s ``partition_text`` together with
        ``self.unstructured_kwargs``.

        Returns:
            Whatever ``partition_text`` returns for the generated text file.
        """

        def pdf_ocr_txt(filepath, dir_path="tmp_files"):
            """Write the text and OCR content of ``filepath`` to a .txt file.

            Args:
                filepath: Path of the PDF to convert.
                dir_path: Name of the working directory created next to the
                    PDF; it receives the text file and a temporary image.

            Returns:
                Path of the generated text file.
            """
            full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(full_dir_path, exist_ok=True)

            ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False)
            txt_file_path = os.path.join(full_dir_path, f"{os.path.split(filepath)[-1]}.txt")
            # Every extracted image is saved to this one scratch file before OCR.
            img_name = os.path.join(full_dir_path, 'tmp.png')

            doc = fitz.open(filepath)
            try:
                with open(txt_file_path, 'w', encoding='utf-8') as fout:
                    for page_index in range(doc.page_count):
                        page = doc[page_index]
                        # NOTE(review): the empty option string is kept from the
                        # original code; presumably PyMuPDF treats it as plain
                        # text extraction - confirm against the PyMuPDF docs.
                        text = page.get_text("")
                        fout.write(text)
                        fout.write("\n")

                        for img in page.get_images():
                            # img[0] is passed as the image reference inside doc.
                            pix = fitz.Pixmap(doc, img[0])
                            if pix.n - pix.alpha >= 4:
                                # Four or more colour components (e.g. CMYK):
                                # convert to RGB before saving as PNG.
                                pix = fitz.Pixmap(fitz.csRGB, pix)
                            pix.save(img_name)

                            result = ocr.ocr(img_name)
                            # NOTE(review): PaddleOCR can apparently report an
                            # image without detected text as None (confirm for
                            # the pinned version); skip such entries instead of
                            # failing while iterating over them.
                            ocr_lines = [
                                entry[1][0]
                                for line in (result or [])
                                if line
                                for entry in line
                            ]
                            if ocr_lines:
                                fout.write("\n".join(ocr_lines))
                                # Terminate the OCR block so the next image or
                                # page does not continue on the same line.
                                fout.write("\n")
            finally:
                # Release the PDF handle and the scratch image even when text
                # extraction or OCR raises.
                doc.close()
                if os.path.exists(img_name):
                    os.remove(img_name)
            return txt_file_path

        txt_file_path = pdf_ocr_txt(self.file_path)
        # Imported here, as in the original code, so the dependency is only
        # loaded when elements are actually requested.
        from unstructured.partition.text import partition_text

        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual check: run the loader on the sample PDF path and print each result.
    import sys

    # Parent of the directory containing this file; added to sys.path and used
    # as the base of the sample document path.
    project_root = os.path.dirname(os.path.dirname(__file__))
    sys.path.append(project_root)

    filepath = os.path.join(project_root, "knowledge_base", "samples", "content", "test.pdf")
    loader = UnstructuredPaddlePDFLoader(filepath, mode="elements")
    for doc in loader.load():
        print(doc)
|