Langchain-Chatchat/document_loaders/customiedpdfloader.py

71 lines
2.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List
from langchain.document_loaders.unstructured import UnstructuredFileLoader
import tqdm
class CustomizedPDFLoader(UnstructuredFileLoader):
    """Load a PDF by extracting its text with PyPDF2.

    The extracted text is passed to unstructured's ``partition_text`` together
    with ``self.unstructured_kwargs``.
    """

    def _get_elements(self) -> List:
        """Return the elements partitioned from the text of ``self.file_path``.

        Returns:
            The result of ``partition_text`` applied to the text extracted
            from the PDF, called with ``self.unstructured_kwargs``.
        """

        def pdf2text(filepath):
            """Return the text of every page of *filepath*, each page newline-terminated."""
            # Kept as a function-scope import, as in the original code.
            import PyPDF2

            # The reader pulls page data from the open stream, so every page is
            # extracted inside the ``with`` block; the handle is then closed
            # even if extraction raises.
            with open(filepath, mode="rb") as pdf_file:
                doc = PyPDF2.PdfReader(pdf_file)
                page_count = len(doc.pages)
                print(f"文档页数:{page_count}")

                page_texts = []
                with tqdm.tqdm(
                    total=page_count,
                    desc="CustomizedPDFLoader context page index: 0",
                ) as b_unit:
                    for page_number, page in enumerate(doc.pages, start=1):
                        # Show the 1-based page being processed; tqdm redraws
                        # the bar when the description changes.
                        b_unit.set_description(
                            "CustomizedPDFLoader context page index: {}".format(page_number)
                        )
                        page_texts.append(page.extract_text() + "\n")
                        # Advance the bar so it actually reaches page_count.
                        b_unit.update(1)

            # Join once instead of growing a string page by page.
            return "".join(page_texts)

        # Alternative OCR-based extractor, kept disabled for reference:
        # def pdf2text(filepath):
        #     import fitz  # the fitz package shipped with pyMuPDF; not to be confused with `pip install fitz`
        #     from rapidocr_onnxruntime import RapidOCR
        #     import numpy as np
        #     ocr = RapidOCR()
        #     doc = fitz.open(filepath)
        #     resp = ""
        #     b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0")
        #     for i, page in enumerate(doc):
        #         # update the description
        #         b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
        #         # show the progress-bar update immediately
        #         b_unit.refresh()
        #         # TODO: adjust the handling according to the order of text and images
        #         text = page.get_text("")
        #         resp += text + "\n"
        #         img_list = page.get_images()
        #         for img in img_list:
        #             pix = fitz.Pixmap(doc, img[0])
        #             img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1)
        #             result, _ = ocr(img_array)
        #             if result:
        #                 ocr_result = [line[1] for line in result]
        #                 resp += "\n".join(ocr_result)
        #         # advance the progress bar
        #         b_unit.update(1)
        #     return resp

        text = pdf2text(self.file_path)
        # Kept as a function-scope import, as in the original code.
        from unstructured.partition.text import partition_text

        return partition_text(text=text, **self.unstructured_kwargs)
if __name__ == "__main__":
    # Manual smoke run: load a sample PDF from the working directory and
    # print the documents the loader returns.
    pdf_loader = CustomizedPDFLoader(
        file_path="变电站设备验收规范第28 部分避雷针.pdf",
    )
    documents = pdf_loader.load()
    print(documents)