from typing import List

from langchain.document_loaders.unstructured import UnstructuredFileLoader
import tqdm


class CustomizedPDFLoader(UnstructuredFileLoader):
    """PDF loader that extracts page text with PyPDF2 and partitions it as plain text."""

    def _get_elements(self) -> List:
        """Extract the text of ``self.file_path`` and partition it.

        Returns:
            Whatever ``unstructured.partition.text.partition_text`` returns for
            the concatenated page text, called with ``self.unstructured_kwargs``.
        """

        def pdf2text(filepath):
            """Return the text of every page in *filepath*, each followed by a newline."""
            # Imported lazily so PyPDF2 is only required when a PDF is actually loaded.
            import PyPDF2

            # Context managers guarantee that the file handle and the progress
            # bar are released even if text extraction raises part-way through.
            with open(filepath, mode="rb") as pdf_file:
                doc = PyPDF2.PdfReader(pdf_file)
                page_count = len(doc.pages)
                print(f"文档页数:{page_count}")

                page_texts = []
                with tqdm.tqdm(
                    total=page_count,
                    desc="CustomizedPDFLoader context page index: 0",
                ) as progress:
                    for page_number, page in enumerate(doc.pages, start=1):
                        # Show which (1-based) page is currently being processed.
                        progress.set_description(
                            "CustomizedPDFLoader context page index: {}".format(page_number)
                        )
                        # Display the new description immediately.
                        progress.refresh()
                        # Fall back to "" so a page with no extractable text
                        # cannot break the concatenation below.
                        page_texts.append(page.extract_text() or "")
                        # Advance the bar; without this it would stay at 0.
                        progress.update(1)

            return "".join(text + "\n" for text in page_texts)

        # Alternative implementation (disabled): PyMuPDF text extraction plus
        # RapidOCR on the images embedded in each page.
        #
        # def pdf2text(filepath):
        #     import fitz  # the fitz package shipped with PyMuPDF; not to be confused with `pip install fitz`
        #     from rapidocr_onnxruntime import RapidOCR
        #     import numpy as np
        #     ocr = RapidOCR()
        #     doc = fitz.open(filepath)
        #     resp = ""
        #
        #     b_unit = tqdm.tqdm(total=doc.page_count, desc="RapidOCRPDFLoader context page index: 0")
        #     for i, page in enumerate(doc):
        #
        #         # Update the description
        #         b_unit.set_description("RapidOCRPDFLoader context page index: {}".format(i))
        #         # Show the progress bar update immediately
        #         b_unit.refresh()
        #         # TODO: adjust the handling according to the order of text and images
        #         text = page.get_text("")
        #         resp += text + "\n"
        #
        #         img_list = page.get_images()
        #         for img in img_list:
        #             pix = fitz.Pixmap(doc, img[0])
        #             img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, -1)
        #             result, _ = ocr(img_array)
        #             if result:
        #                 ocr_result = [line[1] for line in result]
        #                 resp += "\n".join(ocr_result)
        #
        #         # Advance the progress
        #         b_unit.update(1)
        #     return resp

        text = pdf2text(self.file_path)
        from unstructured.partition.text import partition_text

        return partition_text(text=text, **self.unstructured_kwargs)


if __name__ == "__main__":
    # Manual smoke test: load a sample PDF and print the resulting documents.
    loader = CustomizedPDFLoader(file_path="变电站设备验收规范第28 部分避雷针.pdf")
    docs = loader.load()
    print(docs)