from abc import ABC, abstractmethod
import json
import re
from typing import List

from langchain.document_loaders.unstructured import UnstructuredFileLoader

from document_loader.ocr import get_ocr


class RapidOCRLoader(UnstructuredFileLoader):
    """Document loader that extracts the text of an image file via OCR."""

    def _get_elements(self) -> List:
        """Run OCR on ``self.file_path`` and partition the recognized text.

        Returns:
            Whatever ``unstructured.partition.text.partition_text`` returns
            for the recognized text, called with ``self.unstructured_kwargs``.
        """

        def img2text(filepath):
            """Return the OCR text of *filepath*, one entry per line.

            Returns an empty string when the OCR engine yields no result.
            """
            ocr = get_ocr()
            result, _ = ocr(filepath)
            if not result:
                return ""
            # NOTE(review): index 1 of each result entry is presumably the
            # recognized text (entry layout depends on the engine returned
            # by get_ocr) -- confirm against that engine's documentation.
            return "\n".join([line[1] for line in result])

        text = img2text(self.file_path)
        # Imported at call time rather than at module import, as in the
        # original code.
        from unstructured.partition.text import partition_text

        return partition_text(text=text, **self.unstructured_kwargs)


if __name__ == "__main__":
    import sys

    # Manual smoke test: OCR one image and print the recognized text.
    # The image path may be given as the first command-line argument; without
    # one, the original developer's sample image path is used.
    default_path = "/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    loader = RapidOCRLoader(file_path=file_path)
    docs = loader.load()
    context = "\n".join([doc.page_content for doc in docs])
    print(context)