# ComponentDevelopment/OCRPython/document_loader/imgloader.py
#
# 49 lines
# 1.9 KiB
# Python

from abc import ABC, abstractmethod
from typing import List
import json
from langchain.document_loaders.unstructured import UnstructuredFileLoader
import re
from document_loader.ocr import get_ocr
class RapidOCRLoader(UnstructuredFileLoader):
    """Loader that OCRs an image file and partitions the recognized text."""

    def _get_elements(self) -> List:
        """Run OCR on ``self.file_path`` and return the partitioned text.

        Returns:
            The result of ``partition_text`` applied to the newline-joined
            OCR text. When the OCR call yields a falsy result, an empty
            string is partitioned instead.
        """
        engine = get_ocr()
        detections, _ = engine(self.file_path)

        # Element 1 of each detection is kept (presumably the recognized
        # string -- confirm against the OCR engine's output format).
        if detections:
            ocr_text = "\n".join(item[1] for item in detections)
        else:
            ocr_text = ""

        # Imported lazily, only after the OCR step has finished.
        from unstructured.partition.text import partition_text

        return partition_text(text=ocr_text, **self.unstructured_kwargs)
if __name__ == "__main__":
    # Manual smoke test: OCR a single file and print the recognized text.
    # Usage: python imgloader.py [path-to-image]
    import sys

    # Fall back to the original developer sample when no path is supplied,
    # so running the script without arguments behaves exactly as before.
    default_path = "/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG"
    image_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    loader = RapidOCRLoader(file_path=image_path)
    docs = loader.load()
    context = "\n".join(doc.page_content for doc in docs)
    print(context)
    # NOTE: earlier experiments passed `context` on to ID-card field
    # extraction and JSON serialization; that code was already commented
    # out and has been dropped from this smoke test.