49 lines
1.9 KiB
Python
49 lines
1.9 KiB
Python
from abc import ABC, abstractmethod
|
|
from typing import List
|
|
import json
|
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|
import re
|
|
from document_loader.ocr import get_ocr
|
|
|
|
|
|
class RapidOCRLoader(UnstructuredFileLoader):
    """Loader that runs OCR on an image file and partitions the recognized text."""

    def _get_elements(self) -> List:
        """Run OCR over ``self.file_path`` and partition the resulting text.

        Returns:
            Whatever ``partition_text`` produces for the OCR text, called with
            ``self.unstructured_kwargs`` forwarded as keyword arguments.
        """

        def img2text(filepath):
            # get_ocr() yields a callable; its first return value is the
            # recognition result, the second is ignored here.
            engine = get_ocr()
            detections, _ = engine(filepath)
            if not detections:
                # Nothing recognized: hand back an empty string.
                return ""
            # Item 1 of each entry is used as the text of that line
            # (presumably entries are box/text/score triples -- TODO confirm).
            return "\n".join([entry[1] for entry in detections])

        recognized = img2text(self.file_path)

        # Imported lazily, only once the OCR step has completed.
        from unstructured.partition.text import partition_text

        return partition_text(text=recognized, **self.unstructured_kwargs)
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: OCR a single image and print the recognized text.
    import sys

    # The original sample path is kept as the default; pass a different image
    # path as the first command-line argument instead of editing this file.
    default_image = "/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG"
    image_path = sys.argv[1] if len(sys.argv) > 1 else default_image

    loader = RapidOCRLoader(file_path=image_path)
    docs = loader.load()

    # Join the page contents of all returned documents, one per line.
    context = "\n".join(doc.page_content for doc in docs)
    print(context)

    # NOTE: earlier experiments post-processed `context` with ID-card
    # extraction helpers that are not defined in this file; that dead code
    # was removed.