174 lines
6.7 KiB
Python
174 lines
6.7 KiB
Python
# -*- coding: utf-8 -*-
|
|
import sys
|
|
import io
|
|
from extractor.identitycard_extractor import IdentityCardExtractor
|
|
from document_loader.imgloader import RapidOCRLoader
|
|
from configs.basic_config import logger
|
|
import base64
|
|
import time
|
|
import os
|
|
import json
|
|
|
|
# Replace sys.stdout with a UTF-8 text wrapper around the underlying binary
# stream, so everything this module prints is encoded as UTF-8.
# NOTE: this runs at import time and affects the whole process.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
|
|
|
def extractIdCardInfo(type: int, filePath1: str, filePath2: str) -> str:
    """Dispatch an ID-card extraction request according to the input kind.

    Args:
        type: 0 when the two arguments are image file paths, 1 when they
            are base64-encoded image data.
        filePath1: Path (type 0) or base64 data (type 1) of the first image.
        filePath2: Path (type 0) or base64 data (type 1) of the second image.

    Returns:
        The string returned by the selected extraction function, or ""
        when ``type`` is neither 0 nor 1.
    """
    if type == 0:
        return extractIdCardInfoByPath(filePath1, filePath2)
    if type == 1:
        return extractIdCardInfoByBase64Data(filePath1, filePath2)
    # Unsupported kind: honour the declared ``str`` return type instead of
    # falling through and returning None (which callers would print as
    # the literal text "None").
    return ""
|
|
|
|
def extractIdCardInfoByPath(filePath1: str, filePath2: str) -> str:
    """Run OCR on up to two image files and return the extracted fields as JSON.

    Each path that is non-empty and exists on disk is loaded with
    ``RapidOCRLoader``. The recognised text of the loaded images is handed
    to ``IdentityCardExtractor.extract_text`` and the result is serialised
    with ``json.dumps(..., ensure_ascii=False)``.

    Args:
        filePath1: Path of the first image; skipped when empty or missing.
        filePath2: Path of the second image; skipped when empty or missing.

    Returns:
        The extraction result as a JSON string, or "" when any step raised
        (the exception is logged and not propagated).
    """
    start_time = time.time()  # record the start time
    jsonString = ""
    try:
        texts = []
        for path in (filePath1, filePath2):
            if path and os.path.exists(path):
                docs = RapidOCRLoader(path).load()
                texts.append("\n".join(doc.page_content for doc in docs))
        # Join with a newline so the last line of the first image is not
        # fused with the first line of the second image.
        context = "\n".join(texts)

        extractor = IdentityCardExtractor()
        tempdict = extractor.extract_text(context)
        # Must be assigned to the variable that is returned below; the
        # previous code stored it under a different name and always
        # returned an empty string.
        jsonString = json.dumps(tempdict, ensure_ascii=False)
    except Exception as e:
        logger.error(e)

    execution_time = time.time() - start_time  # elapsed wall-clock seconds
    logger.info(f"extractIdCardInfo 耗时{execution_time}秒")
    return jsonString
|
|
|
|
def extractIdCardInfoByBase64Data(base64data1: str, base64Data2: str) -> str:
    """Decode up to two base64 images to temporary files and extract ID-card info.

    Each non-empty argument is base64-decoded and written to a ``.png``
    file inside a private temporary directory; the resulting path(s) are
    passed to ``extractIdCardInfoByPath``. The temporary directory and its
    files are removed when the function finishes, whether or not an
    exception occurred.

    Args:
        base64data1: Base64 text of the first image, or "" if absent.
        base64Data2: Base64 text of the second image, or "" if absent.

    Returns:
        The string returned by ``extractIdCardInfoByPath``, or "" when both
        inputs are empty or any step raised (the exception is logged).
    """
    # Local import so this block brings its own dependency into scope.
    import tempfile

    logger.info(f"extractIdCardInfoByBase64Data")
    start_time = time.time()  # record the start time
    jsonString = ""
    try:
        # A per-call directory avoids clashes between concurrent runs and
        # guarantees we never pick up a stale file1.png/file2.png left in
        # the working directory by an earlier, failed invocation.
        with tempfile.TemporaryDirectory() as tmp_dir:
            path1 = ""
            path2 = ""

            if base64data1:
                logger.info(f"not base64data1.empty()")
                image_data1 = base64.b64decode(base64data1)
                path1 = os.path.join(tmp_dir, "file1.png")
                with open(path1, "wb") as file:
                    file.write(image_data1)

            if base64Data2:
                logger.info(f"not base64Data2.empty()")
                image_data2 = base64.b64decode(base64Data2)
                path2 = os.path.join(tmp_dir, "file2.png")
                with open(path2, "wb") as file:
                    file.write(image_data2)

            # Only files written by this call are considered.
            if path1 and path2:
                logger.info(f"file1.png and file2.png exist")
                jsonString = extractIdCardInfoByPath(path1, path2)
            elif path1:
                logger.info(f"file1.png exist")
                jsonString = extractIdCardInfoByPath(path1, "")
            elif path2:
                logger.info(f"file2.png exist")
                jsonString = extractIdCardInfoByPath(path2, "")
    except Exception as e:
        logger.error(e)

    execution_time = time.time() - start_time  # elapsed wall-clock seconds
    logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒")
    return jsonString
|
|
|
|
# with open('/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG', 'rb') as image_file:
|
|
# base64_image_string = base64.b64encode(image_file.read()).decode('utf-8')
|
|
|
|
# jsonString = extractIdCardInfoByBase64Data(base64_image_string,"")
|
|
# print(jsonString)
|
|
|
|
# jsonString = extractIdCardInfoByPath("/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG","")
|
|
# print(jsonString)
|
|
|
|
def splitStdinPayload(input_data: str) -> tuple:
    """Split the text read from stdin into the two per-image payloads.

    The first line is the first payload and the second line is the second
    payload; a missing line yields "". ``str.splitlines`` is used so that
    "\\n" and "\\r\\n" endings are both handled (text-mode stdin has already
    translated line endings, so splitting on ``os.linesep`` fails on
    Windows) and a trailing newline does not create an extra empty part.

    Args:
        input_data: Everything read from standard input.

    Returns:
        A ``(data1, data2)`` tuple of strings.
    """
    lines = input_data.splitlines()
    data1 = lines[0] if lines else ""
    data2 = lines[1] if len(lines) > 1 else ""
    return data1, data2


if __name__ == "__main__":
    # Command-line entry point.
    #   script TYPE ARG1 ARG2  -> both inputs come from the command line
    #   script TYPE            -> inputs are read from stdin, one per line
    # The resulting string is printed to stdout.
    try:
        logger.info(f"main.py len of parameter: {len(sys.argv)}")
        jsonString = ""
        if len(sys.argv) > 3:
            logger.info(f"{sys.argv[1]}")
            logger.info(f"{sys.argv[2]}")
            logger.info(f"{sys.argv[3]}")
            jsonString = extractIdCardInfo(int(sys.argv[1]), sys.argv[2], sys.argv[3])
        elif len(sys.argv) > 1:
            logger.info(f"python 脚本里的接收到的参数是:")
            logger.info(f"{sys.argv[1]}")
            logger.info(f"开始执行sys.stdin.read")
            input_data = sys.stdin.read()
            logger.info("")
            logger.info(f"{len(input_data)}")
            data1, data2 = splitStdinPayload(input_data)
            logger.info(f"{len(data1)}")
            logger.info(f"{len(data2)}")

            jsonString = extractIdCardInfo(int(sys.argv[1]), data1, data2)

        print(jsonString)

    except KeyboardInterrupt:
        logger.error("KeyboardInterrupt")

    except IndexError:
        # Argument list problem: log it instead of crashing.
        logger.error("参数过长,未指定足够的参数")

    except OverflowError:
        # Overflow while handling the command-line arguments.
        logger.error("命令行参数过长,导致溢出错误")

    except EOFError:
        # End of the input stream was reached.
        logger.error("已经到达输入流的末尾")

    except Exception as e:
        # Any other failure. The exception is embedded in the message
        # because passing it as a separate positional argument without a
        # placeholder loses it (or triggers a logging format error).
        logger.error(f"发生了异常: {e}")
|
|
|
|
|
|
|
|
|
|
# if __name__ == "__main__":
|
|
# #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG")
|
|
# #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG")
|
|
# #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/20230726163834.png")
|
|
# #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/QQ截图20230726163813.png")
|
|
# # loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG")
|
|
# # #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/032002200511-91445598.pdf")
|
|
# # #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/fapiao.jpg")
|
|
|
|
# # docs = loader.load()
|
|
# # context = "\n".join([doc.page_content for doc in docs])
|
|
# # print(context)
|
|
|
|
# # extractor = IdentityCardExtractor()
|
|
# # jsonString = extractor.extract_text(context)
|
|
# # print(jsonString)
|
|
# # context = remove_blank_lines(context)
|
|
# # print("*"*20)
|
|
# # print(context)
|
|
# # info = extract_id_card_info(context)
|
|
# # jsonString = json.dumps(info, ensure_ascii=False)
|
|
# # print(jsonString)
|
|
|
|
# result = extractIdCardInfo("/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG")
|
|
# print(result)
|