fix: 修复住址开头“安徽省”被识别成“安安徽省”(多出一个“安”)的问题,以及住址以数字、下划线或连字符打头的问题
This commit is contained in:
parent
213bb69f73
commit
75f8a71768
|
|
@ -9,6 +9,7 @@ BASE64_DATA_INCOMPLETE = 20003
|
||||||
NO_TEXT_RECOGNIZED = 20004
|
NO_TEXT_RECOGNIZED = 20004
|
||||||
OCR_RECOGNIZE_OTHER_EXCEPTION = 20005
|
OCR_RECOGNIZE_OTHER_EXCEPTION = 20005
|
||||||
RECOGNITION_INFO_PARSE_OTHER_EXCEPTION = 20006
|
RECOGNITION_INFO_PARSE_OTHER_EXCEPTION = 20006
|
||||||
|
NO_DEFINED_FUNCTION_ERROR = 20007
|
||||||
|
|
||||||
|
|
||||||
error_codes = {
|
error_codes = {
|
||||||
|
|
@ -19,6 +20,7 @@ error_codes = {
|
||||||
NO_TEXT_RECOGNIZED: "识别本地图片路径存在,但没有识别出文字",
|
NO_TEXT_RECOGNIZED: "识别本地图片路径存在,但没有识别出文字",
|
||||||
OCR_RECOGNIZE_OTHER_EXCEPTION: "OCR识别身份证其他异常",
|
OCR_RECOGNIZE_OTHER_EXCEPTION: "OCR识别身份证其他异常",
|
||||||
RECOGNITION_INFO_PARSE_OTHER_EXCEPTION: "身份证信息解析其他异常",
|
RECOGNITION_INFO_PARSE_OTHER_EXCEPTION: "身份证信息解析其他异常",
|
||||||
|
NO_DEFINED_FUNCTION_ERROR:"没有实现的功能"
|
||||||
}
|
}
|
||||||
# 是否显示详细日志
|
# 是否显示详细日志
|
||||||
log_verbose = True
|
log_verbose = True
|
||||||
|
|
|
||||||
|
|
@ -98,6 +98,10 @@ class IdentityCardExtractor(Extractor):
|
||||||
else:
|
else:
|
||||||
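pattern = r"\d{18,}$" # match a trailing run of 18 or more digits at the end of the address
|
pattern = r"\d{18,}$" # match a trailing run of 18 or more digits at the end of the address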
|
||||||
data["address"] = re.sub(pattern, "", data["address"])
|
data["address"] = re.sub(pattern, "", data["address"])
|
||||||
|
if data["address"].startswith("安安徽省"):
|
||||||
|
data["address"] ="安徽省" + data["address"][len("安安徽省"):]
|
||||||
|
if re.match(r'^[\d_-]+', data["address"]):
|
||||||
|
data["address"] = re.sub(r'^[\d_-]+', '', data["address"])
|
||||||
# 提取身份证号码
|
# 提取身份证号码
|
||||||
id_number = re.search(r"([123456]\d{17}|[123456]\d{16}[Xx])", text, re.DOTALL)
|
id_number = re.search(r"([123456]\d{17}|[123456]\d{16}[Xx])", text, re.DOTALL)
|
||||||
if id_number:
|
if id_number:
|
||||||
|
|
@ -248,25 +252,19 @@ class IdentityCardExtractor(Extractor):
|
||||||
# 公民身份号码
|
# 公民身份号码
|
||||||
# 513401197807087411
|
# 513401197807087411
|
||||||
# """
|
# """
|
||||||
text = """中华人民共和国
|
# text = """中华人民共和国
|
||||||
居民身份证
|
# 居民身份证
|
||||||
签发机关
|
# 签发机关东至县公安局
|
||||||
木里县公安局
|
# 有效期限2021.07.02-2031.07.02
|
||||||
有效期限
|
# 姓名程文友
|
||||||
2020.03.16-2025.03.16
|
# 性别男民族汉
|
||||||
名
|
# 出生2002年2月7日
|
||||||
蒋子古
|
# 住址安
|
||||||
姓
|
# 安徽省东至县胜利镇湖滨
|
||||||
男
|
# 村张四组12号
|
||||||
民族彝
|
# 公民身份号码
|
||||||
出生
|
# 342921200202074436
|
||||||
2005年1月4日
|
# """
|
||||||
住址
|
|
||||||
四川省木里藏族自治县耗
|
|
||||||
牛坪乡泥珠村5组29号
|
|
||||||
公民身份号码
|
|
||||||
513422200501044415
|
|
||||||
"""
|
|
||||||
#
|
#
|
||||||
# text = """姓名苏龙格德·胡尔查巴特尔
|
# text = """姓名苏龙格德·胡尔查巴特尔
|
||||||
# 性别男民族蒙古
|
# 性别男民族蒙古
|
||||||
|
|
@ -351,9 +349,6 @@ text = """中华人民共和国
|
||||||
# 村峡山口村民组156号
|
# 村峡山口村民组156号
|
||||||
# 公民身份号码
|
# 公民身份号码
|
||||||
# 43038119831124301X"""
|
# 43038119831124301X"""
|
||||||
# extractor = IdentityCardExtractor()
|
|
||||||
# jsonstring = extractor.extract_textbyPaddle(text)
|
|
||||||
# print(jsonstring)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -380,3 +375,22 @@ text = """中华人民共和国
|
||||||
# else:
|
# else:
|
||||||
# print("未找到性别信息")
|
# print("未找到性别信息")
|
||||||
|
|
||||||
|
# text = """姓名热西提·伊明
|
||||||
|
# 性别
|
||||||
|
# 民族维吾尔
|
||||||
|
# 出生
|
||||||
|
# 1971年6月6日
|
||||||
|
# 住址
|
||||||
|
# 19995-6
|
||||||
|
# 新疆新和县无鲁都斯巴格镇
|
||||||
|
# 它乾城社区6组19号
|
||||||
|
# 公民身份号码
|
||||||
|
# 652925197106062519
|
||||||
|
# 中华人民共和国
|
||||||
|
# 居民身份证
|
||||||
|
# 有效期限
|
||||||
|
# 2021.07.16-长期
|
||||||
|
# """
|
||||||
|
# extractor = IdentityCardExtractor()
|
||||||
|
# jsonstring = extractor.extract_textbyPaddle(text)
|
||||||
|
# print(jsonstring)
|
||||||
|
|
|
||||||
|
|
@ -29,14 +29,13 @@ class OCRRecognition:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def extractIdCardInfoByPath(filePath1: str = "", filePath2: str = "") -> str:
|
def extractIdCardInfoByPath(filePath1: str = "", filePath2: str = "") -> str:
|
||||||
logger.info(f"ocr加载开始计时")
|
# logger.info(f"ocr加载开始计时")
|
||||||
start_time = time.time() # record the start time
|
start_time = time.time() # record the start time
|
||||||
ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
|
ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
|
||||||
text = ""
|
text = ""
|
||||||
finalResult = {
|
finalResult = {
|
||||||
"code": LOCAL_PATH_NOT_EXIST,
|
"code": LOCAL_PATH_NOT_EXIST,
|
||||||
"msg": error_codes[LOCAL_PATH_NOT_EXIST],
|
"msg": error_codes[LOCAL_PATH_NOT_EXIST],
|
||||||
|
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
if len(filePath1) > 0:
|
if len(filePath1) > 0:
|
||||||
|
|
@ -73,7 +72,7 @@ class OCRRecognition:
|
||||||
jsonString = json.dumps(tempdict, ensure_ascii=False)
|
jsonString = json.dumps(tempdict, ensure_ascii=False)
|
||||||
end_time = time.time() # 记录结束时间
|
end_time = time.time() # 记录结束时间
|
||||||
execution_time = end_time - start_time # 计算执行时间
|
execution_time = end_time - start_time # 计算执行时间
|
||||||
logger.info(f"extractIdCardInfoByPath 耗时{execution_time}秒")
|
# logger.info(f"extractIdCardInfoByPath 耗时{execution_time}秒")
|
||||||
return jsonString
|
return jsonString
|
||||||
else:
|
else:
|
||||||
finalResult["code"] = NO_TEXT_RECOGNIZED
|
finalResult["code"] = NO_TEXT_RECOGNIZED
|
||||||
|
|
@ -87,7 +86,7 @@ class OCRRecognition:
|
||||||
"code": BASE64_DATA_INCOMPLETE,
|
"code": BASE64_DATA_INCOMPLETE,
|
||||||
"msg": error_codes[BASE64_DATA_INCOMPLETE],
|
"msg": error_codes[BASE64_DATA_INCOMPLETE],
|
||||||
}
|
}
|
||||||
logger.info(f"extractIdCardInfoByBase64Data")
|
# logger.info(f"extractIdCardInfoByBase64Data")
|
||||||
start_time = time.time() # record the start time
|
start_time = time.time() # record the start time
|
||||||
jsonString = ""
|
jsonString = ""
|
||||||
try:
|
try:
|
||||||
|
|
@ -118,5 +117,5 @@ class OCRRecognition:
|
||||||
|
|
||||||
end_time = time.time() # 记录结束时间
|
end_time = time.time() # 记录结束时间
|
||||||
execution_time = end_time - start_time # 计算执行时间
|
execution_time = end_time - start_time # 计算执行时间
|
||||||
logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒")
|
# logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒")
|
||||||
return jsonString
|
return jsonString
|
||||||
|
|
|
||||||
|
|
@ -1,102 +1,8 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
import sys
|
|
||||||
import io
|
|
||||||
import os
|
|
||||||
from paddleocr import PaddleOCR
|
|
||||||
import time
|
|
||||||
from configs.basic_config import logger
|
|
||||||
from extractor.identitycard_extractor import IdentityCardExtractor
|
|
||||||
import base64
|
|
||||||
import json
|
|
||||||
|
|
||||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
from util import ocr_recognition
|
||||||
def extractIdCardInfo(type:int, filePath1: str, filePath2: str)->str:
|
|
||||||
if (0 == type):
|
|
||||||
return extractIdCardInfoByPath(filePath1, filePath2)
|
|
||||||
elif (1 == type):
|
|
||||||
return extractIdCardInfoByBase64Data(filePath1,filePath2)
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def extractIdCardInfoByPath(filePath1: str, filePath2: str)->str:
|
jsonString = ocr_recognition.OCRRecognition.extractIdCardInfoByPath("./images/han.jpg","")
|
||||||
ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory
|
|
||||||
text = ""
|
|
||||||
start_time = time.time() # record the start time
|
|
||||||
jsonString = ""
|
|
||||||
try:
|
|
||||||
if os.path.exists(filePath1):
|
|
||||||
result = ocr.ocr(filePath1, cls=False)
|
|
||||||
for idx in range(len(result)):
|
|
||||||
res = result[idx]
|
|
||||||
for line in res:
|
|
||||||
text += (line[1][0] + '\n')
|
|
||||||
|
|
||||||
if os.path.exists(filePath2):
|
|
||||||
result = ocr.ocr(filePath2, cls=False)
|
|
||||||
for idx in range(len(result)):
|
|
||||||
res = result[idx]
|
|
||||||
for line in res:
|
|
||||||
text += (line[1][0] + '\n')
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(e)
|
|
||||||
print(e)
|
|
||||||
|
|
||||||
if 0 != len(text):
|
|
||||||
logger.info(f"text:{text}")
|
|
||||||
extractor = IdentityCardExtractor()
|
|
||||||
tempdict = extractor.extract_textbyPaddle(text)
|
|
||||||
jsonString = json.dumps(tempdict, ensure_ascii=False)
|
|
||||||
end_time = time.time() # 记录结束时间
|
|
||||||
execution_time = end_time - start_time # 计算执行时间
|
|
||||||
logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒")
|
|
||||||
return jsonString
|
|
||||||
|
|
||||||
def extractIdCardInfoByBase64Data(base64data1:str, base64Data2: str)->str:
|
|
||||||
logger.info(f"extractIdCardInfoByBase64Data")
|
|
||||||
start_time = time.time() # record the start time
|
|
||||||
jsonString = ""
|
|
||||||
try:
|
|
||||||
if 0!=len(base64data1):
|
|
||||||
logger.info(f"not base64data1.empty()")
|
|
||||||
image_data1 = base64.b64decode(base64data1)
|
|
||||||
with open("file1.png", "wb") as file:
|
|
||||||
file.write(image_data1)
|
|
||||||
|
|
||||||
if 0!=len(base64Data2):
|
|
||||||
logger.info(f"not base64Data2.empty()")
|
|
||||||
image_data2 = base64.b64decode(base64Data2)
|
|
||||||
with open("file2.png", "wb") as file:
|
|
||||||
file.write(image_data2)
|
|
||||||
|
|
||||||
if os.path.exists("file1.png") and os.path.exists("file2.png"):
|
|
||||||
logger.info(f"file1.png and file2.png exist")
|
|
||||||
jsonString = extractIdCardInfoByPath("file1.png","file2.png")
|
|
||||||
os.remove("file1.png")
|
|
||||||
os.remove("file2.png")
|
|
||||||
elif os.path.exists("file1.png"):
|
|
||||||
logger.info(f"file1.png exist")
|
|
||||||
jsonString = extractIdCardInfoByPath("file1.png","")
|
|
||||||
os.remove("file1.png")
|
|
||||||
elif os.path.exists("file2.png"):
|
|
||||||
logger.info(f"file2.png exist")
|
|
||||||
jsonString = extractIdCardInfoByPath("file2.png","")
|
|
||||||
os.remove("file2.png")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(e)
|
|
||||||
|
|
||||||
end_time = time.time() # 记录结束时间
|
|
||||||
execution_time = end_time - start_time # 计算执行时间
|
|
||||||
logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒")
|
|
||||||
return jsonString
|
|
||||||
|
|
||||||
# with open('/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG', 'rb') as image_file:
|
|
||||||
# base64_image_string = base64.b64encode(image_file.read()).decode('utf-8')
|
|
||||||
#
|
|
||||||
# jsonString = extractIdCardInfoByBase64Data(base64_image_string,"")
|
|
||||||
# jsonString = extractIdCardInfoByBase64Data("/Users/wangvivi/Desktop/Code/ocrtest/images/2.jpg","/Users/wangvivi/Desktop/Code/ocrtest/images/1.jpg")
|
|
||||||
# print(jsonString)
|
|
||||||
# #
|
|
||||||
jsonString = extractIdCardInfoByPath("./images/han.jpg","")
|
|
||||||
print(jsonString)
|
print(jsonString)
|
||||||
# jsonString = extractIdCardInfoByPath("/Users/wangvivi/Desktop/Code/ocrtest/images/2.jpg","/Users/wangvivi/Desktop/Code/ocrtest/images/1.jpg")
|
# jsonString = extractIdCardInfoByPath("/Users/wangvivi/Desktop/Code/ocrtest/images/2.jpg","/Users/wangvivi/Desktop/Code/ocrtest/images/1.jpg")
|
||||||
# print(jsonString)
|
# print(jsonString)
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from util import ocr_recognition
|
from util import ocr_recognition
|
||||||
from configs.basic_config import *
|
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
|
|
@ -12,7 +11,6 @@ class IdentifyRecognitionParams(BaseModel):
|
||||||
|
|
||||||
@app.post("/recognition")
|
@app.post("/recognition")
|
||||||
async def recognition(params: IdentifyRecognitionParams):
|
async def recognition(params: IdentifyRecognitionParams):
|
||||||
# logger.info(f"python recognition里的参数,{params.type}, {params.recognitionFrontData}, {params.recognitionBackData}")
|
|
||||||
returnStr = ocr_recognition.OCRRecognition.extractIdCardInfo(params.type, params.recognitionFrontData, params.recognitionBackData)
|
returnStr = ocr_recognition.OCRRecognition.extractIdCardInfo(params.type, params.recognitionFrontData, params.recognitionBackData)
|
||||||
return returnStr
|
return returnStr
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue