From 75f8a7176812708549412a215c09291e432801c1 Mon Sep 17 00:00:00 2001 From: weiweiw <14335254+weiweiw22@user.noreply.gitee.com> Date: Wed, 14 Aug 2024 16:24:11 +0800 Subject: [PATCH] =?UTF-8?q?fix=20=E2=80=98=E5=AE=89=E5=BE=BD=E7=9C=81?= =?UTF-8?q?=E2=80=99=E5=A4=9A=E4=B8=AA=E2=80=98=E5=AE=89=E2=80=99=E7=9A=84?= =?UTF-8?q?=E8=AF=86=E5=88=AB=E5=92=8C=E5=9C=B0=E5=9D=80=E9=87=8C=E4=BB=A5?= =?UTF-8?q?=E6=95=B0=E5=AD=97=E6=89=93=E5=A4=B4=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- OCRPython/configs/basic_config.py | 2 + OCRPython/extractor/identitycard_extractor.py | 58 ++++++----- OCRPython/util/ocr_recognition.py | 9 +- OCRPython/validate.py | 98 +------------------ OCRPython/webApp.py | 2 - 5 files changed, 44 insertions(+), 125 deletions(-) diff --git a/OCRPython/configs/basic_config.py b/OCRPython/configs/basic_config.py index 11385a9..55f3e50 100644 --- a/OCRPython/configs/basic_config.py +++ b/OCRPython/configs/basic_config.py @@ -9,6 +9,7 @@ BASE64_DATA_INCOMPLETE = 20003 NO_TEXT_RECOGNIZED = 20004 OCR_RECOGNIZE_OTHER_EXCEPTION = 20005 RECOGNITION_INFO_PARSE_OTHER_EXCEPTION = 20006 +NO_DEFINED_FUNCTION_ERROR = 20007 error_codes = { @@ -19,6 +20,7 @@ error_codes = { NO_TEXT_RECOGNIZED: "识别本地图片路径存在,但没有识别出文字", OCR_RECOGNIZE_OTHER_EXCEPTION: "OCR识别身份证其他异常", RECOGNITION_INFO_PARSE_OTHER_EXCEPTION: "身份证信息解析其他异常", + NO_DEFINED_FUNCTION_ERROR:"没有实现的功能" } # 是否显示详细日志 log_verbose = True diff --git a/OCRPython/extractor/identitycard_extractor.py b/OCRPython/extractor/identitycard_extractor.py index 31a6de8..7ab87cc 100644 --- a/OCRPython/extractor/identitycard_extractor.py +++ b/OCRPython/extractor/identitycard_extractor.py @@ -98,6 +98,10 @@ class IdentityCardExtractor(Extractor): else: pattern = r"\d{18,}$" # 匹配1位以上的数字结尾 data["address"] = re.sub(pattern, "", data["address"]) + if data["address"].startswith("安安徽省"): + data["address"] ="安徽省" + data["address"][len("安安徽省"):] + if re.match(r'^[\d_-]+', data["address"]): + data["address"] = re.sub(r'^[\d_-]+', '', data["address"]) # 提取身份证号码 id_number = re.search(r"([123456]\d{17}|[123456]\d{16}[Xx])", text, re.DOTALL) if id_number: @@ -248,25 +252,19 @@ class IdentityCardExtractor(Extractor): # 公民身份号码 # 513401197807087411 # """ -text = """中华人民共和国 -居民身份证 -签发机关 -木里县公安局 -有效期限 -2020.03.16-2025.03.16 -名 -蒋子古 -姓 -男 -民族彝 -出生 -2005年1月4日 -住址 -四川省木里藏族自治县耗 -牛坪乡泥珠村5组29号 -公民身份号码 -513422200501044415 -""" +# text = """中华人民共和国 +# 居民身份证 +# 签发机关东至县公安局 +# 有效期限2021.07.02-2031.07.02 +# 姓名程文友 +# 性别男民族汉 +# 出生2002年2月7日 +# 住址安 +# 安徽省东至县胜利镇湖滨 +# 村张四组12号 +# 公民身份号码 +# 342921200202074436 +# """ # # text = """姓名苏龙格德·胡尔查巴特尔 # 性别男民族蒙古 @@ -351,9 +349,6 @@ text = """中华人民共和国 # 村峡山口村民组156号 # 公民身份号码 # 43038119831124301X""" -# extractor = IdentityCardExtractor() -# jsonstring = extractor.extract_textbyPaddle(text) -# print(jsonstring) @@ -380,3 +375,22 @@ text = """中华人民共和国 # else: # print("未找到性别信息") +# text = """姓名热西提·伊明 +# 性别 +# 民族维吾尔 +# 出生 +# 1971年6月6日 +# 住址 +# 19995-6 +# 新疆新和县无鲁都斯巴格镇 +# 它乾城社区6组19号 +# 公民身份号码 +# 652925197106062519 +# 中华人民共和国 +# 居民身份证 +# 有效期限 +# 2021.07.16-长期 +# """ +# extractor = IdentityCardExtractor() +# jsonstring = extractor.extract_textbyPaddle(text) +# print(jsonstring) diff --git a/OCRPython/util/ocr_recognition.py b/OCRPython/util/ocr_recognition.py index b17e8c6..74be4e7 100644 --- a/OCRPython/util/ocr_recognition.py +++ b/OCRPython/util/ocr_recognition.py @@ -29,14 +29,13 @@ class OCRRecognition: @staticmethod def extractIdCardInfoByPath(filePath1: str = "", filePath2: str = "") -> str: - logger.info(f"ocr加载开始计时") + # logger.info(f"ocr加载开始计时") start_time = time.time() # 记录结束时间 ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory text = "" finalResult = { "code": LOCAL_PATH_NOT_EXIST, "msg": error_codes[LOCAL_PATH_NOT_EXIST], - } try: if len(filePath1) > 0: @@ -73,7 +72,7 @@ class OCRRecognition: jsonString = json.dumps(tempdict, ensure_ascii=False) end_time = time.time() # 记录结束时间 execution_time = end_time - start_time # 计算执行时间 - logger.info(f"extractIdCardInfoByPath 耗时{execution_time}秒") + # logger.info(f"extractIdCardInfoByPath 耗时{execution_time}秒") return jsonString else: finalResult["code"] = NO_TEXT_RECOGNIZED @@ -87,7 +86,7 @@ class OCRRecognition: "code": BASE64_DATA_INCOMPLETE, "msg": error_codes[BASE64_DATA_INCOMPLETE], } - logger.info(f"extractIdCardInfoByBase64Data") + # logger.info(f"extractIdCardInfoByBase64Data") start_time = time.time() # 记录结束时间 jsonString = "" try: @@ -118,5 +117,5 @@ class OCRRecognition: end_time = time.time() # 记录结束时间 execution_time = end_time - start_time # 计算执行时间 - logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒") + # logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒") return jsonString diff --git a/OCRPython/validate.py b/OCRPython/validate.py index 892c7e6..9319bd0 100644 --- a/OCRPython/validate.py +++ b/OCRPython/validate.py @@ -1,102 +1,8 @@ # -*- coding: utf-8 -*- -import sys -import io -import os -from paddleocr import PaddleOCR -import time -from configs.basic_config import logger -from extractor.identitycard_extractor import IdentityCardExtractor -import base64 -import json -sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') -def extractIdCardInfo(type:int, filePath1: str, filePath2: str)->str: - if (0 == type): - return extractIdCardInfoByPath(filePath1, filePath2) - elif (1 == type): - return extractIdCardInfoByBase64Data(filePath1,filePath2) - else: - pass +from util import ocr_recognition -def extractIdCardInfoByPath(filePath1: str, filePath2: str)->str: - ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory - text = "" - start_time = time.time() # 记录结束时间 - jsonString = "" - try: - if os.path.exists(filePath1): - result = ocr.ocr(filePath1, cls=False) - for idx in range(len(result)): - res = result[idx] - for line in res: - text += (line[1][0] + '\n') - - if os.path.exists(filePath2): - result = ocr.ocr(filePath2, cls=False) - for idx in range(len(result)): - res = result[idx] - for line in res: - text += (line[1][0] + '\n') - except Exception as e: - logger.error(e) - print(e) - - if 0 != len(text): - logger.info(f"text:{text}") - extractor = IdentityCardExtractor() - tempdict = extractor.extract_textbyPaddle(text) - jsonString = json.dumps(tempdict, ensure_ascii=False) - end_time = time.time() # 记录结束时间 - execution_time = end_time - start_time # 计算执行时间 - logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒") - return jsonString - -def extractIdCardInfoByBase64Data(base64data1:str, base64Data2: str)->str: - logger.info(f"extractIdCardInfoByBase64Data") - start_time = time.time() # 记录结束时间 - jsonString = "" - try: - if 0!=len(base64data1): - logger.info(f"not base64data1.empty()") - image_data1 = base64.b64decode(base64data1) - with open("file1.png", "wb") as file: - file.write(image_data1) - - if 0!=len(base64Data2): - logger.info(f"not base64Data2.empty()") - image_data2 = base64.b64decode(base64Data2) - with open("file2.png", "wb") as file: - file.write(image_data2) - - if os.path.exists("file1.png") and os.path.exists("file2.png"): - logger.info(f"file1.png and file2.png exist") - jsonString = extractIdCardInfoByPath("file1.png","file2.png") - os.remove("file1.png") - os.remove("file2.png") - elif os.path.exists("file1.png"): - logger.info(f"file1.png exist") - jsonString = extractIdCardInfoByPath("file1.png","") - os.remove("file1.png") - elif os.path.exists("file2.png"): - logger.info(f"file2.png exist") - jsonString = extractIdCardInfoByPath("file2.png","") - os.remove("file2.png") - except Exception as e: - logger.error(e) - - end_time = time.time() # 记录结束时间 - execution_time = end_time - start_time # 计算执行时间 - logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒") - return jsonString - -# with open('/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG', 'rb') as image_file: -# base64_image_string = base64.b64encode(image_file.read()).decode('utf-8') -# -# jsonString = extractIdCardInfoByBase64Data(base64_image_string,"") -# jsonString = extractIdCardInfoByBase64Data("/Users/wangvivi/Desktop/Code/ocrtest/images/2.jpg","/Users/wangvivi/Desktop/Code/ocrtest/images/1.jpg") -# print(jsonString) -# # -jsonString = extractIdCardInfoByPath("./images/han.jpg","") +jsonString = ocr_recognition.OCRRecognition.extractIdCardInfoByPath("./images/han.jpg","") print(jsonString) # jsonString = extractIdCardInfoByPath("/Users/wangvivi/Desktop/Code/ocrtest/images/2.jpg","/Users/wangvivi/Desktop/Code/ocrtest/images/1.jpg") # print(jsonString) diff --git a/OCRPython/webApp.py b/OCRPython/webApp.py index 73fe873..d635bf2 100644 --- a/OCRPython/webApp.py +++ b/OCRPython/webApp.py @@ -1,7 +1,6 @@ from fastapi import FastAPI from pydantic import BaseModel from util import ocr_recognition -from configs.basic_config import * app = FastAPI() @@ -12,7 +11,6 @@ class IdentifyRecognitionParams(BaseModel): @app.post("/recognition") async def recognition(params: IdentifyRecognitionParams): - # logger.info(f"python recognition里的参数,{params.type}, {params.recognitionFrontData}, {params.recognitionBackData}") returnStr = ocr_recognition.OCRRecognition.extractIdCardInfo(params.type, params.recognitionFrontData, params.recognitionBackData) return returnStr