fix：修复“安徽省”被识别出多个“安”字（如“安安徽省”）以及地址以数字开头的问题

2024-08-14 16:24:11 +08:00 · 2024-08-14 16:24:11 +08:00 · 75f8a71768
parent 213bb69f73
commit 75f8a71768
5 changed files with 44 additions and 125 deletions
--- a/OCRPython/configs/basic_config.py
+++ b/OCRPython/configs/basic_config.py
@ -9,6 +9,7 @@ BASE64_DATA_INCOMPLETE = 20003
 NO_TEXT_RECOGNIZED = 20004
 OCR_RECOGNIZE_OTHER_EXCEPTION = 20005
 RECOGNITION_INFO_PARSE_OTHER_EXCEPTION = 20006
+NO_DEFINED_FUNCTION_ERROR = 20007


 error_codes = {
@ -19,6 +20,7 @@ error_codes = {
    NO_TEXT_RECOGNIZED: "识别本地图片路径存在，但没有识别出文字",
    OCR_RECOGNIZE_OTHER_EXCEPTION: "OCR识别身份证其他异常",
    RECOGNITION_INFO_PARSE_OTHER_EXCEPTION: "身份证信息解析其他异常",
+    NO_DEFINED_FUNCTION_ERROR:"没有实现的功能"
 }
 # 是否显示详细日志
 log_verbose = True
--- a/OCRPython/extractor/identitycard_extractor.py
+++ b/OCRPython/extractor/identitycard_extractor.py
@ -98,6 +98,10 @@ class IdentityCardExtractor(Extractor):
                else:
                    pattern = r"\d{18,}$"  # 匹配结尾处18位及以上的连续数字
                    data["address"] = re.sub(pattern, "", data["address"])
+            if data["address"].startswith("安安徽省"):
+                data["address"] ="安徽省" + data["address"][len("安安徽省"):]
+            if re.match(r'^[\d_-]+', data["address"]):
+                data["address"] = re.sub(r'^[\d_-]+', '', data["address"])
            # 提取身份证号码
            id_number = re.search(r"([123456]\d{17}|[123456]\d{16}[Xx])", text, re.DOTALL)
            if id_number:
@ -248,25 +252,19 @@ class IdentityCardExtractor(Extractor):
 # 公民身份号码
 # 513401197807087411
 # """
-text = """中华人民共和国
-居民身份证
-签发机关
-木里县公安局
-有效期限
-2020.03.16-2025.03.16
-名
-蒋子古
-姓
-男
-民族彝
-出生
-2005年1月4日
-住址
-四川省木里藏族自治县耗
-牛坪乡泥珠村5组29号
-公民身份号码
-513422200501044415
-"""
+# text = """中华人民共和国
+# 居民身份证
+# 签发机关东至县公安局
+# 有效期限2021.07.02-2031.07.02
+# 姓名程文友
+# 性别男民族汉
+# 出生2002年2月7日
+# 住址安
+# 安徽省东至县胜利镇湖滨
+# 村张四组12号
+# 公民身份号码
+# 342921200202074436
+# """
 #
 # text = """姓名苏龙格德·胡尔查巴特尔
 # 性别男民族蒙古
@ -351,9 +349,6 @@ text = """中华人民共和国
 # 村峡山口村民组156号
 # 公民身份号码
 # 43038119831124301X"""
-# extractor = IdentityCardExtractor()
-# jsonstring = extractor.extract_textbyPaddle(text)
-# print(jsonstring)



@ -380,3 +375,22 @@ text = """中华人民共和国
 # else:
 #     print("未找到性别信息")

+# text = """姓名热西提·伊明
+# 性别
+# 民族维吾尔
+# 出生
+# 1971年6月6日
+# 住址
+# 19995-6
+# 新疆新和县无鲁都斯巴格镇
+# 它乾城社区6组19号
+# 公民身份号码
+# 652925197106062519
+# 中华人民共和国
+# 居民身份证
+# 有效期限
+# 2021.07.16-长期
+# """
+# extractor = IdentityCardExtractor()
+# jsonstring = extractor.extract_textbyPaddle(text)
+# print(jsonstring)
--- a/OCRPython/util/ocr_recognition.py
+++ b/OCRPython/util/ocr_recognition.py
@ -29,14 +29,13 @@ class OCRRecognition:

    @staticmethod
    def extractIdCardInfoByPath(filePath1: str = "", filePath2: str = "") -> str:
-        logger.info(f"ocr加载开始计时")
+        # logger.info(f"ocr加载开始计时")
        start_time = time.time()  # 记录开始时间
        ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
        text = ""
        finalResult = {
            "code": LOCAL_PATH_NOT_EXIST,
            "msg": error_codes[LOCAL_PATH_NOT_EXIST],
-
        }
        try:
            if len(filePath1) > 0:
@ -73,7 +72,7 @@ class OCRRecognition:
            jsonString = json.dumps(tempdict, ensure_ascii=False)
            end_time = time.time()  # 记录结束时间
            execution_time = end_time - start_time  # 计算执行时间
-            logger.info(f"extractIdCardInfoByPath 耗时{execution_time}秒")
+            # logger.info(f"extractIdCardInfoByPath 耗时{execution_time}秒")
            return jsonString
        else:
            finalResult["code"] = NO_TEXT_RECOGNIZED
@ -87,7 +86,7 @@ class OCRRecognition:
            "code": BASE64_DATA_INCOMPLETE,
            "msg": error_codes[BASE64_DATA_INCOMPLETE],
        }
-        logger.info(f"extractIdCardInfoByBase64Data")
+        # logger.info(f"extractIdCardInfoByBase64Data")
        start_time = time.time()  # 记录开始时间
        jsonString = ""
        try:
@ -118,5 +117,5 @@ class OCRRecognition:

        end_time = time.time()  # 记录结束时间
        execution_time = end_time - start_time  # 计算执行时间
-        logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒")
+        # logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒")
        return jsonString
--- a/OCRPython/validate.py
+++ b/OCRPython/validate.py
@ -1,102 +1,8 @@
 # -*- coding: utf-8 -*-
-import sys
-import io
-import os
-from paddleocr import PaddleOCR
-import time
-from configs.basic_config import logger
-from extractor.identitycard_extractor import IdentityCardExtractor
-import base64
-import json

-sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
-def extractIdCardInfo(type:int, filePath1: str, filePath2: str)->str:
-    if (0 == type):
-        return extractIdCardInfoByPath(filePath1, filePath2)
-    elif (1 == type):
-        return extractIdCardInfoByBase64Data(filePath1,filePath2)
-    else:
-        pass
+from util  import ocr_recognition

-def extractIdCardInfoByPath(filePath1: str, filePath2: str)->str:
-    ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
-    text = ""
-    start_time = time.time()  # 记录结束时间
-    jsonString = ""
-    try:
-        if os.path.exists(filePath1):
-            result = ocr.ocr(filePath1, cls=False)
-            for idx in range(len(result)):
-                res = result[idx]
-                for line in res:
-                    text += (line[1][0] + '\n')
-
-        if os.path.exists(filePath2):
-            result = ocr.ocr(filePath2, cls=False)
-            for idx in range(len(result)):
-                res = result[idx]
-                for line in res:
-                    text += (line[1][0] + '\n')
-    except Exception as e:
-        logger.error(e)
-        print(e)
-
-    if 0 != len(text):
-        logger.info(f"text:{text}")
-        extractor = IdentityCardExtractor()
-        tempdict = extractor.extract_textbyPaddle(text)
-        jsonString = json.dumps(tempdict, ensure_ascii=False)
-        end_time = time.time()  # 记录结束时间
-        execution_time = end_time - start_time  # 计算执行时间
-        logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒")
-    return jsonString
-
-def extractIdCardInfoByBase64Data(base64data1:str, base64Data2: str)->str:
-    logger.info(f"extractIdCardInfoByBase64Data")
-    start_time = time.time()  # 记录结束时间
-    jsonString = ""
-    try:
-        if 0!=len(base64data1):
-            logger.info(f"not base64data1.empty()")
-            image_data1 = base64.b64decode(base64data1)
-            with open("file1.png", "wb") as file:
-                file.write(image_data1)
-
-        if 0!=len(base64Data2):
-            logger.info(f"not base64Data2.empty()")
-            image_data2 = base64.b64decode(base64Data2)
-            with open("file2.png", "wb") as file:
-                file.write(image_data2)
-
-        if os.path.exists("file1.png") and os.path.exists("file2.png"):
-            logger.info(f"file1.png and file2.png exist")
-            jsonString = extractIdCardInfoByPath("file1.png","file2.png")
-            os.remove("file1.png")
-            os.remove("file2.png")
-        elif os.path.exists("file1.png"):
-            logger.info(f"file1.png exist")
-            jsonString = extractIdCardInfoByPath("file1.png","")
-            os.remove("file1.png")
-        elif os.path.exists("file2.png"):
-            logger.info(f"file2.png exist")
-            jsonString = extractIdCardInfoByPath("file2.png","")
-            os.remove("file2.png")
-    except Exception as e:
-        logger.error(e)
-
-    end_time = time.time()  # 记录结束时间
-    execution_time = end_time - start_time  # 计算执行时间
-    logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒")
-    return jsonString
-
-# with open('/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG', 'rb') as image_file:
-#     base64_image_string = base64.b64encode(image_file.read()).decode('utf-8')
-#
-# jsonString = extractIdCardInfoByBase64Data(base64_image_string,"")
-# jsonString = extractIdCardInfoByBase64Data("/Users/wangvivi/Desktop/Code/ocrtest/images/2.jpg","/Users/wangvivi/Desktop/Code/ocrtest/images/1.jpg")
-# print(jsonString)
-# #
-jsonString = extractIdCardInfoByPath("./images/han.jpg","")
+jsonString = ocr_recognition.OCRRecognition.extractIdCardInfoByPath("./images/han.jpg","")
 print(jsonString)
 # jsonString = extractIdCardInfoByPath("/Users/wangvivi/Desktop/Code/ocrtest/images/2.jpg","/Users/wangvivi/Desktop/Code/ocrtest/images/1.jpg")
 # print(jsonString)
--- a/OCRPython/webApp.py
+++ b/OCRPython/webApp.py
@ -1,7 +1,6 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
 from util  import ocr_recognition
-from configs.basic_config import *

 app = FastAPI()

@ -12,7 +11,6 @@ class IdentifyRecognitionParams(BaseModel):

@app.post("/recognition")
 async def recognition(params: IdentifyRecognitionParams):
-    # logger.info(f"python recognition里的参数，{params.type}, {params.recognitionFrontData}, {params.recognitionBackData}")
    returnStr = ocr_recognition.OCRRecognition.extractIdCardInfo(params.type, params.recognitionFrontData, params.recognitionBackData)
    return returnStr