From be92b7cfde527ddcbdc7082cef08ef721ca821ef Mon Sep 17 00:00:00 2001 From: weiweiw <14335254+weiweiw22@user.noreply.gitee.com> Date: Tue, 30 Apr 2024 10:43:16 +0800 Subject: [PATCH] fix the identity recognition issue for minority population --- OCRPython/extractor/identitycard_extractor.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/OCRPython/extractor/identitycard_extractor.py b/OCRPython/extractor/identitycard_extractor.py index 352e289..2a9ea6c 100644 --- a/OCRPython/extractor/identitycard_extractor.py +++ b/OCRPython/extractor/identitycard_extractor.py @@ -64,7 +64,15 @@ class IdentityCardExtractor(Extractor): def extract_textbyPaddle(self, text:str)->dict: try: - result = {} + result = { + "issuingAuthority": "", + "validTime": "", + "name": "", + "gender": "", + "ethnicity": "", + "dateOfBirth": "", + "address": "", + "idNumber": ""} # 提取签发机关 issuing_authority = re.search(r"签发机关\n*(.+?)\n", text, re.DOTALL) if issuing_authority: @@ -80,7 +88,7 @@ class IdentityCardExtractor(Extractor): if name: tempName = name.group(1).strip() if tempName in "性别男" or tempName in "性别女": - name = re.search(r"(.*?)\s*姓名", text, re.DOTALL) + name = re.search(r"(\S+)\s*姓名", text, re.DOTALL) result["name"] = name.group(1).strip() else: result["name"] = name.group(1).strip() @@ -284,6 +292,16 @@ class IdentityCardExtractor(Extractor): # 公民身份号码 # 513422200501044415 # """ +# +# text = """姓名苏龙格德·胡尔查巴特尔 +# 性别男民族蒙古 +# 出生1973年10支月27日 +# 内蒙古赤峰市巴林右旗沙布 +# 台苏木树中嘎查 +# 中华人民共和国 +# 居民身份证 +# 签发机关巴林右旗公安局 +# 有效期限2004.10.27-2024.10.26""" # extractor = IdentityCardExtractor() # jsonstring = extractor.extract_textbyPaddle(text) # print(jsonstring)