Fix the identity card recognition issue for ethnic minority cardholders

This commit is contained in:
weiweiw 2024-04-30 10:43:16 +08:00
parent 4e920f5542
commit be92b7cfde
1 changed file with 20 additions and 2 deletions

View File

@ -64,7 +64,15 @@ class IdentityCardExtractor(Extractor):
def extract_textbyPaddle(self, text:str)->dict:
try:
result = {}
result = {
"issuingAuthority": "",
"validTime": "",
"name": "",
"gender": "",
"ethnicity": "",
"dateOfBirth": "",
"address": "",
"idNumber": ""}
# 提取签发机关
issuing_authority = re.search(r"签发机关\n*(.+?)\n", text, re.DOTALL)
if issuing_authority:
@ -80,7 +88,7 @@ class IdentityCardExtractor(Extractor):
if name:
tempName = name.group(1).strip()
if tempName in "性别男" or tempName in "性别女":
name = re.search(r"(.*?)\s*姓名", text, re.DOTALL)
name = re.search(r"(\S+)\s*姓名", text, re.DOTALL)
result["name"] = name.group(1).strip()
else:
result["name"] = name.group(1).strip()
@ -284,6 +292,16 @@ class IdentityCardExtractor(Extractor):
# 公民身份号码
# 513422200501044415
# """
#
# text = """姓名苏龙格德·胡尔查巴特尔
# 性别男民族蒙古
# 出生1973年10支月27日
# 内蒙古赤峰市巴林右旗沙布
# 台苏木树中嘎查
# 中华人民共和国
# 居民身份证
# 签发机关巴林右旗公安局
# 有效期限2004.10.27-2024.10.26"""
# extractor = IdentityCardExtractor()
# jsonstring = extractor.extract_textbyPaddle(text)
# print(jsonstring)