ComponentDevelopment/OCRPython/extractor/identitycard_extractor.py

102 lines
3.2 KiB
Python
Raw Normal View History

2024-04-22 10:02:06 +08:00
import re
from extractor import Extractor
2024-04-22 17:12:21 +08:00
from configs.basic_config import logger
2024-04-22 10:02:06 +08:00
class IdentityCardExtractor(Extractor):
    """Extract labelled fields from OCR text of a Chinese resident identity card.

    Two public entry points exist because the OCR layouts they target place
    line breaks differently; they share one matching routine and differ only
    in four of the regular expressions used.

    Both return a dict that may contain any of the keys
    ``issuingAuthority``, ``validTime``, ``name``, ``gender``, ``ethnicity``,
    ``dateOfBirth``, ``address`` and ``idNumber``. A key is present only when
    its pattern matched.
    """

    # Layout where a label sits on its own line, followed by the value on the
    # next line, and the name is the line directly above the gender.
    _ID_CARD_PATTERNS = {
        "issuingAuthority": r"签发机关\n(.+?)\n",
        "validTime": r"有效期限\n(.+?)\n",
        "name": r"(\S+)\n(?:男|女)",
        "gender": r"(\S+)民族",
        "ethnicity": r"民族(\S+)",
        "dateOfBirth": r"(\d+年\d+月\d+日)",
        # The label alternation is non-capturing so that, like every other
        # field, the value is group 1. "佳址" is accepted as well as "住址"
        # (presumably a frequent OCR misreading of the label).
        "address": r"(?:住址|佳址)\s*(.*?)公民身份号码",
        "idNumber": r"(\d{18}|\d{17}[Xx])",
    }

    # Layout where label and value may share a line. Overriding existing keys
    # in a dict display keeps their original position, so the key order of
    # the result is the same as for the table above.
    _ID_CARD_PADDLE_PATTERNS = {
        **_ID_CARD_PATTERNS,
        "issuingAuthority": r"签发机关\n*(.+?)\n",
        "validTime": r"有效期限\n*(.+?)\n",
        "name": r"姓名(.*?)\n",
        "gender": r"(\S)民族",
    }

    def _match_fields(self, text: str, patterns: dict) -> dict:
        """Apply each pattern to the cleaned text and collect the captures.

        Args:
            text: Raw OCR output.
            patterns: Mapping of result key to a regex whose group 1 is the
                field value.

        Returns:
            Mapping of key to captured value, stripped and with embedded
            newlines removed. Empty if nothing matched or if any step raised.
        """
        try:
            # remove_blank_lines is not defined in this class; it is expected
            # to be inherited from Extractor.
            cleaned = self.remove_blank_lines(text)
            info = {}
            for key, pattern in patterns.items():
                # DOTALL lets the address capture span several OCR lines.
                match = re.search(pattern, cleaned, re.DOTALL)
                if match:
                    # Multi-line values are joined into a single line.
                    info[key] = match.group(1).strip().replace("\n", "")
            return info
        except Exception as e:
            # Best-effort contract: callers get an empty dict rather than an
            # exception; the failure is reported through the project logger.
            logger.error(e)
            return {}

    def extract_text(self, text: str) -> dict:
        """Extract fields from text where each label precedes its value on a new line.

        Args:
            text: Raw OCR output of the identity card.

        Returns:
            Dict of the fields that were found; ``{}`` on failure.
        """
        return self._match_fields(text, self._ID_CARD_PATTERNS)

    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract fields from text where label and value may share a line.

        Args:
            text: Raw OCR output of the identity card (judging by the method
                name, produced by PaddleOCR).

        Returns:
            Dict of the fields that were found; ``{}`` on failure.
        """
        return self._match_fields(text, self._ID_CARD_PADDLE_PATTERNS)
2024-04-22 10:02:06 +08:00
class InvoiceExtractor(Extractor):
    """Placeholder subclass of Extractor; extraction is not implemented yet."""

    def extract_text(self, text: str) -> dict:
        """Ignore ``text`` and return ``None``.

        NOTE(review): the ``dict`` return annotation does not match the
        current ``None`` result; confirm the intended contract when this
        stub is implemented.
        """
        return None
# text = """中华人民共和国
# 居民身份证
# 签发机关合肥市公安局庐阳分局
# 有效期限
# 2022.07.10-长期
# 姓名陈玉振
# 性别男
# 民族汉
# 出生1973年6月19日
# 佳址安徽省合肥市庐阳区固镇
# 路3150号森林城A6地块1
# 幢2001室
# 公民身份号码
# 440203197306192118"""
2024-04-24 08:41:51 +08:00
# text = """
# 中华人民共和国
# 居民身份证
# oo
# 签发机关宿州市公安局桥分局
# 有效期限2023.01.18-2043.01.18
# 姓名郭乾坤
# 性别男民族汉
# 出生1994年10月17日
# 住址 安徽省宿州市场桥区朱仙
# 庄镇郭庙村郭家组6号
# 公民身份号码
# 34220119941017327X
# """
# extractor = IdentityCardExtractor()
#
# jsonstring = extractor.extract_textbyPaddle(text)
# print(jsonstring)