# ComponentDevelopment/OCRPython/extractor/identitycard_extractor.py
#
# 73 lines
# 2.2 KiB
# Python

import re
from extractor import Extractor
import json
def _search_fields(cleaned_text: str, patterns: dict) -> dict:
    """Apply each regex in ``patterns`` to ``cleaned_text`` and collect group 1.

    Args:
        cleaned_text: Text to search.
        patterns: Mapping of output key -> regex with one capturing group.

    Returns:
        A dict holding, for every pattern that matched, the first capture
        with surrounding whitespace stripped and embedded newlines removed.
        Keys whose pattern did not match are omitted.
    """
    info = {}
    for key, pattern in patterns.items():
        # re.DOTALL lets "." cross line breaks, so a value wrapped over
        # several lines (e.g. the address) is captured as one piece.
        match = re.search(pattern, cleaned_text, re.DOTALL)
        if match:
            info[key] = match.group(1).strip().replace("\n", "")
    return info


class IdentityCardExtractor(Extractor):
    """Regex-based field extraction from OCR text of an identity card.

    Both public methods return a dict that may contain the keys
    ``issuingAuthority``, ``validTime``, ``name``, ``gender``, ``ethnicity``,
    ``dateOfBirth``, ``address`` and ``idNumber``; a key is present only when
    its pattern matched. The input is first passed through
    ``self.remove_blank_lines`` (not defined in this class; presumably
    inherited from ``Extractor`` -- TODO confirm).
    """

    def extract_text(self, text: str) -> dict:
        """Extract identity-card fields from ``text``.

        Args:
            text: Raw recognised text.

        Returns:
            Dict of the fields whose pattern matched (see class docstring).
        """
        patterns = {
            # NOTE(review): both label patterns below require a newline right
            # after the label; a label and value on the same line won't match.
            "issuingAuthority": r"签发机关\n(.+?)\n",
            "validTime": r"有效期限\n(.+?)\n",
            # Name: the non-whitespace run on the line directly before a line
            # that starts with 男 or 女.
            "name": r"(\S+)\n(?:男|女)",
            "gender": r"(\S+)民族",
            "ethnicity": r"民族(\S+)",
            "dateOfBirth": r"(\d+年\d+月\d+日)",
            "address": r"住址\s*(.*?)公民身份号码",
            # 18 digits, or 17 digits followed by X/x.
            "idNumber": r"(\d{18}|\d{17}[Xx])",
        }
        return _search_fields(self.remove_blank_lines(text), patterns)

    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract identity-card fields using the Paddle-specific patterns.

        Differs from ``extract_text`` in three patterns: the name is taken
        from the text after the ``姓名`` label, the gender is the single
        character directly before ``民族``, and the address label may be
        either ``住址`` or the misread ``佳址``.

        Args:
            text: Raw recognised text.

        Returns:
            Dict of the fields whose pattern matched (see class docstring).
        """
        patterns = {
            # NOTE(review): both label patterns below require a newline right
            # after the label; a label and value on the same line won't match.
            "issuingAuthority": r"签发机关\n(.+?)\n",
            "validTime": r"有效期限\n(.+?)\n",
            "name": r"姓名(.*?)\n",
            "gender": r"(\S)民族",
            "ethnicity": r"民族(\S+)",
            "dateOfBirth": r"(\d+年\d+月\d+日)",
            # The two labels must be grouped. Ungrouped, "|" split the whole
            # pattern, so a bare "住址" matched with no capture and
            # match.group(1) was None, crashing on .strip().
            "address": r"(?:住址|佳址)\s*(.*?)公民身份号码",
            # 18 digits, or 17 digits followed by X/x.
            "idNumber": r"(\d{18}|\d{17}[Xx])",
        }
        return _search_fields(self.remove_blank_lines(text), patterns)
class InvoiceExtractor(Extractor):
    """Extractor stub; presumably meant for invoice text (not implemented)."""

    def extract_text(self, text: str) -> dict:
        """Do nothing with ``text`` and return ``None``.

        The method is an unimplemented placeholder: despite the ``dict``
        return annotation, no fields are extracted.
        """
        return None
# text = """中华人民共和国
# 居民身份证
# 签发机关合肥市公安局庐阳分局
# 有效期限
# 2022.07.10-长期
# 姓名陈玉振
# 性别男
# 民族汉
# 出生1973年6月19日
# 佳址安徽省合肥市庐阳区固镇
# 路3150号森林城A6地块1
# 幢2001室
# 公民身份号码
# 440203197306192118"""
# extractor = IdentityCardExtractor()
# jsonstring = extractor.extract_textbyPaddle(text)
# print(jsonstring)