102 lines
3.2 KiB
Python
102 lines
3.2 KiB
Python
import re
|
|
from extractor import Extractor
|
|
from configs.basic_config import logger
|
|
|
|
class IdentityCardExtractor(Extractor):
    """Extract structured fields from OCR text of a Chinese resident identity card.

    Two public entry points are offered, each with its own regex table:

    * ``extract_text`` locates the name as the line directly above a line that
      starts with 男/女, and the issuing authority / validity period on the
      line *after* their labels.
    * ``extract_textbyPaddle`` locates the name after the ``姓名`` label and
      allows the issuing authority / validity period on the same line as
      their labels (presumably the layout produced by PaddleOCR -- confirm).

    Both return a dict that may contain the keys ``issuingAuthority``,
    ``validTime``, ``name``, ``gender``, ``ethnicity``, ``dateOfBirth``,
    ``address`` and ``idNumber``. A key is present only if its pattern matched.
    """

    # All patterns are compiled with re.DOTALL so that the address, which OCR
    # may wrap over several lines, can be captured across line breaks.
    # Fields that end at a line break also accept end-of-text, so a field that
    # sits on the very last line (no trailing newline) is not lost.
    _TEXT_PATTERNS = {
        "issuingAuthority": re.compile(r"签发机关\n(.+?)(?:\n|$)", re.DOTALL),
        "validTime": re.compile(r"有效期限\n(.+?)(?:\n|$)", re.DOTALL),
        # The name is the non-whitespace run on the line above the gender line.
        "name": re.compile(r"(\S+)\n(?:男|女)", re.DOTALL),
        # NOTE(review): captures every non-whitespace character directly before
        # 民族, so this assumes the gender character starts its line -- confirm
        # against real OCR output.
        "gender": re.compile(r"(\S+)民族", re.DOTALL),
        "ethnicity": re.compile(r"民族(\S+)", re.DOTALL),
        "dateOfBirth": re.compile(r"(\d+年\d+月\d+日)", re.DOTALL),
        # 佳址 is accepted next to 住址 (presumably a common OCR misread).
        "address": re.compile(r"(?:住址|佳址)\s*(.*?)公民身份号码", re.DOTALL),
        "idNumber": re.compile(r"(\d{18}|\d{17}[Xx])", re.DOTALL),
    }

    _PADDLE_PATTERNS = {
        "issuingAuthority": re.compile(r"签发机关\n*(.+?)(?:\n|$)", re.DOTALL),
        "validTime": re.compile(r"有效期限\n*(.+?)(?:\n|$)", re.DOTALL),
        "name": re.compile(r"姓名(.*?)(?:\n|$)", re.DOTALL),
        # Optional whitespace lets this match both "性别男民族汉" on one line
        # and "性别男" / "民族汉" split over two lines.
        "gender": re.compile(r"(\S)\s*民族", re.DOTALL),
        "ethnicity": re.compile(r"民族(\S+)", re.DOTALL),
        "dateOfBirth": re.compile(r"(\d+年\d+月\d+日)", re.DOTALL),
        "address": re.compile(r"(?:住址|佳址)\s*(.*?)公民身份号码", re.DOTALL),
        "idNumber": re.compile(r"(\d{18}|\d{17}[Xx])", re.DOTALL),
    }

    def _extract_fields(self, text, patterns) -> dict:
        """Run every pattern in ``patterns`` against ``text`` and collect the hits.

        Args:
            text: Raw OCR text.
            patterns: Mapping of field name to a compiled regex whose first
                capture group holds the field value.

        Returns:
            Mapping of field name to the captured value with surrounding
            whitespace stripped and embedded newlines removed. Fields whose
            pattern does not match are omitted. An empty dict is returned if
            any exception is raised while processing.
        """
        try:
            # remove_blank_lines is not defined in this class; presumably it is
            # inherited from Extractor.
            cleaned = self.remove_blank_lines(text)

            # Extract the information.
            info = {}
            for key, pattern in patterns.items():
                match = pattern.search(cleaned)
                if match:
                    info[key] = match.group(1).strip().replace("\n", "")
            return info
        except Exception as e:
            # Deliberate best-effort boundary: log the failure and hand the
            # caller an empty result instead of propagating.
            logger.error(e)
            return {}

    def extract_text(self, text: str) -> dict:
        """Extract identity-card fields from OCR text.

        Args:
            text: Raw OCR text of the card.

        Returns:
            Dict of the fields that could be found; ``{}`` on any error.
        """
        return self._extract_fields(text, self._TEXT_PATTERNS)

    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract identity-card fields from label-prefixed OCR text.

        Args:
            text: Raw OCR text of the card, with labels such as ``姓名``
                preceding their values.

        Returns:
            Dict of the fields that could be found; ``{}`` on any error.
        """
        return self._extract_fields(text, self._PADDLE_PATTERNS)
|
|
|
|
|
|
class InvoiceExtractor(Extractor):
    """Placeholder extractor for invoices; no extraction logic exists yet."""

    def extract_text(self, text: str) -> dict:
        """Stub: does nothing with ``text`` and currently returns ``None``.

        NOTE(review): the signature is annotated ``-> dict``, but nothing is
        returned until the extraction is implemented.
        """
        return None
|
|
|
|
|
|
# text = """中华人民共和国
|
|
# 居民身份证
|
|
# 签发机关合肥市公安局庐阳分局
|
|
# 有效期限
|
|
# 2022.07.10-长期
|
|
# 姓名陈玉振
|
|
# 性别男
|
|
# 民族汉
|
|
# 出生1973年6月19日
|
|
# 佳址安徽省合肥市庐阳区固镇
|
|
# 路3150号森林城A6地块1
|
|
# 幢2001室
|
|
# 公民身份号码
|
|
# 440203197306192118"""
|
|
# text = """
|
|
# 中华人民共和国
|
|
# 居民身份证
|
|
# oo
|
|
# 签发机关宿州市公安局桥分局
|
|
# 有效期限2023.01.18-2043.01.18
|
|
# 姓名郭乾坤
|
|
# 性别男民族汉
|
|
# 出生1994年10月17日
|
|
# 住址 安徽省宿州市场桥区朱仙
|
|
# 庄镇郭庙村郭家组6号
|
|
# 公民身份号码
|
|
# 34220119941017327X
|
|
# """
|
|
# extractor = IdentityCardExtractor()
|
|
#
|
|
# jsonstring = extractor.extract_textbyPaddle(text)
|
|
# print(jsonstring) |