73 lines
2.2 KiB
Python
73 lines
2.2 KiB
Python
import re
|
|
from extractor import Extractor
|
|
import json
|
|
|
|
class IdentityCardExtractor(Extractor):
    """Pull structured fields out of OCR text from a Chinese resident ID card.

    Two entry points are provided, one per OCR text layout. Both return a
    dict that contains only the fields whose pattern matched; a field that
    cannot be found is simply absent from the result.
    """

    def extract_text(self, text: str) -> dict:
        """Extract ID-card fields from OCR text.

        Args:
            text: Raw OCR output, one recognised text line per line.

        Returns:
            A dict mapping field names (``issuingAuthority``, ``validTime``,
            ``name``, ``gender``, ``ethnicity``, ``dateOfBirth``, ``address``,
            ``idNumber``) to the matched text. Unmatched fields are omitted.
        """
        patterns = {
            # Label "issuing authority" on its own line, value on the next.
            "issuingAuthority": r"签发机关\n(.+?)\n",
            # Label "validity period" on its own line, value on the next.
            "validTime": r"有效期限\n(.+?)\n",
            # Last non-space run before a line starting with male/female.
            "name": r"(\S+)\n(?:男|女)",
            # Non-space run immediately before the "ethnicity" label.
            "gender": r"(\S+)民族",
            # Non-space run immediately after the "ethnicity" label.
            "ethnicity": r"民族(\S+)",
            # Date written as <digits>年<digits>月<digits>日.
            "dateOfBirth": r"(\d+年\d+月\d+日)",
            # Everything between the "address" and "citizen ID number" labels.
            "address": r"住址\s*(.*?)公民身份号码",
            # 18 digits, or 17 digits followed by an X check character.
            "idNumber": r"(\d{18}|\d{17}[Xx])",
        }
        return self._extract_fields(text, patterns)

    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract ID-card fields from PaddleOCR-style text.

        Differs from :meth:`extract_text` in the ``name``, ``gender`` and
        ``address`` patterns only.

        Args:
            text: Raw OCR output, one recognised text line per line.

        Returns:
            A dict with the same keys as :meth:`extract_text`. Unmatched
            fields are omitted.
        """
        patterns = {
            "issuingAuthority": r"签发机关\n(.+?)\n",
            "validTime": r"有效期限\n(.+?)\n",
            # Here the name follows the "name" label on the same line.
            "name": r"姓名(.*?)\n",
            # Exactly one character before the "ethnicity" label.
            "gender": r"(\S)民族",
            "ethnicity": r"民族(\S+)",
            "dateOfBirth": r"(\d+年\d+月\d+日)",
            # The label may be read correctly as 住址 or misread as 佳址.
            # The alternatives are wrapped in a non-capturing group because
            # "|" binds loosest: without it, a bare 住址 matched on its own
            # and left group 1 unset, so .strip() was called on None.
            "address": r"(?:住址|佳址)\s*(.*?)公民身份号码",
            "idNumber": r"(\d{18}|\d{17}[Xx])",
        }
        return self._extract_fields(text, patterns)

    def _extract_fields(self, text: str, patterns: dict) -> dict:
        """Apply each pattern to the blank-line-free text and collect group 1.

        Args:
            text: Raw OCR output.
            patterns: Mapping of field name to a regex whose first capture
                group holds the field value.

        Returns:
            A dict of field name to cleaned value, in ``patterns`` order,
            containing only the fields that matched.
        """
        # remove_blank_lines is not defined in this class; presumably it is
        # inherited from Extractor -- verify against the base class.
        cleaned_text = self.remove_blank_lines(text)

        # Extract the information.
        info = {}
        for key, pattern in patterns.items():
            # DOTALL lets "." cross line breaks so multi-line values
            # (e.g. a wrapped address) are captured in one match.
            match = re.search(pattern, cleaned_text, re.DOTALL)
            if match:
                # Join a value that the OCR split across several lines.
                info[key] = match.group(1).strip().replace("\n", "")
        return info
|
|
|
|
|
|
class InvoiceExtractor(Extractor):
    """Placeholder extractor for invoices; no extraction logic exists yet."""

    def extract_text(self, text: str) -> dict:
        """Do nothing with ``text`` and return ``None``.

        NOTE(review): the signature advertises a dict, but this stub
        currently yields ``None``; callers must not rely on a dict here.
        """
        return None
|
|
|
|
|
|
# text = """中华人民共和国
|
|
# 居民身份证
|
|
# 签发机关合肥市公安局庐阳分局
|
|
# 有效期限
|
|
# 2022.07.10-长期
|
|
# 姓名陈玉振
|
|
# 性别男
|
|
# 民族汉
|
|
# 出生1973年6月19日
|
|
# 佳址安徽省合肥市庐阳区固镇
|
|
# 路3150号森林城A6地块1
|
|
# 幢2001室
|
|
# 公民身份号码
|
|
# 440203197306192118"""
|
|
|
|
# extractor = IdentityCardExtractor()
|
|
|
|
# jsonstring = extractor.extract_textbyPaddle(text)
|
|
# print(jsonstring) |