2024-04-22 10:02:06 +08:00
|
|
|
import re
|
|
|
|
|
from extractor import Extractor
|
2024-04-22 17:12:21 +08:00
|
|
|
from configs.basic_config import logger
|
2024-04-22 10:02:06 +08:00
|
|
|
|
|
|
|
|
class IdentityCardExtractor(Extractor):
    """Extract structured fields from OCR text of a Chinese resident ID card.

    Two entry points are provided:

    * ``extract_text`` applies one regular expression per field to the
      text after it has been passed through ``self.remove_blank_lines``.
    * ``extract_textbyPaddle`` works on the raw text and uses a chain of
      fallback patterns per field to cope with mis-recognised or
      re-ordered labels.

    Both return a ``dict`` keyed by ``issuingAuthority``, ``validTime``,
    ``name``, ``gender``, ``ethnicity``, ``dateOfBirth``, ``address`` and
    ``idNumber``, and both return ``{}`` if any exception is raised.

    NOTE(review): ``remove_blank_lines``, ``extract_birthday_from_id`` and
    ``get_gender_from_id`` are not defined in this class; presumably they
    are inherited from ``Extractor`` -- confirm.
    """

    # Field name -> (pattern, index of the capture group holding the value).
    # Insertion order is the key order of the dict returned by extract_text.
    _TEXT_PATTERNS = {
        "issuingAuthority": (r"签发机关\n(.+?)\n", 1),
        "validTime": (r"有效期限\n(.+?)\n", 1),
        "name": (r"(\S+)\n(?:男|女)", 1),
        "gender": (r"(\S+)民族", 1),
        "ethnicity": (r"民族(\S+)", 1),
        "dateOfBirth": (r"(\d+年\d+月\d+日)", 1),
        # Group 1 is the label ("住址" or its variant "佳址"); the value is
        # in group 2.
        "address": (r"(住址|佳址)\s*(.*?)公民身份号码", 2),
        "idNumber": (r"(\d{18}|\d{17}[Xx])", 1),
    }

    # Strings that mark a captured "name" as really being the gender field.
    _GENDER_FIELDS = ("性别男", "性别女")

    # Name fallbacks tried in order when no "姓名" label is found:
    # (pattern, capture group holding the name).
    _PADDLE_NAME_FALLBACKS = (
        (r"米名(\S*)址\s*(\S+)", 2),
        (r"名\s*(\S+)\s*姓\s*", 1),
        (r"(\S+)\s*(男|女|性别)", 1),
    )

    # Address fallbacks tried in order when no address label is found.  Each
    # anchors on an administrative-division suffix and runs up to the
    # ID-number label.
    _PADDLE_ADDRESS_FALLBACKS = (
        r"(\S+省)(.*?)公民身份号码",
        r"(\S+市)(.*?)公民身份号码",
        r"(\S+县)(.*?)公民身份号码",
    )

    def extract_text(self, text: str) -> dict:
        """Extract ID-card fields with one regular expression per field.

        Args:
            text: OCR output; it is first passed through
                ``self.remove_blank_lines``.

        Returns:
            A dict containing only the fields whose pattern matched, with
            surrounding whitespace stripped and newlines removed.  ``{}`` is
            returned if any exception occurs; the exception is printed and
            logged.
        """
        try:
            cleaned = self.remove_blank_lines(text)
            info = {}
            for key, (pattern, group) in self._TEXT_PATTERNS.items():
                match = re.search(pattern, cleaned, re.DOTALL)
                if match:
                    info[key] = match.group(group).strip().replace("\n", "")
            return info
        except Exception as e:
            print(e)
            logger.error(e)
            return {}

    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract ID-card fields from raw OCR text using fallback patterns.

        Args:
            text: OCR output, used as-is (no blank-line removal).

        Returns:
            A dict with all eight field keys present; a field that could not
            be found is an empty string.  ``dateOfBirth`` and ``gender`` are
            not read from the text: when an ID number is found they are
            obtained from ``self.extract_birthday_from_id`` and
            ``self.get_gender_from_id``.  ``{}`` is returned if any
            exception occurs; the exception is printed and logged.
        """
        try:
            result = {
                "issuingAuthority": "",
                "validTime": "",
                "name": "",
                "gender": "",
                "ethnicity": "",
                "dateOfBirth": "",
                "address": "",
                "idNumber": "",
            }

            # Issuing authority ("签发机美" is an accepted label variant).
            issuing_authority = re.search(
                r"(签发机关|签发机美)\n*(.+?)\n", text, re.DOTALL
            )
            if issuing_authority:
                result["issuingAuthority"] = issuing_authority.group(2).strip()

            # Validity period, e.g. "2022.07.10-长期".
            valid_time = re.search(
                r"有效期限\n*(\d{4}\.\d{2}\.\d{2}-\S+)", text, re.DOTALL
            )
            if valid_time:
                result["validTime"] = valid_time.group(1).strip()

            result["name"] = self._paddle_name(text)

            # Ethnicity ("民旅" is an accepted label variant).
            ethnicity = re.search(r"民\s*(族|旅)\s*(\S+)", text, re.DOTALL)
            if ethnicity:
                result["ethnicity"] = ethnicity.group(2).strip()

            result["address"] = self._paddle_address(text)

            # ID number: 18 digits, or 17 digits followed by X/x.
            id_number = re.search(r"(\d{18}|\d{17}[Xx])", text, re.DOTALL)
            if id_number:
                result["idNumber"] = id_number.group(1).strip()

            if result["idNumber"]:
                result["dateOfBirth"] = self.extract_birthday_from_id(
                    result["idNumber"]
                )
                result["gender"] = self.get_gender_from_id(result["idNumber"])

            return result
        except Exception as e:
            print(e)
            logger.error(e)
            return {}

    def _paddle_name(self, text: str) -> str:
        """Return the holder's name found in *text*, or ``""`` if none is found."""
        labelled = re.search(r"姓名\s*(.*?)\n", text, re.DOTALL)
        if labelled:
            candidate = labelled.group(1).strip()
            # Substring test in both directions: an empty or partial capture
            # such as "性别" also counts as the gender field.
            looks_like_gender = any(
                candidate in field or field in candidate
                for field in self._GENDER_FIELDS
            )
            if not looks_like_gender:
                return candidate
            # The label was followed by the gender field, so take the token
            # that precedes the label instead.  The match is checked for
            # None so that a miss leaves the name empty rather than raising
            # and discarding every other field.
            preceding = re.search(r"(\S+)\s*姓名", text, re.DOTALL)
            return preceding.group(1).strip() if preceding else ""

        for pattern, group in self._PADDLE_NAME_FALLBACKS:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                return match.group(group).strip()
        return ""

    def _paddle_address(self, text: str) -> str:
        """Return the address found in *text*, or ``""`` if none is found."""
        address = ""
        labelled = re.search(r"(住址|佳址)(.*?)公民身份号码", text, re.DOTALL)
        if labelled:
            address = labelled.group(2).strip().replace("\n", "")
        else:
            for pattern in self._PADDLE_ADDRESS_FALLBACKS:
                match = re.search(pattern, text, re.DOTALL)
                if match:
                    address = (
                        match.group(1).strip().replace("\n", "")
                        + match.group(2).strip().replace("\n", "")
                    )
                    break

        if address:
            # Remove every ASCII uppercase letter from the address.
            # NOTE(review): this also strips letters that belong to a real
            # address (e.g. a building code such as "A6") -- confirm intended.
            address = re.sub(r"[A-Z]", "", address)
        return address
|
2024-04-22 10:02:06 +08:00
|
|
|
|
2024-04-30 09:06:28 +08:00
|
|
|
class InvoiceExtractor(Extractor):
    """Placeholder extractor for invoices; no extraction logic exists yet."""

    def extract_text(self, text: str) -> dict:
        """Return ``None`` for every input (not implemented).

        NOTE(review): the ``dict`` return annotation is not honoured by this
        stub -- callers receive ``None`` until extraction is implemented.
        """
        return None
|
2024-04-22 10:02:06 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# text = """中华人民共和国
|
|
|
|
|
# 居民身份证
|
|
|
|
|
# 签发机关合肥市公安局庐阳分局
|
|
|
|
|
# 有效期限
|
|
|
|
|
# 2022.07.10-长期
|
|
|
|
|
# 姓名陈玉振
|
|
|
|
|
# 性别男
|
|
|
|
|
# 民族汉
|
|
|
|
|
# 出生1973年6月19日
|
|
|
|
|
# 佳址安徽省合肥市庐阳区固镇
|
|
|
|
|
# 路3150号森林城A6地块1
|
|
|
|
|
# 幢2001室
|
|
|
|
|
# 公民身份号码
|
|
|
|
|
# 440203197306192118"""
|
2024-04-24 08:41:51 +08:00
|
|
|
# text = """
|
|
|
|
|
# 中华人民共和国
|
|
|
|
|
# 居民身份证
|
|
|
|
|
# oo
|
|
|
|
|
# 签发机关宿州市公安局桥分局
|
|
|
|
|
# 有效期限2023.01.18-2043.01.18
|
|
|
|
|
# 姓名郭乾坤
|
|
|
|
|
# 性别男民族汉
|
|
|
|
|
# 出生1994年10月17日
|
|
|
|
|
# 住址 安徽省宿州市场桥区朱仙
|
|
|
|
|
# 庄镇郭庙村郭家组6号
|
|
|
|
|
# 公民身份号码
|
|
|
|
|
# 34220119941017327X
|
|
|
|
|
# """
|
2024-04-30 09:06:28 +08:00
|
|
|
|
|
|
|
|
# text = """中华人民共和国
|
|
|
|
|
# 居民身份证
|
|
|
|
|
# 签发机关
|
|
|
|
|
# 盐源县公安局
|
|
|
|
|
# 出
|
|
|
|
|
# 有效期限
|
|
|
|
|
# 2013.06.18-2033.06.18
|
|
|
|
|
# 坐姓5性中出住
|
|
|
|
|
# 米名年别到生委址
|
|
|
|
|
# 江六斤
|
|
|
|
|
# 男民族彝
|
|
|
|
|
# 半口
|
|
|
|
|
# 1980
|
|
|
|
|
# 年4月20日
|
|
|
|
|
# 四川省盐源县盖租乡阿石
|
|
|
|
|
# 村6组4号
|
|
|
|
|
# 公民身份号码
|
|
|
|
|
# 513423198004203995
|
|
|
|
|
# """
|
|
|
|
|
#
|
|
|
|
|
# text = """姓名韩邀宇
|
|
|
|
|
# 性别男
|
|
|
|
|
# 民族汉
|
|
|
|
|
# 出生
|
|
|
|
|
# 1999年12月15日
|
|
|
|
|
# 住址
|
|
|
|
|
# 安徽省太和县税镇镇十里
|
|
|
|
|
# 沟村委会中韩村16号
|
|
|
|
|
# 公民身份号码
|
|
|
|
|
# 34122219991215183X"""
|
|
|
|
|
#
|
2024-04-24 08:41:51 +08:00
|
|
|
#
|
2024-04-30 09:06:28 +08:00
|
|
|
# text = """中华人民共和国
|
|
|
|
|
# 居民身份证
|
|
|
|
|
# 签发机关
|
|
|
|
|
# 木里县公安局
|
|
|
|
|
# 有效期限
|
|
|
|
|
# 2016.12.05-2026.12.05
|
|
|
|
|
# 姓性出住
|
|
|
|
|
# 马加加
|
|
|
|
|
# 女
|
|
|
|
|
# 民族彝
|
|
|
|
|
# 1998年1月2日
|
|
|
|
|
# 址
|
|
|
|
|
# 四川省木里藏族自治县博
|
|
|
|
|
# 科乡八科村麻窝地组18号
|
|
|
|
|
# 公民身份号码
|
|
|
|
|
# 513422199801023821"""
|
|
|
|
|
|
|
|
|
|
# text = """
|
|
|
|
|
# 毛阿卡
|
|
|
|
|
# 姓名
|
|
|
|
|
# 性别男
|
|
|
|
|
# 民族彝
|
|
|
|
|
# 3
|
|
|
|
|
# 出生
|
|
|
|
|
# 1976年3月3日
|
|
|
|
|
# 住址
|
|
|
|
|
# 四川省木里藏族自治县耗
|
|
|
|
|
# 牛坪乡泥珠村下泥珠组55
|
|
|
|
|
# 号
|
|
|
|
|
# 中国LHINA
|
|
|
|
|
# 公民身份号码
|
|
|
|
|
# 513423197603033997
|
|
|
|
|
# 中华人民共和国
|
|
|
|
|
# 居民身份证
|
|
|
|
|
# 签发机关
|
|
|
|
|
# 木里县公安局
|
|
|
|
|
# 有效期限
|
|
|
|
|
# 2017.06.06-2037.06.06
|
|
|
|
|
# """
|
|
|
|
|
# text = """中华人民共和国
|
|
|
|
|
# 居民身份证
|
|
|
|
|
# 国
|
|
|
|
|
# 签发机关
|
|
|
|
|
# 西昌市公安局
|
|
|
|
|
# 有效期限
|
|
|
|
|
# 2008.09.01-2028.09.01
|
|
|
|
|
# 逆姓5性中出日住
|
|
|
|
|
# 米名丰别刺生更址
|
|
|
|
|
# 祝九根惹
|
|
|
|
|
# 心
|
|
|
|
|
# 民
|
|
|
|
|
# 族彝
|
|
|
|
|
# 男
|
|
|
|
|
# E
|
|
|
|
|
# 1978
|
|
|
|
|
# 四川省西昌市磨盘乡大厂
|
|
|
|
|
# 村4组27号
|
|
|
|
|
# 公民身份号码
|
|
|
|
|
# 513401197807087411
|
|
|
|
|
# """
|
|
|
|
|
# text = """中华人民共和国
|
|
|
|
|
# 居民身份证
|
|
|
|
|
# 签发机关
|
|
|
|
|
# 木里县公安局
|
|
|
|
|
# 有效期限
|
|
|
|
|
# 2020.03.16-2025.03.16
|
|
|
|
|
# 名
|
|
|
|
|
# 蒋子古
|
|
|
|
|
# 姓
|
|
|
|
|
# 男
|
|
|
|
|
# 民族彝
|
|
|
|
|
# 出生
|
|
|
|
|
# 2005年1月4日
|
|
|
|
|
# 住址
|
|
|
|
|
# 四川省木里藏族自治县耗
|
|
|
|
|
# 牛坪乡泥珠村5组29号
|
|
|
|
|
# 公民身份号码
|
|
|
|
|
# 513422200501044415
|
|
|
|
|
# """
|
2024-04-30 10:43:16 +08:00
|
|
|
#
|
|
|
|
|
# text = """姓名苏龙格德·胡尔查巴特尔
|
|
|
|
|
# 性别男民族蒙古
|
|
|
|
|
# 出生1973年10支月27日
|
|
|
|
|
# 内蒙古赤峰市巴林右旗沙布
|
|
|
|
|
# 台苏木树中嘎查
|
|
|
|
|
# 中华人民共和国
|
|
|
|
|
# 居民身份证
|
|
|
|
|
# 签发机关巴林右旗公安局
|
|
|
|
|
# 有效期限2004.10.27-2024.10.26"""
|
2024-05-06 10:44:19 +08:00
|
|
|
# text ="""唐昌梅
|
|
|
|
|
# 姓名
|
|
|
|
|
# 性别女民族苗
|
|
|
|
|
# 出生1975年8月15日
|
|
|
|
|
# 住址
|
|
|
|
|
# 湖南省常德市鼎城区双桥
|
|
|
|
|
# 坪镇全家坪村3组
|
|
|
|
|
# 公民身份号码
|
|
|
|
|
# 433030197508150820"""
|
2024-04-30 09:06:28 +08:00
|
|
|
# extractor = IdentityCardExtractor()
|
2024-04-24 08:41:51 +08:00
|
|
|
# jsonstring = extractor.extract_textbyPaddle(text)
|
2024-04-30 09:06:28 +08:00
|
|
|
# print(jsonstring)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# text = """
|
|
|
|
|
# 姓名张三
|
|
|
|
|
# 性别男
|
|
|
|
|
# 民族汉
|
|
|
|
|
# # """
|
|
|
|
|
# text = """
|
|
|
|
|
# 江六斤
|
|
|
|
|
# 女民族彝
|
|
|
|
|
# """
|
|
|
|
|
# # 尝试使用 r"性别(\S)" 匹配性别
|
|
|
|
|
# gender_match = re.search(r'性别(\S)|(\S)民族', text)
|
|
|
|
|
# if gender_match:
|
|
|
|
|
# gender = gender_match.group(0)
|
|
|
|
|
# print("性别是:", gender)
|
|
|
|
|
# # else:
|
|
|
|
|
# # # 如果匹配不到,尝试使用 r"(\S)民族" 匹配
|
|
|
|
|
# # gender_match = re.search(r'(\S)民族', text)
|
|
|
|
|
# # if gender_match:
|
|
|
|
|
# # gender = gender_match.group(1)
|
|
|
|
|
# # print("性别是:", gender)
|
|
|
|
|
# else:
|
|
|
|
|
# print("未找到性别信息")
|
|
|
|
|
|