import json
import re

from extractor import Extractor

# Field name -> regex for plain OCR output. Group 1 of each pattern holds the
# field value. All patterns are searched with re.DOTALL, so "." also matches
# newlines and multi-line values (e.g. the address) can be captured.
_ID_CARD_PATTERNS = {
    "issuingAuthority": r"签发机关\n(.+?)\n",
    "validTime": r"有效期限\n(.+?)\n",
    "name": r"(\S+)\n(?:男|女)",
    "gender": r"(\S+)民族",
    "ethnicity": r"民族(\S+)",
    "dateOfBirth": r"(\d+年\d+月\d+日)",
    "address": r"住址\s*(.*?)公民身份号码",
    "idNumber": r"(\d{18}|\d{17}[Xx])",
}

# Field name -> regex for PaddleOCR output, where labels and values are often
# glued together on the same line (e.g. "姓名<value>").
_ID_CARD_PADDLE_PATTERNS = {
    # "\s*" instead of a mandatory "\n": the label may be followed by its
    # value on the same line or on the next one.
    "issuingAuthority": r"签发机关\s*(.+?)\n",
    "validTime": r"有效期限\s*(.+?)\n",
    "name": r"姓名(.*?)\n",
    # The gender character may be separated from the "民族" label by a line
    # break, so optional whitespace is allowed between them.
    "gender": r"(\S)\s*民族",
    "ethnicity": r"民族(\S+)",
    "dateOfBirth": r"(\d+年\d+月\d+日)",
    # "佳址" is a common OCR misreading of "住址". The alternation must be
    # grouped: without the (?:...) the pattern means "住址" OR "佳址...",
    # so a correctly recognised "住址" matched with an empty group 1 and
    # crashed on None.strip().
    "address": r"(?:住址|佳址)\s*(.*?)公民身份号码",
    "idNumber": r"(\d{18}|\d{17}[Xx])",
}


def _search_fields(text: str, patterns: dict) -> dict:
    """Search *text* for every pattern and collect the captured values.

    Args:
        text: The text to search.
        patterns: Mapping of field name to a regex with one capture group.

    Returns:
        A dict containing only the fields whose pattern matched. Each value
        is group 1 of the match, stripped, with embedded newlines removed.
    """
    info = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)
        # Skip fields that did not match, or whose capture group did not
        # take part in the match.
        if match is None or match.group(1) is None:
            continue
        info[key] = match.group(1).strip().replace("\n", "")
    return info


class IdentityCardExtractor(Extractor):
    """Extract identity-card fields from OCR text using regular expressions."""

    def extract_text(self, text: str) -> dict:
        """Extract identity-card fields from plain OCR text.

        Args:
            text: Raw OCR text of the card.

        Returns:
            A dict with any of the keys ``issuingAuthority``, ``validTime``,
            ``name``, ``gender``, ``ethnicity``, ``dateOfBirth``, ``address``
            and ``idNumber``; fields that cannot be found are omitted.
        """
        # remove_blank_lines is inherited from Extractor (not visible in this
        # file); presumably it drops empty lines -- verify against the base.
        return _search_fields(self.remove_blank_lines(text), _ID_CARD_PATTERNS)

    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract identity-card fields from PaddleOCR-style text.

        Args:
            text: Raw OCR text of the card.

        Returns:
            A dict with the same keys as :meth:`extract_text`; fields that
            cannot be found are omitted.
        """
        return _search_fields(
            self.remove_blank_lines(text), _ID_CARD_PADDLE_PATTERNS
        )


class InvoiceExtractor(Extractor):
    """Placeholder extractor for invoices."""

    def extract_text(self, text: str) -> dict:
        """Not implemented yet; currently returns ``None``."""
        # TODO: implement invoice field extraction.
        pass


# Example usage (kept commented out so importing the module has no side
# effects):
# text = """中华人民共和国
# 居民身份证
# 签发机关合肥市公安局庐阳分局
# 有效期限
# 2022.07.10-长期
# 姓名陈玉振
# 性别男
# 民族汉
# 出生1973年6月19日
# 佳址安徽省合肥市庐阳区固镇
# 路3150号森林城A6地块1
# 幢2001室
# 公民身份号码
# 440203197306192118"""
# extractor = IdentityCardExtractor()
# jsonstring = extractor.extract_textbyPaddle(text)
# print(jsonstring)