from abc import ABC, abstractmethod
import re


class Extractor:
    """Base class for identity-card / invoice information extractors.

    Declares the two extraction entry points and provides small shared
    helpers for cleaning text and decoding fields from an ID number.

    NOTE(review): this class does not inherit from ``ABC``, so the
    ``@abstractmethod`` markers below are not enforced at instantiation
    time. Deriving from ``ABC`` would make ``Extractor()`` and any subclass
    that overrides only one of the two methods fail to instantiate, so
    confirm with callers before changing it.
    """

    @abstractmethod
    def extract_text(self, text: str) -> dict:
        """Extract identity card or invoice information into a dictionary."""

    @abstractmethod
    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract identity card or invoice information into a dictionary by paddle."""

    def remove_blank_lines(self, text: str) -> str:
        """Return *text* with empty and whitespace-only lines removed.

        The text is split with ``str.splitlines()`` and the remaining lines
        are re-joined with a single newline character, so any trailing
        newline in the input is not preserved.
        """
        # Keep only lines that still contain something after stripping.
        lines = [line for line in text.splitlines() if line.strip()]
        return "\n".join(lines)

    def extract_birthday_from_id(self, id_number: str) -> str:
        """Format the birthday embedded in an 18-character ID number.

        Characters 6-9 are used as the year, 10-11 as the month and 12-13
        as the day (0-based). Month and day are converted with ``int`` so
        leading zeros are dropped; the year is emitted as-is.

        Returns:
            A string such as ``"1990年3月7日"``, or ``""`` when the input is
            not 18 characters long or its month/day part is not numeric.
        """
        # Only the 18-character format is handled.
        if len(id_number) != 18:
            return ""

        year = id_number[6:10]
        month = id_number[10:12]
        day = id_number[12:14]
        try:
            return f"{year}年{int(month)}月{int(day)}日"
        except ValueError:
            # Non-numeric month/day: treat like any other unusable input
            # instead of propagating the conversion error.
            return ""

    def get_gender_from_id(self, id_num: str) -> str:
        """Return ``'男'`` or ``'女'`` based on the second-to-last character.

        The second-to-last character of *id_num* is converted with ``int``;
        an odd value yields ``'男'``, an even value yields ``'女'``.

        The input is assumed to be a valid ID number: a string shorter than
        two characters raises ``IndexError`` and a non-numeric character
        raises ``ValueError``.
        """
        gender = '男' if int(id_num[-2]) % 2 == 1 else '女'
        return gender


# --- Legacy regex-based implementation, kept commented out for reference ---
#
# def remove_blank_lines(text:str)->str:
#     # Split the string into lines with splitlines() and drop blank lines
#     lines = [line for line in text.splitlines() if line.strip()]
#     # Re-join the remaining lines into a single string with join()
#     cleaned_text = '\n'.join(lines)
#     return cleaned_text
#
# def extract_id_card_info(text:str)->dict:
#     patterns = {
#         "issuingAuthority": r"签发机关\n(.+?)\n",
#         "validTime": r"有效期限\n(.+?)\n",
#         "name": r"(\S+)\n(?:男|女)",
#         "gender": r"(\S+)民族",
#         "ethnicity": r"民族(\S+)",
#         "dateOfBirth": r"(\d+年\d+月\d+日)",
#         "address": r"住址\s*(.*?)公民身份号码",
#         "idNumber": r"公民身份号码\n(\d+)"
#     }
#     tempText = remove_blank_lines(text)
#     # Extract the information
#     info = {}
#     for key, pattern in patterns.items():
#         match = re.search(pattern, tempText,re.DOTALL)
#         if match:
#             tempStr = match.group(1).strip()
#             info[key] = tempStr.replace("\n", "")
#     return info
#
# def recongize_id_card(filePath:str):
#     loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG")
#     #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/032002200511-91445598.pdf")
#     #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/fapiao.jpg")
#     docs = loader.load()
#     context = "\n".join([doc.page_content for doc in docs])
#     print(context)