67 lines
2.7 KiB
Python
67 lines
2.7 KiB
Python
from abc import ABC, abstractmethod
|
|
import re
|
|
class Extractor:
    """Base class for extractors that turn text into a dictionary of fields.

    Subclasses are expected to override ``extract_text`` and
    ``extract_textbyPaddle``; the remaining methods are shared helpers for
    cleaning text and decoding fields from an identity-card number.

    NOTE(review): this class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` markers below are not enforced when the class is
    instantiated. Switching the base to ``ABC`` would reject any subclass
    that overrides only one of the two methods -- confirm against the
    existing subclasses before changing it.
    """

    @abstractmethod
    def extract_text(self, text: str) -> dict:
        """Extract identity-card or invoice information into a dictionary."""

    @abstractmethod
    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract identity-card or invoice information into a dictionary.

        Paddle variant of ``extract_text``; presumably fed with PaddleOCR
        output -- TODO confirm against the callers.
        """

    def remove_blank_lines(self, text: str) -> str:
        """Return *text* with empty and whitespace-only lines removed.

        The remaining lines are joined with ``"\\n"``; a trailing newline
        in the input is therefore not preserved.
        """
        return "\n".join(line for line in text.splitlines() if line.strip())

    def extract_birthday_from_id(self, id_number: str) -> str:
        """Return the birth date encoded in an 18-character ID number.

        Characters at index 6-13 are read as ``YYYYMMDD``.

        Args:
            id_number: The identity-card number as a string.

        Returns:
            The date formatted as ``"<year>年<month>月<day>日"`` with month and
            day stripped of leading zeros, or ``""`` when ``id_number`` is
            not exactly 18 characters long or its date segment is not
            numeric.
        """
        if len(id_number) != 18:
            return ""

        birth = id_number[6:14]
        # Input may be noisy; int() below would raise ValueError on a
        # non-numeric segment, so treat that the same as a wrong length.
        if not birth.isdecimal():
            return ""

        year, month, day = birth[:4], birth[4:6], birth[6:]
        return f"{year}年{int(month)}月{int(day)}日"

    def get_gender_from_id(self, id_num: str) -> str:
        """Return the gender encoded in the second-to-last ID character.

        Args:
            id_num: The identity-card number as a string.

        Returns:
            ``'男'`` when the second-to-last character is an odd digit,
            ``'女'`` when it is an even digit, or ``""`` when the input is
            too short or that character is not a digit.
        """
        # Guard the index and the int() conversion so malformed input yields
        # "" (matching extract_birthday_from_id) instead of raising.
        if len(id_num) < 2:
            return ""

        marker = id_num[-2]
        if not marker.isdecimal():
            return ""

        return '男' if int(marker) % 2 == 1 else '女'
|
|
|
|
# def remove_blank_lines(text:str)->str:
|
|
# # 使用splitlines()方法将字符串按行分割成列表,并去除空行
|
|
# lines = [line for line in text.splitlines() if line.strip()]
|
|
# # 使用join()方法将列表中的行重新连接成字符串
|
|
# cleaned_text = '\n'.join(lines)
|
|
# return cleaned_text
|
|
|
|
# def extract_id_card_info(text:str)->dict:
|
|
# patterns = {
|
|
# "issuingAuthority": r"签发机关\n(.+?)\n",
|
|
# "validTime": r"有效期限\n(.+?)\n",
|
|
# "name": r"(\S+)\n(?:男|女)",
|
|
# "gender": r"(\S+)民族",
|
|
# "ethnicity": r"民族(\S+)",
|
|
# "dateOfBirth": r"(\d+年\d+月\d+日)",
|
|
# "address": r"住址\s*(.*?)公民身份号码",
|
|
# "idNumber": r"公民身份号码\n(\d+)"
|
|
# }
|
|
# tempText = remove_blank_lines(text)
|
|
# # 提取信息
|
|
# info = {}
|
|
# for key, pattern in patterns.items():
|
|
# match = re.search(pattern, tempText,re.DOTALL)
|
|
# if match:
|
|
# tempStr = match.group(1).strip()
|
|
# info[key] = tempStr.replace("\n", "")
|
|
# return info
|
|
|
|
# def recongize_id_card(filePath:str):
|
|
# loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG")
|
|
# #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/032002200511-91445598.pdf")
|
|
# #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/fapiao.jpg")
|
|
|
|
# docs = loader.load()
|
|
# context = "\n".join([doc.page_content for doc in docs])
|
|
# print(context) |