# ComponentDevelopment/OCRPython/extractor/extractor.py
#
# 67 lines
# 2.7 KiB
# Python

from abc import ABC, abstractmethod
import re
class Extractor(ABC):
    """Abstract base for extractors that turn recognized text into a dict.

    Subclasses must implement ``extract_text`` and ``extract_textbyPaddle``.
    The remaining methods are shared helpers for cleaning text and for
    deriving fields from an 18-character citizen ID number.

    NOTE: the class derives from ``ABC`` so that ``@abstractmethod`` is
    actually enforced; instantiating ``Extractor`` directly (or a subclass
    that does not override both abstract methods) raises ``TypeError``.
    """

    @abstractmethod
    def extract_text(self, text: str) -> dict:
        """Extract identity-card or invoice information into a dictionary."""

    @abstractmethod
    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract identity-card or invoice information into a dictionary.

        Variant of ``extract_text`` for text obtained "by paddle"
        (presumably PaddleOCR output -- confirm against the subclasses).
        """

    def remove_blank_lines(self, text):
        """Return ``text`` with empty and whitespace-only lines removed.

        The surviving lines are re-joined with ``"\\n"``; a trailing newline
        in the input is therefore not preserved.
        """
        kept = [line for line in text.splitlines() if line.strip()]
        return '\n'.join(kept)

    def extract_birthday_from_id(self, id_number):
        """Derive the date of birth from an 18-character ID number.

        Characters 7-14 (1-based) hold the birth date as ``YYYYMMDD``.

        Returns:
            The date formatted as ``"<year>年<month>月<day>日"`` with month and
            day stripped of leading zeros, or ``""`` when ``id_number`` is
            empty, is not exactly 18 characters long, or its date field is
            not numeric.
        """
        if not id_number or len(id_number) != 18:
            return ""
        birth = id_number[6:14]
        # Guard against misread characters so int() below cannot raise.
        if not birth.isdecimal():
            return ""
        year, month, day = birth[:4], birth[4:6], birth[6:8]
        # Explicit 年/月/日 separators keep the result unambiguous once the
        # leading zeros are dropped (otherwise 1990-01-11 and 1990-11-01
        # would both render as "1990111").
        return f"{year}年{int(month)}月{int(day)}日"

    def get_gender_from_id(self, id_num):
        """Derive the gender from an ID number.

        The second-to-last character is read as a digit: odd means male
        ("男"), even means female ("女").

        Returns:
            ``"男"`` or ``"女"``, or ``""`` when ``id_num`` is empty, too short,
            or the relevant character is not a digit.
        """
        if not id_num:
            return ""
        try:
            sequence_digit = int(id_num[-2])
        except (IndexError, ValueError):
            # Too short, or the character is not a digit.
            return ""
        return '男' if sequence_digit % 2 == 1 else '女'
# def remove_blank_lines(text:str)->str:
# # 使用splitlines()方法将字符串按行分割成列表,并去除空行
# lines = [line for line in text.splitlines() if line.strip()]
# # 使用join()方法将列表中的行重新连接成字符串
# cleaned_text = '\n'.join(lines)
# return cleaned_text
# def extract_id_card_info(text:str)->dict:
# patterns = {
# "issuingAuthority": r"签发机关\n(.+?)\n",
# "validTime": r"有效期限\n(.+?)\n",
# "name": r"(\S+)\n(?:男|女)",
# "gender": r"(\S+)民族",
# "ethnicity": r"民族(\S+)",
# "dateOfBirth": r"(\d+年\d+月\d+日)",
# "address": r"住址\s*(.*?)公民身份号码",
# "idNumber": r"公民身份号码\n(\d+)"
# }
# tempText = remove_blank_lines(text)
# # 提取信息
# info = {}
# for key, pattern in patterns.items():
# match = re.search(pattern, tempText,re.DOTALL)
# if match:
# tempStr = match.group(1).strip()
# info[key] = tempStr.replace("\n", "")
# return info
# def recongize_id_card(filePath:str):
# loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG")
# #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/032002200511-91445598.pdf")
# #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/fapiao.jpg")
# docs = loader.load()
# context = "\n".join([doc.page_content for doc in docs])
# print(context)