# ComponentDevelopment/OCRPython/extractor/extractor.py

from abc import ABC, abstractmethod
import re
class Extractor(ABC):
    """Base class for extractors that turn recognized text into a dictionary.

    Subclasses must implement ``extract_text`` and ``extract_textbyPaddle``.
    Deriving from ``ABC`` is what makes the ``@abstractmethod`` markers
    effective: a subclass missing either method cannot be instantiated.
    The remaining methods are shared helpers for cleaning text and for
    reading fields out of an ID number string.
    """

    @abstractmethod
    def extract_text(self, text: str) -> dict:
        """Extract identity-card or invoice information into a dictionary."""

    @abstractmethod
    def extract_textbyPaddle(self, text: str) -> dict:
        """Extract identity-card or invoice information into a dictionary by Paddle."""

    def remove_blank_lines(self, text: str) -> str:
        """Return ``text`` without its empty or whitespace-only lines.

        The surviving lines are joined back together with a single newline
        between them, so any trailing newline of the input is dropped.
        """
        # splitlines() breaks the text into lines; strip() is only used as a
        # blankness test, so kept lines retain their original whitespace.
        lines = [line for line in text.splitlines() if line.strip()]
        return '\n'.join(lines)

    def extract_birthday_from_id(self, id_number: str) -> str:
        """Format the birth date embedded in an 18-character ID number.

        Characters 7-14 (slice ``[6:14]``) are read as ``YYYYMMDD``; month
        and day are printed without leading zeros, e.g. ``"1990年3月7日"``.

        Returns:
            The formatted date, or ``""`` when ``id_number`` is not a string
            of length 18 or its date characters are not all decimal digits.
        """
        if not isinstance(id_number, str) or len(id_number) != 18:
            return ""
        date_part = id_number[6:14]
        # isdecimal() accepts exactly the characters int() can parse, so a
        # misread character (e.g. the letter "O") yields "" instead of a
        # ValueError.
        if not date_part.isdecimal():
            return ""
        year = date_part[:4]
        month = int(date_part[4:6])
        day = int(date_part[6:8])
        return f"{year}年{month}月{day}日"

    def get_gender_from_id(self, id_num: str) -> str:
        """Derive the gender from the second-to-last character of an ID number.

        An odd digit maps to ``'男'`` and an even digit to ``'女'``. The
        length of ``id_num`` is not otherwise validated.

        Returns:
            ``'男'`` or ``'女'``, or ``""`` when ``id_num`` is not a string,
            is shorter than two characters, or the inspected character is
            not a decimal digit (matching ``extract_birthday_from_id``'s
            empty-string convention rather than raising).
        """
        if not isinstance(id_num, str) or len(id_num) < 2:
            return ""
        gender_char = id_num[-2]
        if not gender_char.isdecimal():
            return ""
        return '男' if int(gender_char) % 2 == 1 else '女'

# def remove_blank_lines(text:str)->str:
#     # Split the string into a list of lines with splitlines(), dropping blank lines
#     lines = [line for line in text.splitlines() if line.strip()]
#     # Rejoin the remaining lines into a single string with join()
#     cleaned_text = '\n'.join(lines)
#     return cleaned_text

# def extract_id_card_info(text:str)->dict:
#     patterns = {
#         "issuingAuthority": r"签发机关\n(.+?)\n",
#         "validTime": r"有效期限\n(.+?)\n",
#         "name": r"(\S+)\n(?:男|女)",
#         "gender": r"(\S+)民族",
#         "ethnicity": r"民族(\S+)",
#         "dateOfBirth": r"(\d+年\d+月\d+日)",
#         "address": r"住址\s*(.*?)公民身份号码",
#         "idNumber": r"公民身份号码\n(\d+)"
#     }
#     tempText = remove_blank_lines(text)
#     # Extract the fields
#     info = {}
#     for key, pattern in patterns.items():
#         match = re.search(pattern, tempText,re.DOTALL)
#         if match:
#             tempStr = match.group(1).strip()
#             info[key] = tempStr.replace("\n", "")
#     return info

# def recongize_id_card(filePath:str):
#     loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG")
#     #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/032002200511-91445598.pdf")
#     #loader = RapidOCRLoader(file_path="/Users/wangvivi/Desktop/Code/ocrtest/images/fapiao.jpg")

#     docs = loader.load()
#     context = "\n".join([doc.page_content for doc in docs])
#     print(context)