diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..2ab7314 Binary files /dev/null and b/.DS_Store differ diff --git a/OCRPython/.DS_Store b/OCRPython/.DS_Store index 5a23aaa..451627c 100644 Binary files a/OCRPython/.DS_Store and b/OCRPython/.DS_Store differ diff --git a/OCRPython/.idea/OCRPython.iml b/OCRPython/.idea/OCRPython.iml new file mode 100644 index 0000000..82ae1e8 --- /dev/null +++ b/OCRPython/.idea/OCRPython.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/OCRPython/.idea/inspectionProfiles/profiles_settings.xml b/OCRPython/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/OCRPython/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/OCRPython/.idea/modules.xml b/OCRPython/.idea/modules.xml new file mode 100644 index 0000000..ccb86f8 --- /dev/null +++ b/OCRPython/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/OCRPython/0.png b/OCRPython/0.png deleted file mode 100644 index 8a083fa..0000000 Binary files a/OCRPython/0.png and /dev/null differ diff --git a/OCRPython/add.py b/OCRPython/add.py deleted file mode 100644 index b2ba485..0000000 --- a/OCRPython/add.py +++ /dev/null @@ -1,16 +0,0 @@ -# -*- coding: utf-8 -*- -import sys -def helloword(filePath: str): - #print(f"helloword, filepath:{filePath}") - #print(f"Hello World!") - return "return Hello World!" - - -if __name__ == "__main__": - # print(f"len of parameters:{len(sys.argv)}") - # for i in range (1, len(sys.argv)): - # print(sys.argv[i]) - if len(sys.argv) >1: - jsonString = helloword(sys.argv[1]) - print(jsonString) - \ No newline at end of file diff --git a/OCRPython/IDRecognition b/OCRPython/backup/IDRecognition similarity index 100% rename from OCRPython/IDRecognition rename to OCRPython/backup/IDRecognition diff --git a/OCRPython/IDRecognition.py b/OCRPython/backup/IDRecognition.py similarity index 100% rename from OCRPython/IDRecognition.py rename to OCRPython/backup/IDRecognition.py diff --git a/OCRPython/ocr.py b/OCRPython/backup/ocr.py similarity index 100% rename from OCRPython/ocr.py rename to OCRPython/backup/ocr.py diff --git a/OCRPython/opencv.py b/OCRPython/backup/opencv.py similarity index 100% rename from OCRPython/opencv.py rename to OCRPython/backup/opencv.py diff --git a/OCRPython/padderOcrmain.py b/OCRPython/backup/padderOcrmain.py similarity index 100% rename from OCRPython/padderOcrmain.py rename to OCRPython/backup/padderOcrmain.py diff --git a/OCRPython/potencent.py b/OCRPython/backup/potencent.py similarity index 100% rename from OCRPython/potencent.py rename to OCRPython/backup/potencent.py diff --git a/OCRPython/binary.png b/OCRPython/binary.png deleted file mode 100644 index d387259..0000000 Binary files a/OCRPython/binary.png and /dev/null differ diff --git a/OCRPython/contours.png b/OCRPython/contours.png deleted file mode 100644 index 36a439c..0000000 Binary files a/OCRPython/contours.png and /dev/null differ diff --git a/OCRPython/dilation.png b/OCRPython/dilation.png deleted file mode 100644 index a9b9a60..0000000 Binary files a/OCRPython/dilation.png and /dev/null differ diff --git a/OCRPython/extractor/__pycache__/identitycard_extractor.cpython-312.pyc b/OCRPython/extractor/__pycache__/identitycard_extractor.cpython-312.pyc index f9fe49c..a89b0b0 100644 Binary files a/OCRPython/extractor/__pycache__/identitycard_extractor.cpython-312.pyc and b/OCRPython/extractor/__pycache__/identitycard_extractor.cpython-312.pyc differ diff --git a/OCRPython/extractor/__pycache__/identitycard_extractor.cpython-39.pyc b/OCRPython/extractor/__pycache__/identitycard_extractor.cpython-39.pyc index 4362240..f53fd1e 100644 Binary files a/OCRPython/extractor/__pycache__/identitycard_extractor.cpython-39.pyc and b/OCRPython/extractor/__pycache__/identitycard_extractor.cpython-39.pyc differ diff --git a/OCRPython/extractor/identitycard_extractor.py b/OCRPython/extractor/identitycard_extractor.py index 9899dd8..1ac2a0c 100644 --- a/OCRPython/extractor/identitycard_extractor.py +++ b/OCRPython/extractor/identitycard_extractor.py @@ -1,50 +1,66 @@ import re from extractor import Extractor -import json +from configs.basic_config import logger class IdentityCardExtractor(Extractor): - def extract_text(self,text:str)->dict: - patterns = { - "issuingAuthority": r"签发机关\n(.+?)\n", - "validTime": r"有效期限\n(.+?)\n", - "name": r"(\S+)\n(?:男|女)", - "gender": r"(\S+)民族", - "ethnicity": r"民族(\S+)", - "dateOfBirth": r"(\d+年\d+月\d+日)", - "address": r"住址\s*(.*?)公民身份号码", - "idNumber": r"(\d{18}|\d{17}[Xx])" - } - tempText = self.remove_blank_lines(text) - # 提取信息 - info = {} - for key, pattern in patterns.items(): - match = re.search(pattern, tempText,re.DOTALL) - if match: - tempStr = match.group(1).strip() - info[key] = tempStr.replace("\n", "") - return info + def extract_text(self, text:str)->dict: + try: + patterns = { + "issuingAuthority": r"签发机关\n(.+?)\n", + "validTime": r"有效期限\n(.+?)\n", + "name": r"(\S+)\n(?:男|女)", + "gender": r"(\S+)民族", + "ethnicity": r"民族(\S+)", + "dateOfBirth": r"(\d+年\d+月\d+日)", + "address": r"(住址|佳址)\s*(.*?)公民身份号码", + "idNumber": r"(\d{18}|\d{17}[Xx])" + } + tempText = self.remove_blank_lines(text) + # 提取信息 + info = {} + for key, pattern in patterns.items(): + match = re.search(pattern, tempText,re.DOTALL) + if match: + if "address" == key: + tempStr = match.group(2).strip() + else: + tempStr = match.group(1).strip() + info[key] = tempStr.replace("\n", "") + return info + except Exception as e: + print(e) + logger.error(e) + return {} def extract_textbyPaddle(self, text:str)->dict: - patterns = { - "issuingAuthority": r"签发机关\n(.+?)\n", - "validTime": r"有效期限\n(.+?)\n", - "name": r"姓名(.*?)\n", ##### - "gender": r"(\S)民族", - "ethnicity": r"民族(\S+)", - "dateOfBirth": r"(\d+年\d+月\d+日)", - "address": r"住址|佳址\s*(.*?)公民身份号码", - "idNumber": r"(\d{18}|\d{17}[Xx])" - } + try: + patterns = { + "issuingAuthority": r"签发机关\n(.+?)\n", + "validTime": r"有效期限\n*(.+?)\n", + "name": r"姓名(.*?)\n", ##### + "gender": r"(\S)民族", + "ethnicity": r"民族(\S+)", + "dateOfBirth": r"(\d+年\d+月\d+日)", + "address": r"(住址|佳址)\s*(.*?)公民身份号码", + "idNumber": r"(\d{18}|\d{17}[Xx])" + } - tempText = self.remove_blank_lines(text) - # 提取信息 - info = {} - for key, pattern in patterns.items(): - match = re.search(pattern, tempText,re.DOTALL) - if match: - tempStr = match.group(1).strip() - info[key] = tempStr.replace("\n", "") - return info + tempText = self.remove_blank_lines(text) + # 提取信息 + info = {} + for key, pattern in patterns.items(): + match = re.search(pattern, tempText,re.DOTALL) + if match: + if "address" == key: + tempStr = match.group(2).strip() + else: + tempStr = match.group(1).strip() + info[key] = tempStr.replace("\n", "") + return info + except Exception as e: + print(e) + logger.error(e) + return {} class InvoiceExtractor(Extractor): @@ -66,8 +82,21 @@ class InvoiceExtractor(Extractor): # 幢2001室 # 公民身份号码 # 440203197306192118""" +text = """ +中华人民共和国 +居民身份证 +oo +签发机关宿州市公安局桥分局 +有效期限2023.01.18-2043.01.18 +姓名郭乾坤 +性别男民族汉 +出生1994年10月17日 +住址 安徽省宿州市场桥区朱仙 +庄镇郭庙村郭家组6号 +公民身份号码 +34220119941017327X +""" +extractor = IdentityCardExtractor() -# extractor = IdentityCardExtractor() - -# jsonstring = extractor.extract_textbyPaddle(text) -# print(jsonstring) \ No newline at end of file +jsonstring = extractor.extract_textbyPaddle(text) +print(jsonstring) \ No newline at end of file diff --git a/OCRPython/gray.png b/OCRPython/gray.png deleted file mode 100644 index 56d57d5..0000000 Binary files a/OCRPython/gray.png and /dev/null differ diff --git a/OCRPython/images/1.jpg b/OCRPython/images/1.jpg new file mode 100644 index 0000000..d4240d4 Binary files /dev/null and b/OCRPython/images/1.jpg differ diff --git a/OCRPython/images/2.jpg b/OCRPython/images/2.jpg new file mode 100644 index 0000000..8a4db39 Binary files /dev/null and b/OCRPython/images/2.jpg differ diff --git a/OCRPython/logs/ocr_reconginition.log b/OCRPython/logs/ocr_reconginition.log index 63bdcec..c421abd 100644 --- a/OCRPython/logs/ocr_reconginition.log +++ b/OCRPython/logs/ocr_reconginition.log @@ -213,3 +213,12 @@ 2024-04-22 11:01:54,640 - main.py[line:119] - INFO: 0 2024-04-22 11:01:54,819 - ocr.py[line:19] - INFO: from rapidocr_onnxruntime import RapidOCR 2024-04-22 11:01:57,426 - main.py[line:42] - INFO: extractIdCardInfo 耗时2.785860061645508秒 +2024-04-22 15:28:23,501 - maincopy.py[line:81] - ERROR: 'NoneType' object has no attribute 'strip' +2024-04-22 16:18:56,272 - maincopy.py[line:38] - ERROR: 'NoneType' object has no attribute 'shape' +2024-04-22 16:21:44,808 - maincopy.py[line:38] - ERROR: 'NoneType' object has no attribute 'shape' +2024-04-22 16:28:20,400 - maincopy.py[line:38] - ERROR: 'NoneType' object has no attribute 'shape' +2024-04-22 16:28:20,400 - maincopy.py[line:83] - ERROR: local variable 'jsonString' referenced before assignment +2024-04-22 16:29:46,501 - maincopy.py[line:38] - ERROR: 'NoneType' object has no attribute 'shape' +2024-04-22 16:29:46,502 - maincopy.py[line:83] - ERROR: local variable 'jsonString' referenced before assignment +2024-04-22 16:32:09,871 - maincopy.py[line:38] - ERROR: 'NoneType' object has no attribute 'shape' +2024-04-22 16:32:09,872 - maincopy.py[line:83] - ERROR: local variable 'jsonString' referenced before assignment diff --git a/OCRPython/maincopy.py b/OCRPython/maincopy.py index edb49d6..679d422 100644 --- a/OCRPython/maincopy.py +++ b/OCRPython/maincopy.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- import sys import os -import io from paddleocr import PaddleOCR, draw_ocr import time from configs.basic_config import logger @@ -29,25 +28,25 @@ def extractIdCardInfoByPath(filePath1: str, filePath2: str)->str: text += (line[1][0] + '\n') if os.path.exists(filePath2): - result = ocr.ocr(filePath1, cls=False) + result = ocr.ocr(filePath2, cls=False) for idx in range(len(result)): res = result[idx] for line in res: text += (line[1][0] + '\n') except Exception as e: logger.error(e) + print(e) - logger.info(f"text:{text}") - extractor = IdentityCardExtractor() - jsonString = extractor.extract_textbyPaddle(text) - end_time = time.time() # 记录结束时间 - execution_time = end_time - start_time # 计算执行时间 - logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒") + if 0 != len(text): + logger.info(f"text:{text}") + extractor = IdentityCardExtractor() + jsonString = extractor.extract_textbyPaddle(text) + end_time = time.time() # 记录结束时间 + execution_time = end_time - start_time # 计算执行时间 + logger.info(f"extractIdCardInfoByBase64Data 耗时{execution_time}秒") return jsonString def extractIdCardInfoByBase64Data(base64data1:str, base64Data2: str)->str: - image_data1 = "" - image_data2 = "" logger.info(f"extractIdCardInfoByBase64Data") start_time = time.time() # 记录结束时间 jsonString = "" @@ -89,12 +88,13 @@ def extractIdCardInfoByBase64Data(base64data1:str, base64Data2: str)->str: # base64_image_string = base64.b64encode(image_file.read()).decode('utf-8') # # jsonString = extractIdCardInfoByBase64Data(base64_image_string,"") +# jsonString = extractIdCardInfoByBase64Data("/Users/wangvivi/Desktop/Code/ocrtest/images/2.jpg","/Users/wangvivi/Desktop/Code/ocrtest/images/1.jpg") # print(jsonString) # -# jsonString = extractIdCardInfoByPath("/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG","") +#jsonString = extractIdCardInfoByPath("/Users/wangvivi/Desktop/Code/ocrtest/images/id_card.JPG","") +# jsonString = extractIdCardInfoByPath("/Users/wangvivi/Desktop/Code/ocrtest/images/2.jpg","/Users/wangvivi/Desktop/Code/ocrtest/images/1.jpg") # print(jsonString) - if __name__ == "__main__": try: logger.info(f"main.py len of parameter: {len(sys.argv)}") diff --git a/OCRTool/.DS_Store b/OCRTool/.DS_Store index bc5c4ed..62fee18 100644 Binary files a/OCRTool/.DS_Store and b/OCRTool/.DS_Store differ diff --git a/OCRTool/.idea/.DS_Store b/OCRTool/.idea/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/OCRTool/.idea/.DS_Store differ diff --git a/OCRTool/.idea/vcs.xml b/OCRTool/.idea/vcs.xml new file mode 100644 index 0000000..62bd7a0 --- /dev/null +++ b/OCRTool/.idea/vcs.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/OCRTool/.idea/workspace.xml b/OCRTool/.idea/workspace.xml new file mode 100644 index 0000000..83b7f2f --- /dev/null +++ b/OCRTool/.idea/workspace.xml @@ -0,0 +1,204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { + "associatedIndex": 2 +} + + + + + + + + + + + + + + + + + + + + + + + + + $USER_HOME$/.subversion + + + + + 1693992725062 + + + + + + + + + + + + + + + + + + + + + file://$PROJECT_DIR$/src/main/java/com/bonus/core/RecognitionController.java + 9 + + + file://$PROJECT_DIR$/src/main/java/com/bonus/core/OCRRecognition.java + 28 + + + file://$PROJECT_DIR$/src/main/java/com/bonus/core/OCRRecognition.java + 79 + + + + + + +