payoff_OCR/main.py

94 lines
2.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sys
import json
import logging
import time
import glob
import re
import argparse
import shutil
import traceback
from datetime import datetime
from pdf_to_table import process_pdf_to_table
# Logging setup: INFO level, timestamped "time - level - message" records.
# No stream/handler is passed, so basicConfig's default handler is used,
# which keeps log lines separate from the JSON status printed on stdout.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger shared by every function in this script.
logger = logging.getLogger(__name__)
def process_pdf(pdf_path, output_dir, confidence_threshold=0.6):
    """Run table extraction for one PDF and write a JSON result file.

    The folder containing the PDF is handed to ``process_pdf_to_table``;
    whatever that call returns is wrapped in a status record and saved as
    ``<pdf name>_result.json`` inside ``output_dir``. A one-line JSON status
    object is also printed to stdout so the calling (Java) process can read
    the outcome.

    Args:
        pdf_path: Path to the input PDF file. Must point to an existing file.
        output_dir: Directory for the result file; created if missing.
        confidence_threshold: Forwarded unchanged to
            ``process_pdf_to_table``.

    Returns:
        0 on success, 1 if any step raised an exception (the error is logged
        and reported on stdout instead of propagating).
    """
    try:
        # Fail early with a clear message; otherwise the folder-level
        # processor below could run and report success without this file.
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

        os.makedirs(output_dir, exist_ok=True)

        pdf_filename = os.path.basename(pdf_path)
        pdf_name = os.path.splitext(pdf_filename)[0]
        logger.info(f"开始处理PDF: {pdf_path}")

        # dirname() of a bare filename is "", which is not a usable folder
        # path; fall back to the current directory in that case.
        pdf_folder = os.path.dirname(pdf_path) or os.curdir

        # NOTE(review): the processor receives the whole folder, not just
        # this file -- confirm it does not pick up sibling PDFs unintentionally.
        result = process_pdf_to_table(
            pdf_folder=pdf_folder,
            output_folder=output_dir,
            confidence_threshold=confidence_threshold,
        )

        result_json = {
            'status': 'success',
            'pdf_file': pdf_filename,
            'result': result,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        # Serialize before opening the file so a non-serializable result
        # cannot leave a truncated JSON file behind.
        serialized = json.dumps(result_json, ensure_ascii=False, indent=2)
        result_path = os.path.join(output_dir, f"{pdf_name}_result.json")
        with open(result_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

        # Report the result location on stdout for the Java caller.
        print(json.dumps({
            'status': 'success',
            'result_path': result_path
        }))
        return 0
    except Exception as e:
        error_msg = f"处理PDF时出错: {str(e)}"
        # logger.exception keeps the traceback in the log for diagnosis.
        logger.exception(error_msg)
        # Report the failure on stdout for the Java caller.
        print(json.dumps({
            'status': 'error',
            'error': error_msg
        }))
        return 1
def main():
    """Parse the command-line options and process the requested PDF.

    Returns:
        The status code returned by ``process_pdf``.
    """
    cli = argparse.ArgumentParser(description='PDF OCR处理工具')

    # (flags, keyword arguments) for each option, registered in this order.
    option_specs = (
        (('--input', '-i'), {'required': True, 'help': '输入PDF文件路径'}),
        (('--output', '-o'), {'required': True, 'help': '输出目录'}),
        (('--confidence', '-c'), {'type': float, 'default': 0.6, 'help': '置信度阈值'}),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    opts = cli.parse_args()
    return process_pdf(opts.input, opts.output, opts.confidence)
if __name__ == "__main__":
    # Script entry point: run main() and exit with its status code. Any
    # unexpected exception is logged, reported as a JSON error object on
    # stdout, and turned into exit status 1.
    try:
        exit_status = main()
    except Exception as exc:
        failure_text = str(exc)
        logger.error(f"程序执行出错: {failure_text}")
        print(json.dumps({
            'status': 'error',
            'error': failure_text
        }))
        exit_status = 1
    sys.exit(exit_status)