# payoff_OCR/main.py
# File-viewer metadata: 94 lines, 2.8 KiB, Python; 2025-07-15 16:52:10 +08:00
import os
import sys
import json
import logging
import time
import glob
import re
import argparse
import shutil
import traceback
from datetime import datetime
from pdf_to_table import process_pdf_to_table
# Configure logging: INFO level, "timestamp - level - message" records
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by the functions in this file
logger = logging.getLogger(__name__)
def process_pdf(pdf_path: str, output_dir: str, confidence_threshold: float = 0.6) -> int:
    """Run the PDF-to-table pipeline for one PDF and report the outcome as JSON.

    ``process_pdf_to_table`` is invoked on the folder that contains
    ``pdf_path`` (not on the single file), with ``output_dir`` as its output
    folder. Its return value is wrapped together with a status, the PDF file
    name and a timestamp, and saved as ``<output_dir>/<pdf name>_result.json``.
    A one-line JSON status object is printed to stdout so the calling Java
    process can read it.

    Args:
        pdf_path: Path of the input PDF file.
        output_dir: Directory for the result file; created if it is missing.
        confidence_threshold: Forwarded unchanged to ``process_pdf_to_table``.

    Returns:
        0 on success, 1 if any step raised an exception (the error is logged
        and also reported as JSON on stdout).
    """
    try:
        # Fail fast instead of reporting success for an input that does not exist
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"输入PDF文件不存在: {pdf_path}")

        # Create the output directory
        os.makedirs(output_dir, exist_ok=True)

        # Derive the PDF file name and its extension-less stem
        pdf_filename = os.path.basename(pdf_path)
        pdf_name = os.path.splitext(pdf_filename)[0]
        logger.info(f"开始处理PDF: {pdf_path}")

        # os.path.dirname() returns '' for a bare file name; fall back to the
        # current directory so the pipeline always receives a usable folder.
        pdf_folder = os.path.dirname(pdf_path) or '.'

        # Process the PDF
        result = process_pdf_to_table(
            pdf_folder=pdf_folder,
            output_folder=output_dir,
            confidence_threshold=confidence_threshold
        )

        # Build the JSON result
        result_json = {
            'status': 'success',
            'pdf_file': pdf_filename,
            'result': result,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        # Serialise before opening the file so that a non-serialisable result
        # cannot leave a truncated result file behind.
        payload = json.dumps(result_json, ensure_ascii=False, indent=2)
        result_path = os.path.join(output_dir, f"{pdf_name}_result.json")
        with open(result_path, 'w', encoding='utf-8') as f:
            f.write(payload)

        # Print the result path to stdout for the Java caller to read
        print(json.dumps({
            'status': 'success',
            'result_path': result_path
        }))
        return 0
    except Exception as e:
        error_msg = f"处理PDF时出错: {str(e)}"
        # logger.exception logs at ERROR level and appends the traceback
        logger.exception(error_msg)
        # Print the error to stdout for the Java caller to read
        print(json.dumps({
            'status': 'error',
            'error': error_msg
        }))
        return 1
def main(argv=None):
    """Parse the command-line options and process the requested PDF.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the normal
            command-line behaviour, so existing ``main()`` calls are unaffected.

    Returns:
        The status code returned by ``process_pdf``.

    Raises:
        SystemExit: Raised by ``argparse`` for invalid arguments or ``--help``.
    """
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(description='PDF OCR处理工具')
    parser.add_argument('--input', '-i', required=True, help='输入PDF文件路径')
    parser.add_argument('--output', '-o', required=True, help='输出目录')
    parser.add_argument('--confidence', '-c', type=float, default=0.6, help='置信度阈值')
    args = parser.parse_args(argv)

    # Process the PDF
    return process_pdf(args.input, args.output, args.confidence)
if __name__ == "__main__":
    # Script entry point: exit with main()'s status. On an unexpected error,
    # log it, print an error JSON object to stdout, and exit with status 1.
    try:
        exit_status = main()
    except Exception as exc:
        failure_text = str(exc)
        logger.error(f"程序执行出错: {failure_text}")
        print(json.dumps({'status': 'error', 'error': failure_text}))
        exit_status = 1
    sys.exit(exit_status)