94 lines
2.8 KiB
Python
94 lines
2.8 KiB
Python
import os
|
||
import sys
|
||
import json
|
||
import logging
|
||
import time
|
||
import glob
|
||
import re
|
||
import argparse
|
||
import shutil
|
||
import traceback
|
||
from datetime import datetime
|
||
from pdf_to_table import process_pdf_to_table
|
||
|
||
# Configure logging. No stream or filename is passed, so basicConfig installs
# its default handler (a StreamHandler on sys.stderr).
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT)

# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
|
||
|
||
def process_pdf(pdf_path, output_dir, confidence_threshold=0.6):
    """Process a single PDF and report the outcome as JSON on stdout.

    Calls ``process_pdf_to_table`` on the folder containing ``pdf_path``,
    writes ``<pdf name>_result.json`` into ``output_dir`` and prints a
    one-line JSON status object to standard output (the original comments
    state that a Java caller reads it).

    Args:
        pdf_path: Path of the PDF file to process.
        output_dir: Directory for the result file; created if missing.
        confidence_threshold: Forwarded unchanged to ``process_pdf_to_table``.

    Returns:
        0 on success, 1 if any step raised an exception. Exceptions are
        reported through the JSON status object and never propagate.
    """
    try:
        # Fail early instead of reporting success for a missing input file.
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"输入PDF文件不存在: {pdf_path}")

        # Create the output directory.
        os.makedirs(output_dir, exist_ok=True)

        # Derive the file name and its extension-less stem.
        pdf_filename = os.path.basename(pdf_path)
        pdf_name = os.path.splitext(pdf_filename)[0]

        logger.info(f"开始处理PDF: {pdf_path}")

        # os.path.dirname() returns '' for a bare file name; fall back to '.'
        # so the helper always receives a usable folder path.
        # NOTE(review): the helper is given the whole folder, not the single
        # file -- confirm it is not expected to process only `pdf_path`.
        result = process_pdf_to_table(
            pdf_folder=os.path.dirname(pdf_path) or '.',
            output_folder=output_dir,
            confidence_threshold=confidence_threshold
        )

        # Build the JSON result document.
        result_json = {
            'status': 'success',
            'pdf_file': pdf_filename,
            'result': result,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        # Serialize before opening the file so a non-serializable `result`
        # cannot leave a truncated JSON file behind.
        serialized = json.dumps(result_json, ensure_ascii=False, indent=2)
        result_path = os.path.join(output_dir, f"{pdf_name}_result.json")
        with open(result_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

        # Report the result path on stdout for the calling process.
        print(json.dumps({
            'status': 'success',
            'result_path': result_path
        }))

        return 0

    except Exception as e:
        error_msg = f"处理PDF时出错: {str(e)}"
        # logger.exception logs at ERROR level and appends the traceback.
        logger.exception(error_msg)
        # Report the failure on stdout for the calling process.
        print(json.dumps({
            'status': 'error',
            'error': error_msg
        }))
        return 1
|
||
|
||
def main(argv=None):
    """Parse command-line options and process the requested PDF.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which preserves the behaviour of
            calling ``main()`` with no arguments.

    Returns:
        The status code returned by ``process_pdf``.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid or
            ``--help`` is requested.
    """
    # Define the command-line interface.
    parser = argparse.ArgumentParser(description='PDF OCR处理工具')
    parser.add_argument('--input', '-i', required=True, help='输入PDF文件路径')
    parser.add_argument('--output', '-o', required=True, help='输出目录')
    parser.add_argument('--confidence', '-c', type=float, default=0.6, help='置信度阈值')

    args = parser.parse_args(argv)

    # Process the PDF and hand its status code back to the caller.
    return process_pdf(args.input, args.output, args.confidence)
|
||
|
||
if __name__ == "__main__":
    # Script entry point: run main() and exit with its status code.
    # SystemExit is not an Exception subclass, so it passes through the
    # handler below untouched.
    try:
        exit_code = main()
    except Exception as e:
        logger.error(f"程序执行出错: {str(e)}")
        # Emit the failure as JSON on stdout, then exit with status 1.
        failure = {
            'status': 'error',
            'error': str(e)
        }
        print(json.dumps(failure))
        exit_code = 1
    sys.exit(exit_code)