94 lines
2.8 KiB
Python
94 lines
2.8 KiB
Python
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
import logging
|
|||
|
|
import time
|
|||
|
|
import glob
|
|||
|
|
import re
|
|||
|
|
import argparse
|
|||
|
|
import shutil
|
|||
|
|
import traceback
|
|||
|
|
from datetime import datetime
|
|||
|
|
from pdf_to_table import process_pdf_to_table
|
|||
|
|
|
|||
|
|
# Configure logging once at import time: INFO and above, one timestamped
# line per record.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the functions in this script.
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
def process_pdf(pdf_path, output_dir, confidence_threshold=0.6):
    """Process one PDF and report the outcome as JSON on standard output.

    Calls ``process_pdf_to_table`` on the folder that contains ``pdf_path``,
    wraps its return value in a result document, writes that document to
    ``<output_dir>/<pdf name>_result.json`` and prints a one-line JSON status
    on stdout, which the calling Java process reads.

    Args:
        pdf_path: Path of the PDF file to process.
        output_dir: Directory for the result file; created if missing.
        confidence_threshold: Forwarded unchanged to ``process_pdf_to_table``.

    Returns:
        0 on success. 1 if any ``Exception`` was raised; the error is logged
        and reported on stdout as ``{"status": "error", "error": ...}``.
    """
    try:
        # Fail early instead of reporting success for a PDF that is not there.
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

        os.makedirs(output_dir, exist_ok=True)

        pdf_filename = os.path.basename(pdf_path)
        pdf_name = os.path.splitext(pdf_filename)[0]

        logger.info(f"开始处理PDF: {pdf_path}")

        # A bare file name has an empty dirname; use the current directory so
        # the downstream call never receives an empty folder path.
        pdf_folder = os.path.dirname(pdf_path) or os.curdir

        result = process_pdf_to_table(
            pdf_folder=pdf_folder,
            output_folder=output_dir,
            confidence_threshold=confidence_threshold,
        )

        result_json = {
            'status': 'success',
            'pdf_file': pdf_filename,
            'result': result,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        # Serialize before opening the file: if `result` is not JSON
        # serializable, no truncated result file is left behind.
        payload = json.dumps(result_json, ensure_ascii=False, indent=2)

        result_path = os.path.join(output_dir, f"{pdf_name}_result.json")
        with open(result_path, 'w', encoding='utf-8') as f:
            f.write(payload)

        # Report the result location on stdout for the Java caller.
        print(json.dumps({
            'status': 'success',
            'result_path': result_path,
        }))

        return 0

    except Exception as e:
        error_msg = f"处理PDF时出错: {str(e)}"
        # logger.exception records the traceback along with the message.
        logger.exception(error_msg)
        # Report the failure on stdout for the Java caller.
        print(json.dumps({
            'status': 'error',
            'error': error_msg,
        }))
        return 1
|
|||
|
|
|
|||
|
|
def main():
    """Parse the command line and process the requested PDF.

    Options: ``--input``/``-i`` (required), ``--output``/``-o`` (required)
    and ``--confidence``/``-c`` (float, default 0.6).

    Returns:
        The value returned by ``process_pdf`` for the parsed arguments.
    """
    cli = argparse.ArgumentParser(description='PDF OCR处理工具')
    cli.add_argument('--input', '-i', required=True, help='输入PDF文件路径')
    cli.add_argument('--output', '-o', required=True, help='输出目录')
    cli.add_argument(
        '--confidence', '-c', type=float, default=0.6, help='置信度阈值'
    )
    options = cli.parse_args()

    # Hand the parsed values to the worker and pass its status back.
    status = process_pdf(options.input, options.output, options.confidence)
    return status
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Script entry point: exit with main()'s return value, or with 1 after
    # logging and printing a JSON error if main() raises an Exception.
    try:
        exit_status = main()
    except Exception as exc:
        logger.error(f"程序执行出错: {str(exc)}")
        failure = {
            'status': 'error',
            'error': str(exc),
        }
        print(json.dumps(failure))
        exit_status = 1
    sys.exit(exit_status)
|