import os
import sys
import json
import logging
import time
import glob
import re
import argparse
import shutil
import traceback
from datetime import datetime

from pdf_to_table import process_pdf_to_table

# Configure logging. basicConfig installs a stream handler that writes to
# stderr by default, so log lines do not mix with the JSON status on stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


def process_pdf(pdf_path, output_dir, confidence_threshold=0.6):
    """Process a single PDF file and write a JSON summary of the result.

    The function creates ``output_dir`` if needed, runs
    ``process_pdf_to_table`` on the folder that contains ``pdf_path``, stores
    the returned value in ``<pdf name>_result.json`` inside ``output_dir`` and
    prints a one-line JSON status object on stdout for the calling process.

    Args:
        pdf_path: Path of the PDF file to process.
        output_dir: Directory that receives the outputs and the result JSON.
        confidence_threshold: Threshold forwarded unchanged to
            ``process_pdf_to_table``.

    Returns:
        0 on success, 1 if any step raised an exception. Exceptions are not
        propagated; they are logged and reported in the stdout JSON.
    """
    try:
        # Fail early with a clear message instead of reporting success for a
        # PDF that does not exist.
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

        # Create the output directory.
        os.makedirs(output_dir, exist_ok=True)

        # Derive the PDF file name and its stem (name without extension).
        pdf_filename = os.path.basename(pdf_path)
        pdf_name = os.path.splitext(pdf_filename)[0]

        # os.path.dirname() returns '' for a bare file name such as "a.pdf";
        # fall back to the current directory so the helper gets a usable path.
        pdf_folder = os.path.dirname(pdf_path) or os.curdir

        logger.info(f"开始处理PDF: {pdf_path}")

        # NOTE(review): the helper is given the containing folder, not the
        # file itself - confirm whether it processes every PDF in that folder.
        result = process_pdf_to_table(
            pdf_folder=pdf_folder,
            output_folder=output_dir,
            confidence_threshold=confidence_threshold
        )

        # Build the JSON result document.
        result_json = {
            'status': 'success',
            'pdf_file': pdf_filename,
            'result': result,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        # Save the result next to the other outputs.
        result_path = os.path.join(output_dir, f"{pdf_name}_result.json")
        with open(result_path, 'w', encoding='utf-8') as f:
            json.dump(result_json, f, ensure_ascii=False, indent=2)

        # Print the result path on stdout (read by the Java caller).
        print(json.dumps({
            'status': 'success',
            'result_path': result_path
        }))

        return 0

    except Exception as e:
        error_msg = f"处理PDF时出错: {str(e)}"
        # logger.exception logs at ERROR level and appends the traceback, so
        # the failure can be diagnosed from the log.
        logger.exception(error_msg)

        # Print the error on stdout (read by the Java caller).
        print(json.dumps({
            'status': 'error',
            'error': error_msg
        }))

        return 1


def main():
    """Parse the command-line arguments and process the requested PDF.

    Returns:
        The exit code produced by ``process_pdf`` (0 on success, 1 on error).
    """
    # Parse command-line arguments.
    parser = argparse.ArgumentParser(description='PDF OCR处理工具')
    parser.add_argument('--input', '-i', required=True, help='输入PDF文件路径')
    parser.add_argument('--output', '-o', required=True, help='输出目录')
    parser.add_argument('--confidence', '-c', type=float, default=0.6, help='置信度阈值')

    args = parser.parse_args()

    # Process the PDF.
    return process_pdf(args.input, args.output, args.confidence)


if __name__ == "__main__":
    try:
        sys.exit(main())
    except Exception as e:
        # Last-resort handler: SystemExit is not an Exception subclass, so
        # normal exits (including argparse errors) pass through untouched.
        logger.exception(f"程序执行出错: {str(e)}")
        print(json.dumps({
            'status': 'error',
            'error': str(e)
        }))
        sys.exit(1)