ocr识别融合

This commit is contained in:
cwchen 2025-04-16 18:10:22 +08:00
parent c0171e6c03
commit e82bc335c6
8 changed files with 325 additions and 39 deletions

View File

@ -0,0 +1,19 @@
package com.ytlk.entity;
import lombok.Data;
/**
 * Configuration bean for launching the external Python OCR script.
 * <p>
 * {@code OcrUtil} fills an instance of this class by parsing a JSON configuration
 * file with fastjson, then reads the values through the Lombok-generated getters.
 *
 * @className:ConfigVo
 * @author:cwchen
 * @date:2025-04-16-11:20
 * @version:1.0
 * @description: configuration entity
 */
@Data
public class ConfigVo {
/** Python executable; used by OcrUtil as the first element of the process command line. */
private String python;
/** Path of the Python script to run; OcrUtil resolves it to an absolute path before launching. */
private String pythonScript;
/** Base output directory; OcrUtil appends a timestamped sub-directory and passes it as {@code --output}. */
private String output;
/** Working directory that OcrUtil sets for the launched process. */
private String directory;
}

View File

@ -0,0 +1,19 @@
package com.ytlk.entity;
import lombok.Data;
/**
 * Value object for the OCR result JSON.
 * <p>
 * {@code OcrUtil.ocrHandleResult} parses the {@code *_result.json} file written by the
 * OCR script into this class with fastjson.
 * NOTE(review): the snake_case field names presumably mirror the JSON keys emitted by
 * the script - confirm before renaming any of them.
 *
 * @className:OcrHandleVo
 * @author:cwchen
 * @date:2025-04-16-13:42
 * @version:1.0
 * @description: OCR recognition result value object
 */
@Data
public class OcrHandleVo {
/** Presumably the PDF file the result belongs to; not read by the code visible here - TODO confirm. */
private String pdf_file;
/** Presumably the recognized names; not read by the code visible here - TODO confirm. */
private String[] name;
/** Presumably the recognized amounts; not read by the code visible here - TODO confirm. */
private String[] money;
/** Combined entries that OcrUtil splits on "-" into a name (first part) and an amount (second part). */
private String[] name_money;
}

View File

@ -0,0 +1,46 @@
package com.ytlk.fusion;
import com.alibaba.fastjson.JSONObject;
import java.io.*;
/**
 * Manual smoke test that launches the Python OCR script with hard-coded local paths
 * and echoes everything the script prints.
 *
 * @className:test
 * @author:cwchen
 * @date:2025-04-16-9:06
 * @version:1.0
 * @description: manual check of the Java to Python process invocation
 */
public class test {

    /**
     * Starts the script, prints its combined stdout/stderr line by line, then prints
     * the exit code.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Build the command line: interpreter, script, then the named arguments.
            String python = "D:\\miniconda3\\envs\\ocr\\python.exe";
            String pythonScript = new File("C:\\Users\\10488\\Desktop\\ocr-release\\main.py").getAbsolutePath();
            String param1 = "C:\\Users\\10488\\Desktop\\1.pdf";
            String param2 = "C:\\Users\\10488\\Desktop\\test (2)";
            ProcessBuilder pb = new ProcessBuilder(
                    python,
                    pythonScript,
                    "--input", param1,
                    "--output", param2
            );
            pb.directory(new File("C:\\Users\\10488\\Desktop\\ocr-release"));
            // Merge stderr into stdout. Otherwise nothing reads the child's stderr pipe,
            // and a script that logs heavily to stderr blocks once the pipe buffer is
            // full while this loop waits forever on stdout.
            pb.redirectErrorStream(true);
            Process p = pb.start();
            // Read the output; try-with-resources releases the pipe when done.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(p.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                }
            }
            // Wait for the process to finish.
            int exitCode = p.waitFor();
            System.out.println("Python脚本执行完毕退出码: " + exitCode);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever owns this thread.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,21 @@
package com.ytlk.fusion;
import com.ytlk.ocr.vo.UserVo;
import com.ytlk.util.OcrUtil;
import java.util.List;
/**
 * Manual driver: runs the OCR pipeline on one sample PDF and dumps the parsed rows.
 *
 * @className:test2
 * @author:cwchen
 * @date:2025-04-16-11:24
 * @version:1.0
 * @description: manual check of OcrUtil.ocrHandle
 */
public class test2 {

    /**
     * Recognizes the sample PDF and prints the resulting list to stderr.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String samplePdf = "C:\\Users\\10488\\Desktop\\3.pdf";
        final List<UserVo> recognized = OcrUtil.ocrHandle(samplePdf);
        System.err.println(recognized);
    }
}

View File

@ -162,8 +162,8 @@ public class OCRSwingArea extends JFrame {
JFileChooser fc = new JFileChooser(); JFileChooser fc = new JFileChooser();
fc.setMultiSelectionEnabled(true); fc.setMultiSelectionEnabled(true);
fc.setFileSelectionMode(JFileChooser.FILES_ONLY); fc.setFileSelectionMode(JFileChooser.FILES_ONLY);
// fc.setFileFilter(new FileNameExtensionFilter("PDF Documents", "pdf")); fc.setFileFilter(new FileNameExtensionFilter("PDF Documents", "pdf"));
fc.setFileFilter(new FileNameExtensionFilter("Excel Files (*.xls, *.xlsx)", "xls", "xlsx")); // fc.setFileFilter(new FileNameExtensionFilter("Excel Files (*.xls, *.xlsx)", "xls", "xlsx"));
fc.setAcceptAllFileFilterUsed(false); fc.setAcceptAllFileFilterUsed(false);
int val = fc.showOpenDialog(null); // 文件打开对话框 int val = fc.showOpenDialog(null); // 文件打开对话框
if (val == fc.APPROVE_OPTION) { if (val == fc.APPROVE_OPTION) {

View File

@ -5,6 +5,7 @@ import com.ytlk.ocr.util.CompareDataUtil;
import com.ytlk.ocr.util.FileUtils; import com.ytlk.ocr.util.FileUtils;
import com.ytlk.ocr.vo.UserErrorVo; import com.ytlk.ocr.vo.UserErrorVo;
import com.ytlk.ocr.vo.UserVo; import com.ytlk.ocr.vo.UserVo;
import com.ytlk.util.OcrUtil;
import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -40,46 +41,22 @@ public class HandleDataUtil {
List<Future<Map<String, Object>>> futures3 = new ArrayList<>(); List<Future<Map<String, Object>>> futures3 = new ArrayList<>();
// excel表格读取数据 // excel表格读取数据
for (int i = 0; i < rowCount; i++) { for (int i = 0; i < rowCount; i++) {
int finalI = i; Map<String, Object> map = new HashMap<>();
Future<Map<String, Object>> future = executor.submit(new Callable<Map<String, Object>>() { File file = (File) OCRSwingArea.table.getValueAt(i, 0);
@Override String path = file.getAbsoluteFile().toPath().toString();
public Map<String, Object> call() throws Exception { List<UserVo> userVos = FileUtils.getExcelUsers(path);
Map<String, Object> map = new HashMap<>(); map.put("fileName", file.getName());
File file = (File) OCRSwingArea.table.getValueAt(finalI, 0); map.put("list", userVos);
System.out.println(file.getName()); excelDataList.add(map);
String path = file.getAbsoluteFile().toPath().toString();
List<UserVo> userVos = FileUtils.getExcelUsers(path);
map.put("fileName", file.getName());
map.put("list", userVos);
return map;
}
});
futures.add(future);
} }
// pdf读取数据 // pdf读取数据
for (int i = 0; i < rowCount2; i++) { for (int i = 0; i < rowCount2; i++) {
int finalI = i; Map<String, Object> map = new HashMap<>();
Future<Map<String, Object>> future = executor.submit(new Callable<Map<String, Object>>() { File file = (File) OCRSwingArea.table2.getValueAt(i, 0);
@Override String path = file.getAbsoluteFile().toPath().toString();
public Map<String, Object> call() throws Exception { List<UserVo> userVos = OcrUtil.ocrHandle(path);
Map<String, Object> map = new HashMap<>(); map.put("fileName", file.getName());
File file = (File) OCRSwingArea.table2.getValueAt(finalI, 0); map.put("list", userVos);
System.out.println(file.getName());
String path = file.getAbsoluteFile().toPath().toString();
List<UserVo> userVos = FileUtils.getExcelUsers(path);
map.put("fileName", file.getName());
map.put("list", userVos);
return map;
}
});
futures2.add(future);
}
for (Future<Map<String, Object>> future : futures) {
Map<String, Object> map = future.get();
excelDataList.add(map);
}
for (Future<Map<String, Object>> future : futures2) {
Map<String, Object> map = future.get();
pdfDataList.add(map); pdfDataList.add(map);
} }
// 校验excel表格数据 // 校验excel表格数据

View File

@ -0,0 +1,38 @@
package com.ytlk.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
/**
 * File helper utilities.
 *
 * @className:FileUtil
 * @author:cwchen
 * @date:2025-04-16-11:18
 * @version:1.0
 * @description: file utility class
 */
public class FileUtil {

    /**
     * Reads a text file and returns all of its lines concatenated, without line
     * separators between them.
     * <p>
     * The file is decoded with the platform default charset (the behavior of
     * {@link FileReader}).
     * NOTE(review): callers parse the result as JSON; confirm the files are written in
     * the platform charset, otherwise non-ASCII text will be garbled.
     *
     * @param filePath path of the file to read
     * @return the concatenated content (empty string for an empty file), or
     *         {@code null} when the file does not exist or cannot be read
     */
    public static String readDataFromFile(String filePath) {
        File file = new File(filePath);
        if (!file.exists()) {
            return null;
        }
        // Created before the try so a failure while opening the file can no longer
        // leave a null buffer to be dereferenced afterwards.
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
        } catch (IOException e) {
            // Covers "exists but cannot be opened" (e.g. a directory) as well as read
            // errors; report it and signal "no data" instead of returning a fragment.
            e.printStackTrace();
            return null;
        }
        return content.toString();
    }
}

View File

@ -0,0 +1,166 @@
package com.ytlk.util;
import com.alibaba.fastjson.JSON;
import com.ytlk.entity.ConfigVo;
import com.ytlk.entity.OcrHandleVo;
import com.ytlk.ocr.vo.UserVo;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
 * OCR recognition utility. Runs the configured Python OCR script on a file, then
 * converts the JSON result the script writes into {@link UserVo} rows.
 *
 * @className:OcrUtil
 * @author:cwchen
 * @date:2025-04-16-11:08
 * @version:1.0
 * @description: OCR recognition utility class
 */
public class OcrUtil {

    /** Loaded configuration; may be {@code null} if the configuration could not be loaded. */
    static ConfigVo configVo = new ConfigVo();

    /** Configuration file location used when no override is supplied. */
    private static final String DEFAULT_CONFIG_PATH = "C:\\Users\\10488\\Desktop\\orc\\config.json";

    /**
     * Path the configuration is read from. Defaults to {@link #DEFAULT_CONFIG_PATH};
     * can be overridden with the {@code ocr.config.path} system property.
     */
    private static final String configPath = System.getProperty("ocr.config.path", DEFAULT_CONFIG_PATH);

    /** Seconds to wait for the script to exit once its output has been fully read. */
    private static final long EXIT_WAIT_SECONDS = 30;

    /** Timestamp pattern for the per-run output directory name. */
    private static final DateTimeFormatter FORMATTER = DateTimeFormatter.ofPattern("yyyy_MM_dd_HH_mm_ss");

    /** Timestamp pattern for console log lines. */
    private static final DateTimeFormatter FORMATTER2 = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    static {
        try {
            String str = FileUtil.readDataFromFile(configPath);
            configVo = JSON.parseObject(str, ConfigVo.class);
        } catch (Exception e) {
            // A broken config must not make the whole class unusable
            // (ExceptionInInitializerError); ocrHandle reports it and returns no rows.
            configVo = null;
            System.err.println("读取OCR配置失败: " + configPath);
            e.printStackTrace();
        }
    }

    /**
     * Runs the OCR script on the given file and returns the recognized rows.
     * <p>
     * The script is started as {@code <python> <script> --input <filePath> --output <dir>},
     * where {@code <dir>} is a timestamped sub-directory of the configured output
     * directory. When the script exits with code 0, the file
     * {@code <dir>/<baseName>_result.json} is parsed via {@link #ocrHandleResult(String)}.
     *
     * @param filePath path of the file to recognize
     * @return the recognized rows; an empty, mutable list on any failure (configuration
     *         not loaded, script error, non-zero exit code, timeout, interruption)
     */
    public static List<UserVo> ocrHandle(String filePath) {
        ConfigVo config = configVo;
        if (config == null) {
            System.err.println("OCR配置未加载,无法识别: " + configPath);
            return new ArrayList<>();
        }
        List<UserVo> dataList = null;
        Process p = null;
        try {
            String baseName = getBaseName(filePath);
            System.err.println("开始识别--" + baseName + "文件," + "开始时间" + LocalDateTime.now().format(FORMATTER2));
            long startTime = System.currentTimeMillis();

            String python = config.getPython();
            String pythonScript = new File(config.getPythonScript()).getAbsolutePath();
            String outputDir = config.getOutput() + File.separator + getCurrentDateTimeFormatted();
            createDirectories(outputDir);

            ProcessBuilder pb = new ProcessBuilder(
                    python,
                    pythonScript,
                    "--input", filePath,
                    "--output", outputDir
            );
            // One merged stream, so an unread stderr pipe can never block the script.
            pb.redirectErrorStream(true);
            // Make the script emit UTF-8 so it matches the decoder used below.
            pb.environment().put("PYTHONIOENCODING", "UTF-8");
            pb.directory(new File(config.getDirectory()));
            p = pb.start();

            // Echo the script output; the reader is closed even if reading fails.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(p.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                }
            }

            // NOTE(review): the loop above only ends when the script closes its output,
            // so this wait limits only the time between end-of-output and process exit,
            // not the total run time. Kept as is so long OCR runs are not cut short.
            boolean finished = p.waitFor(EXIT_WAIT_SECONDS, TimeUnit.SECONDS);
            if (!finished) {
                p.destroyForcibly();
                System.err.println("Python脚本执行超时");
            } else {
                int exitCode = p.exitValue();
                System.out.println("Python脚本执行完毕退出码: " + exitCode);
                if (exitCode == 0) {
                    String resultPath = outputDir + File.separator + baseName + "_result.json";
                    dataList = ocrHandleResult(resultPath);
                }
                long endTime = System.currentTimeMillis();
                // This line previously printed the end timestamp under a "start time" label.
                System.err.println("结束识别--" + baseName + "文件," + "结束时间" + LocalDateTime.now().format(FORMATTER2));
                System.err.println("执行耗时: " + (endTime - startTime) + " 毫秒");
            }
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            System.err.println("OCR识别被中断: " + filePath);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Do not leave the script running after an error or interruption.
            if (p != null && p.isAlive()) {
                p.destroyForcibly();
            }
        }
        System.err.println("识别结果:" + dataList);
        return dataList != null ? dataList : new ArrayList<>();
    }

    /**
     * Returns the current local date-time formatted as {@code yyyy_MM_dd_HH_mm_ss}.
     *
     * @return the formatted timestamp
     */
    public static String getCurrentDateTimeFormatted() {
        return LocalDateTime.now().format(FORMATTER);
    }

    /**
     * Parses an OCR result JSON file into rows.
     * <p>
     * Each {@code name_money} entry is split on "-": the first part becomes the name and
     * the second part the wage (see {@link #convertToDouble(String)}). Entries containing
     * "未知姓名", null entries, and entries without a second part are skipped.
     *
     * @param filePath path of the result JSON file
     * @return the parsed rows; empty if the file is missing, blank, lacks
     *         {@code name_money}, or cannot be parsed
     */
    public static List<UserVo> ocrHandleResult(String filePath) {
        List<UserVo> dataList = new ArrayList<>();
        if (!new File(filePath).exists()) {
            return dataList;
        }
        try {
            String str = FileUtil.readDataFromFile(filePath);
            if (StringUtils.isBlank(str)) {
                return dataList;
            }
            OcrHandleVo ocrHandleVo = JSON.parseObject(str, OcrHandleVo.class);
            System.err.println(ocrHandleVo);
            String[] nameMoney = (ocrHandleVo == null) ? null : ocrHandleVo.getName_money();
            if (nameMoney == null) {
                return dataList;
            }
            for (String value : nameMoney) {
                if (value == null || value.contains("未知姓名")) {
                    continue;
                }
                String[] split = value.split("-");
                if (split.length < 2) {
                    // One malformed entry used to abort the loop and drop every later row.
                    System.err.println("忽略无法解析的识别项: " + value);
                    continue;
                }
                UserVo userVo = new UserVo();
                userVo.setName(split[0]);
                userVo.setWage(convertToDouble(split[1]));
                dataList.add(userVo);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return dataList;
    }

    /**
     * Returns the file name of a path without its last extension.
     *
     * @param filePath path whose last element is inspected
     * @return the name up to (not including) the last '.', the whole name when it has no
     *         '.', or an empty string when the path has no file-name element (a root)
     */
    public static String getBaseName(String filePath) {
        Path fileNamePath = Paths.get(filePath).getFileName();
        if (fileNamePath == null) {
            return "";
        }
        String fileName = fileNamePath.toString();
        int dotIndex = fileName.lastIndexOf('.');
        return (dotIndex == -1) ? fileName : fileName.substring(0, dotIndex);
    }

    /**
     * Creates the directory and any missing parents. Failures are reported on stderr
     * and not propagated.
     *
     * @param multiLevelPath directory path to create
     */
    public static void createDirectories(String multiLevelPath) {
        try {
            Files.createDirectories(Paths.get(multiLevelPath));
        } catch (Exception e) {
            System.err.println("创建失败: " + e.getMessage());
        }
    }

    /**
     * Converts text to a double after removing every whitespace character.
     *
     * @param input text to convert; may be {@code null}
     * @return the parsed value, or {@code 0.0} when input is {@code null} or not a number
     */
    public static double convertToDouble(String input) {
        if (input == null) {
            return 0.0;
        }
        // Strip spaces, tabs, line breaks and other whitespace before parsing.
        String trimmed = input.replaceAll("\\s+", "");
        try {
            return Double.parseDouble(trimmed);
        } catch (NumberFormatException e) {
            return 0.0;
        }
    }
}