From 43fc5b79711de3816972f2596b0bdedbc790657c Mon Sep 17 00:00:00 2001 From: yy2205 <2238220225@qq.com> Date: Tue, 27 May 2025 15:37:04 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.yaml | 150 +++++++------ src/main/java/com/ocr/ConfigManager.java | 86 ++++++-- src/main/java/com/ocr/FolderMonitor.java | 242 +++++++++++++++------ src/main/java/com/ocr/ImageOcrMonitor.java | 133 +++++++++++ 4 files changed, 452 insertions(+), 159 deletions(-) diff --git a/config.yaml b/config.yaml index 0e04a60..41ea256 100644 --- a/config.yaml +++ b/config.yaml @@ -3,10 +3,20 @@ output: all_results: "./ocr_results/all_results.json" # 所有识别结果 current_results: "./ocr_results/current_results.json" # 当前批次识别结果 processed_files: "./ocr_results/processed_files.txt" # 已处理文件列表 + missing_key_files: "./ocr_results/missing_key_files.txt" # 缺少关键字的文件路径 + missing_key_results: "./ocr_results/missing_key_results.json" # 缺少关键字的识别结果 + +# 需要检查的关键字配置 +required_keys: + - "name" # 姓名 + - "examId" # 检查编号 + - "age" # 年龄 + - "gender" # 性别 + - "hr" # 心率 # 后端接口配置 -upload_url: https://pacs.gw12320.com/adminecg/admin-api/tblist/ecganalysisparas/parsePhotoCreateData -#upload_url: http://localhost:48080/admin-api/tblist/ecganalysisparas/parsePhotoCreateData +# upload_url: https://pacs.gw12320.com/adminecg/admin-api/tblist/ecganalysisparas/parsePhotoCreateData +upload_url: http://localhost:48080/admin-api/tblist/ecganalysisparas/parsePhotoCreateData # 底部识别配置 bottom_recognition: @@ -40,49 +50,43 @@ bottom_recognition: # 图片目录配置 directories: - - path: "./test_images" - recognition_type: "normal" - key_mapping: - "编 号": "number" - "姓 名": "name" - "性 别": "gender" - "年 龄": "age" - "科 室": "department" - "床 号": "bed_number" - "HR": "HR" - "PR": "PR" - "QRS": "QRS" - "QT/QTC": "QT/QTC" - "P/QRS/T": "P/QRS/T" - "RV5/SV1": "RV5/SV1" - "RV5+SV1": "RV5+SV1" - bottom_key_words: # 特定目录的底部关键字配置 - - "检查日期" - - "日期" - bottom_key_mapping: # 特定目录的底部关键字映射 - "检查日期": "checkDate" - "日期": "date" - - path: "./ocr_images" - recognition_type: "normal" - key_mapping: - "ID": "id" - "申请科室": "department" - "病床号": "bed_number" - "HR": "HR" - "P": "P" - "QRS": "QRS" - "QT/QTc": "QT/QTc" - "P/QRS/T": "P/QRS/T" - "RV5/SV1": "RV5/SV1" + - path: "../ecgimage/北屯中心卫生院" + recognition_type: "split" + recognition_area: + start_x: 0 + start_y: 0 + width: 60 + height: 20 + split_blocks: + - width_percent: 40 + key_mapping: + "编 号": "examId" + "姓 名": "name" + "性 别": "gender" + "年 龄": "age" + "科 室": "department" + "床 号": "bed_number" + - width_percent: 35 + key_mapping: + "HR": "hr" + "PR": "pr" + "QRS": "qrs" + "QT/QTC": "qt/qtc" + "P/QRS/T": "pAxle/qrsAxle/tAxle" + "RV5/SV1": "rv5/sv1" + "RV5+SV1": "rv5Sv1" + - width_percent: 35 + key_mapping: + "备注": "notes" + bottom_key_words: - "检查日期" - - "时间" bottom_key_mapping: "检查日期": "examDate" "时间": "examTime" - - path: "./礼泉县裴寨卫生院" # 分块识别目录 - 3个分块 + - path: "../ecgimage/礼泉县裴寨卫生院" # 分块识别目录 - 3个分块 recognition_type: "split" recognition_area: # 识别区域配置 start_x: 0 # 起始X坐标(百分比) @@ -117,44 +121,35 @@ directories: bottom_key_mapping: # 特定目录的底部关键字映射 "检查时间": "collectionTime" - - path: "./special_images" # 分块识别目录 - 4个分块 - recognition_type: "templateA" + - path: "../ecgimage/药王洞卫生院" # 分块识别目录 - 4个分块 + recognition_type: "split" recognition_area: start_x: 0 start_y: 0 - width: 100 - height: 100 + width: 50 + height: 20 split_blocks: - - width_percent: 25 + - width_percent: 37 key_mapping: - "患者": "patient" - "ID": "id" - - width_percent: 25 + "编 号": "examId" + "姓 名": "name" + "性 别": "gender" + "年 龄": "age" + "科 室": "department" + "床 号": "bed_number" + - width_percent: 33 key_mapping: - "检查项目": "exam_item" - "检查日期": "exam_date" - - width_percent: 25 + "HR": "hr" + "PR": "pr" + "QRS": "qrs" + "QT/QTc": "qt/qtc" + "P/QRS/T": "pAxle/qrsAxle/tAxle" + "RV5/SV1": "rv5/sv1" + "RV5+SV1": "rv5Sv1" + - width_percent: 35 key_mapping: "结果1": "result1" - "结果2": "result2" - - width_percent: 25 - key_mapping: - "医师": "doctor" - "结论": "conclusion" - - path: "./建陵卫生院" # 需要旋转90度的图片目录 - recognition_type: "rotate90" # 新增旋转类型 - recognition_area: # 添加旋转后的识别区域 - start_x: 0 - start_y: 0 - width: 100 # 宽度百分比 - height: 25 # 高度百分比 - key_mapping: - "ID": "examId" - "姓 名": "name" - "年 龄": "age" - "性 别": "gender" - "时 间": "collectionTime" - - path: "./礼泉县城关卫生院" + - path: "../ecgimage/礼泉县城关卫生院" recognition_type: "CG" key_mapping: "姓名": "name" @@ -173,8 +168,20 @@ directories: - "检查时间" bottom_key_mapping: "检查时间": "collectionTime" - - - path: "./史德卫生院" + - path: "../ecgimage/建陵卫生院" # 需要旋转90度的图片目录 + recognition_type: "rotate90" # 新增旋转类型 + recognition_area: # 添加旋转后的识别区域 + start_x: 0 + start_y: 0 + width: 100 # 宽度百分比 + height: 25 # 高度百分比 + key_mapping: + "ID": "examId" + "姓 名": "name" + "年 龄": "age" + "性 别": "gender" + "时 间": "collectionTime" + - path: "./images/史德卫生院" recognition_type: "LQXSD" key_mapping: "姓名": "name" @@ -193,7 +200,6 @@ directories: - "检查时间" bottom_key_mapping: "检查时间": "collectionTime" - # OCR程序与语言包路径配置 # 新增Tesseract相关配置 @@ -202,7 +208,7 @@ directories: # language 必须,指定语言包 tesseract: - bin_path: "D:/Program Files/Tesseract-OCR/tesseract.exe" + bin_path: "C:/Program Files/Tesseract-OCR/tesseract.exe" data_path: "./tessdata" # data_path: "F:/陕西省咸阳市礼泉县心电图FTP/ecgimage/tessdata" language: "chi_sim+eng" diff --git a/src/main/java/com/ocr/ConfigManager.java b/src/main/java/com/ocr/ConfigManager.java index eb10149..5eee6c1 100644 --- a/src/main/java/com/ocr/ConfigManager.java +++ b/src/main/java/com/ocr/ConfigManager.java @@ -14,35 +14,75 @@ import java.util.*; public class ConfigManager { private static final Logger logger = LoggerFactory.getLogger(ConfigManager.class); private final List directoryConfigs; - private final String allResultsPath; - private final String currentResultsPath; - private final String processedFilesPath; + private String allResultsPath; + private String currentResultsPath; + private String processedFilesPath; + private String missingKeyFilesPath; + private String missingKeyResultsPath; + private List requiredKeys; private final ObjectMapper objectMapper; - private final Map config; + private Map config; + private final String configPath; public ConfigManager(String configPath) { + this.configPath = configPath; this.objectMapper = new ObjectMapper(); - this.config = loadConfig(configPath); + loadConfig(); this.directoryConfigs = loadDirectoryConfigs(config); - - @SuppressWarnings("unchecked") - Map output = (Map) config.get("output"); - this.allResultsPath = output.get("all_results"); - this.currentResultsPath = output.get("current_results"); - this.processedFilesPath = output.get("processed_files"); - - initializeOutputFiles(); } - @SuppressWarnings("unchecked") - private Map loadConfig(String configPath) { - try (InputStream input = new FileInputStream(configPath)) { + private Map loadConfig() { + try { + // 读取配置文件 Yaml yaml = new Yaml(); - return yaml.load(input); + try (InputStream input = new FileInputStream(configPath)) { + config = yaml.load(input); + } + + // 设置输出文件路径 + Map output = (Map) config.get("output"); + this.allResultsPath = output.get("all_results"); + this.currentResultsPath = output.get("current_results"); + this.processedFilesPath = output.get("processed_files"); + this.missingKeyFilesPath = output.get("missing_key_files"); + this.missingKeyResultsPath = output.get("missing_key_results"); + + // 设置必需关键字 + this.requiredKeys = (List) config.get("required_keys"); + + // 确保输出目录存在 + createOutputDirectories(); + } catch (Exception e) { logger.error("加载配置文件失败", e); throw new RuntimeException("加载配置文件失败", e); } + return config; + } + + private void createOutputDirectories() { + try { + // 创建输出目录 + Files.createDirectories(Paths.get(allResultsPath).getParent()); + Files.createDirectories(Paths.get(currentResultsPath).getParent()); + Files.createDirectories(Paths.get(processedFilesPath).getParent()); + Files.createDirectories(Paths.get(missingKeyFilesPath).getParent()); + Files.createDirectories(Paths.get(missingKeyResultsPath).getParent()); + + // 如果文件不存在,创建空文件 + if (!Files.exists(Paths.get(processedFilesPath))) { + Files.createFile(Paths.get(processedFilesPath)); + } + if (!Files.exists(Paths.get(missingKeyFilesPath))) { + Files.createFile(Paths.get(missingKeyFilesPath)); + } + if (!Files.exists(Paths.get(missingKeyResultsPath))) { + Files.createFile(Paths.get(missingKeyResultsPath)); + } + } catch (IOException e) { + logger.error("创建输出目录失败", e); + throw new RuntimeException("创建输出目录失败", e); + } } @SuppressWarnings("unchecked") @@ -232,6 +272,18 @@ public class ConfigManager { return processedFilesPath; } + public String getMissingKeyFilesPath() { + return missingKeyFilesPath; + } + + public String getMissingKeyResultsPath() { + return missingKeyResultsPath; + } + + public List getRequiredKeys() { + return requiredKeys; + } + public ObjectMapper getObjectMapper() { return objectMapper; } diff --git a/src/main/java/com/ocr/FolderMonitor.java b/src/main/java/com/ocr/FolderMonitor.java index 77fd780..5098f13 100644 --- a/src/main/java/com/ocr/FolderMonitor.java +++ b/src/main/java/com/ocr/FolderMonitor.java @@ -64,14 +64,14 @@ public class FolderMonitor { this.httpClient = HttpClients.createDefault(); } - public void processImage(Path imagePath) { + public Map processImage(Path imagePath) { try { logger.info("开始处理图片: {}", imagePath); // 检查文件是否已处理 if (isFileProcessed(imagePath)) { logger.info("文件已处理过,跳过: {}", imagePath); - return; + return new HashMap<>(); } // 获取图片的完整路径 @@ -105,80 +105,41 @@ public class FolderMonitor { } } catch (Exception e) { logger.error("识别逻辑处理异常", e); - return; + return new HashMap<>(); } - // 添加orgName(父文件夹名称)和ecgDataFilePath(图片名称) - File imageFile = imagePath.toFile(); - String fileName = imageFile.getName(); - String parentFolderName = imageFile.getParentFile().getName(); - - extractedData.put("orgName", parentFolderName); - extractedData.put("ecgDataFilePath", fileName); - - logger.info("添加文件信息 - orgName: {}, ecgDataFilePath: {}", parentFolderName, fileName); - - // 处理已提取的时间格式 - processTimeFields(extractedData); - - // 检查是否需要进行底部识别 - Map config = configManager.getConfig(); - if (config.containsKey("bottom_recognition")) { - @SuppressWarnings("unchecked") - Map bottomConfig = (Map) config.get("bottom_recognition"); - boolean enableBottomRecognition = (boolean) bottomConfig.getOrDefault("enable", false); + // 检查是否缺少必需的关键字 + List requiredKeys = configManager.getRequiredKeys(); + if (requiredKeys != null && !requiredKeys.isEmpty()) { + List missingKeys = new ArrayList<>(); + for (String key : requiredKeys) { + if (!extractedData.containsKey(key) || extractedData.get(key) == null || extractedData.get(key).trim().isEmpty()) { + missingKeys.add(key); + } + } - if (enableBottomRecognition) { - // 查找目录特定的底部关键字配置 - List keyWords; - Map keyMapping = null; + if (!missingKeys.isEmpty()) { + logger.warn("图片缺少必需的关键字: {}, 文件路径: {}", missingKeys, imageFullPath); - // 获取全局关键字映射 - @SuppressWarnings("unchecked") - Map globalKeyMapping = bottomConfig.containsKey("key_mapping") ? - (Map) bottomConfig.get("key_mapping") : Collections.emptyMap(); + // 记录缺少关键字的文件路径 + String missingKeyFilesPath = configManager.getMissingKeyFilesPath(); + Files.write(Paths.get(missingKeyFilesPath), + (imageFullPath + "\n").getBytes(StandardCharsets.UTF_8), + java.nio.file.StandardOpenOption.APPEND); - // 获取当前目录的特定配置 - if (directoryConfig.getBottomKeyWords() != null && !directoryConfig.getBottomKeyWords().isEmpty()) { - keyWords = directoryConfig.getBottomKeyWords(); - logger.info("使用目录特定的底部关键字: {}", keyWords); - - // 获取目录特定的关键字映射 - keyMapping = directoryConfig.getBottomKeyMapping(); - if (keyMapping != null && !keyMapping.isEmpty()) { - logger.info("使用目录特定的底部关键字映射: {}", keyMapping); - } else if (!globalKeyMapping.isEmpty()) { - // 如果目录没有特定映射但有全局映射,使用全局的 - keyMapping = globalKeyMapping; - logger.info("使用全局底部关键字映射: {}", keyMapping); - } - } else { - // 使用全局配置 - @SuppressWarnings("unchecked") - List globalKeyWords = (List) bottomConfig.getOrDefault("key_words", Collections.emptyList()); - keyWords = globalKeyWords; - logger.info("使用全局底部关键字: {}", keyWords); - - // 使用全局关键字映射 - keyMapping = globalKeyMapping; - if (!keyMapping.isEmpty()) { - logger.info("使用全局底部关键字映射: {}", keyMapping); - } - } + // 记录缺少关键字的识别结果 + Map missingKeyResult = new HashMap<>(); + missingKeyResult.put("file_path", imageFullPath); + missingKeyResult.put("process_time", LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)); + missingKeyResult.put("missing_keys", missingKeys); + missingKeyResult.put("extracted_data", extractedData); - // 创建带有目录特定关键字和映射的配置 - Map dirBottomConfig = new HashMap<>(bottomConfig); - dirBottomConfig.put("key_words", keyWords); - if (keyMapping != null && !keyMapping.isEmpty()) { - dirBottomConfig.put("key_mapping", keyMapping); - } - - // 调用通用底部识别方法 - Map bottomData = recognizeBottomArea(imageFullPath, dirBottomConfig); - if (!bottomData.isEmpty()) { - logger.info("添加底部识别结果: {}", bottomData); - extractedData.putAll(bottomData); - } + String missingKeyResultsPath = configManager.getMissingKeyResultsPath(); + ObjectMapper mapper = new ObjectMapper(); + String jsonResult = mapper.writeValueAsString(missingKeyResult) + "\n"; + Files.write(Paths.get(missingKeyResultsPath), + jsonResult.getBytes(StandardCharsets.UTF_8), + java.nio.file.StandardOpenOption.APPEND); } } @@ -197,8 +158,11 @@ public class FolderMonitor { // 标记文件为已处理 markFileAsProcessed(imagePath); + return extractedData; + } catch (Exception e) { logger.error("处理图片失败: " + imagePath, e); + return new HashMap<>(); } } @@ -710,6 +674,144 @@ public class FolderMonitor { // 这里只是示例,实际可根据模板A的需求实现 return processImageNormal(imageFullPath); } + // 新版CG识别逻辑:按区域裁剪后分别OCR提取字段 +/* + private Map processImageWithCG(String imageFullPath) { + Map extractedData = new HashMap<>(); + try { + BufferedImage image = ImageIO.read(new File(imageFullPath)); + int width = image.getWidth(); + int height = image.getHeight(); + + // 1. 标题区和患者信息区分开裁剪 + int titleHeight = (int) (height * 0.01); // 标题区5% + int infoHeight = (int) (height * 0.10); // 患者信息区10% + // 跳过标题区,只识别患者信息区 + BufferedImage infoArea = image.getSubimage(0, titleHeight, width, infoHeight); + String infoOcr = tesseract.doOCR(infoArea); + String[] infoLines = infoOcr.split("\\r?\\n"); + String namePattern = "^([\\u4e00-\\u9fa5 ]+)\\s+(女|男)"; + String agePattern = "(\\d+)\\s*[岁%]"; + String idPattern = "[I1l][D][::]?\\s*([A-Za-z0-9]+)"; + for (String line : infoLines) { + line = line.replaceAll("\\s+", " ").trim(); + if (extractedData.get("name") == null) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile(namePattern).matcher(line); + if (m.find()) { + extractedData.put("name", m.group(1).replaceAll(" ", "")); // 去除姓名中的空格 + extractedData.put("gender", m.group(2)); + java.util.regex.Matcher ageM = java.util.regex.Pattern.compile(agePattern).matcher(line); + if (ageM.find()) { + extractedData.put("age", ageM.group(1)); + } + } + } + if (extractedData.get("id") == null) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile(idPattern).matcher(line); + if (m.find()) { + extractedData.put("id", m.group(1)); + } + } + } + + // 2. 左侧参数区(提取HR、P、PR、QRS、QT/QTC、P/QRS/T、RV5/SV1) + int paramWidth = (int) (width * 0.32); // 左侧32% + int paramStartY = titleHeight; + int paramHeight = (int) (height * 0.355); // 参数区高度约38% + BufferedImage paramArea = image.getSubimage(0, paramStartY, paramWidth, paramHeight); + String paramOcr = tesseract.doOCR(paramArea); + String[] paramLines = paramOcr.split("\\r?\\n"); + String hrPattern = "HR\\s*[::.·]?\\s*([\\dOo./]+[bB][pP][mM])"; + String pPattern = "P\\s*[::.·]?\\s*([\\dOo./]+[mM][sS])"; + String prPattern = "PR\\s*[::.·]?\\s*([\\dOo./]+[mM][sS])"; + String qrsPattern = "QRS\\s*[::.·]?\\s*([\\dOo./]+[mM][sS])"; + String qtPattern = "QT/QT[cC]?\\s*[::.·]?\\s*([\\dOo./]+[mM][sS])"; + String pqrstPattern = "P/QRS/T\\s*[::.·]?\\s*([\\dOo./]+)"; + String rv5sv1Pattern = "([_\\s]*[Rr][Vv5][Ss][\\s/]*[Ss][Vv1Ii])\\s*[::.·]?\\s*([\\dOo./]+)\\s*[mM][vVyY]?"; + for (String line : paramLines) { + line = line.replaceAll("\\s+", " ").trim(); + if (extractedData.get("HR") == null) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile(hrPattern, java.util.regex.Pattern.CASE_INSENSITIVE).matcher(line); + if (m.find()) { + String hrValue = m.group(1).replaceAll("[Oo]", "0").replaceAll("[^\\d.]", ""); + extractedData.put("HR", hrValue); + } + } + if (extractedData.get("P") == null) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile(pPattern, java.util.regex.Pattern.CASE_INSENSITIVE).matcher(line); + if (m.find()) { + String pValue = m.group(1).replaceAll("[Oo]", "0").replaceAll("[^\\d.]", ""); + extractedData.put("P", pValue); + } + } + if (extractedData.get("PR") == null) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile(prPattern, java.util.regex.Pattern.CASE_INSENSITIVE).matcher(line); + if (m.find()) { + String prValue = m.group(1).replaceAll("[Oo]", "0").replaceAll("[^\\d.]", ""); + extractedData.put("PR", prValue); + } + } + if (extractedData.get("QRS") == null) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile(qrsPattern, java.util.regex.Pattern.CASE_INSENSITIVE).matcher(line); + if (m.find()) { + String qrsValue = m.group(1).replaceAll("[Oo]", "0").replaceAll("[^\\d.]", ""); + extractedData.put("QRS", qrsValue); + } + } + if (extractedData.get("QT/QTC") == null) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile(qtPattern, java.util.regex.Pattern.CASE_INSENSITIVE).matcher(line); + if (m.find()) { + String qtValue = m.group(1).replaceAll("[Oo]", "0").replaceAll("[^\\d./]", ""); + extractedData.put("QT/QTC", qtValue); + } + } + if (extractedData.get("P/QRS/T") == null) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile(pqrstPattern, java.util.regex.Pattern.CASE_INSENSITIVE).matcher(line); + if (m.find()) { + String pqrstValue = m.group(1).replaceAll("[Oo]", "0").replaceAll("[^\\d/degDEG.]", ""); + extractedData.put("P/QRS/T", pqrstValue); + } + } + if (extractedData.get("RV5/SV1") == null) { + if (line.toLowerCase().contains("sv1")) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile("([\\dOo.]+)/([\\dOo.]+)\\s*[mM][vVyY]").matcher(line); + if (m.find()) { + String rv5 = m.group(1).replaceAll("[Oo]", "0"); + String sv1 = m.group(2).replaceAll("[Oo]", "0"); + String rv5sv1Value = rv5 + "/" + sv1; + extractedData.put("RV5/SV1", rv5sv1Value); + + } + } + } + } + + // 3. 底部区域(提取检查时间) + int bottomHeight = (int) (height * 0.05); // 底部12% + int bottomStartY = height - bottomHeight; + BufferedImage bottomArea = image.getSubimage(0, bottomStartY, width, bottomHeight); + String bottomOcr = tesseract.doOCR(bottomArea); + String[] bottomLines = bottomOcr.split("\\r?\\n"); + String checkTimePattern = "检查[::]?\\s*([\\d-]+ [\\d:]+)"; + for (String line : bottomLines) { + String lineNoSpace = line.replaceAll("\\s+", ""); + if (extractedData.get("collectionTime") == null) { + java.util.regex.Matcher m = java.util.regex.Pattern.compile("检查[::]?(\\d{4}-\\d{2}-\\d{2}\\s*\\d{2}:\\d{2}:\\d{2})").matcher(lineNoSpace); + if (m.find()) { + String dateTime = m.group(1); + if (!dateTime.contains(" ")) { + dateTime = dateTime.substring(0, 10) + " " + dateTime.substring(10); + } + extractedData.put("collectionTime", dateTime); + } + } + } + } catch (Exception e) { + logger.error("processImageWithCG 区域识别异常", e); + } + return extractedData; + } +*/ // 新版CG识别逻辑:按区域裁剪后分别OCR提取字段 private Map processImageWithCG(String imageFullPath) { diff --git a/src/main/java/com/ocr/ImageOcrMonitor.java b/src/main/java/com/ocr/ImageOcrMonitor.java index e89b591..7147351 100644 --- a/src/main/java/com/ocr/ImageOcrMonitor.java +++ b/src/main/java/com/ocr/ImageOcrMonitor.java @@ -15,15 +15,19 @@ import java.io.*; import java.nio.file.*; import java.util.*; import java.util.concurrent.*; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; public class ImageOcrMonitor { private static final Logger logger = LoggerFactory.getLogger(ImageOcrMonitor.class); private static final long CHECK_INTERVAL = 120000; // 2分钟 + private static final long RETRY_INTERVAL = 600000; // 10分钟 private final ConfigManager configManager; private final Map folderMonitors; private final Map watchServices; private Tesseract tesseract; private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); + private int currentRetryIndex = 0; // 当前重试的文件索引 public ImageOcrMonitor(String configPath) { this.configManager = new ConfigManager(configPath); @@ -31,6 +35,7 @@ public class ImageOcrMonitor { this.watchServices = new HashMap<>(); initializeMonitors(); initTesseract(); + startRetryTask(); } private void initializeMonitors() { @@ -128,6 +133,134 @@ public class ImageOcrMonitor { } } + private void startRetryTask() { + scheduler.scheduleAtFixedRate(() -> { + try { + logger.info("开始扫描识别失败的文件..."); + retryFailedRecognition(); + } catch (Exception e) { + logger.error("重试识别失败", e); + } + }, RETRY_INTERVAL, RETRY_INTERVAL, TimeUnit.MILLISECONDS); + } + + private void retryFailedRecognition() { + try { + // 读取missing_key_files.txt + Path missingKeyFilesPath = Paths.get(configManager.getMissingKeyFilesPath()); + if (!Files.exists(missingKeyFilesPath)) { + logger.info("没有需要重试的文件"); + return; + } + + List failedFiles = Files.readAllLines(missingKeyFilesPath); + if (failedFiles.isEmpty()) { + logger.info("没有需要重试的文件"); + currentRetryIndex = 0; // 重置索引 + return; + } + + // 如果索引超出范围,重置为0 + if (currentRetryIndex >= failedFiles.size()) { + currentRetryIndex = 0; + } + + // 获取当前要处理的文件 + String currentFile = failedFiles.get(currentRetryIndex); + logger.info("开始处理第 {} 个失败文件: {}", currentRetryIndex + 1, currentFile); + + // 读取现有的missing_key_results.json + Path missingKeyResultsPath = Paths.get(configManager.getMissingKeyResultsPath()); + List> existingResults = new ArrayList<>(); + if (Files.exists(missingKeyResultsPath)) { + String content = new String(Files.readAllBytes(missingKeyResultsPath)); + if (!content.trim().isEmpty()) { + existingResults = configManager.getObjectMapper().readValue(content, List.class); + } + } + + // 处理当前文件 + Path path = Paths.get(currentFile); + if (!Files.exists(path)) { + logger.warn("文件不存在,跳过: {}", currentFile); + currentRetryIndex++; // 移动到下一个文件 + return; + } + + // 找到对应的FolderMonitor + FolderMonitor monitor = findMonitorForFile(path); + if (monitor == null) { + logger.warn("找不到对应的FolderMonitor,跳过: {}", currentFile); + currentRetryIndex++; // 移动到下一个文件 + return; + } + + // 重新识别 + Map extractedData = monitor.processImage(path); + + // 检查是否还有缺失的关键字 + List missingKeys = new ArrayList<>(); + for (String requiredKey : configManager.getRequiredKeys()) { + if (!extractedData.containsKey(requiredKey) || + extractedData.get(requiredKey) == null || + extractedData.get(requiredKey).trim().isEmpty()) { + missingKeys.add(requiredKey); + } + } + + if (missingKeys.isEmpty()) { + // 识别成功,从结果中移除 + logger.info("文件识别成功,移除失败记录: {}", currentFile); + failedFiles.remove(currentRetryIndex); + // 更新missing_key_files.txt + Files.write(missingKeyFilesPath, failedFiles); + // 不需要更新索引,因为列表已经缩短 + } else { + // 仍然失败,更新结果 + logger.info("文件仍然识别失败,更新结果: {}", currentFile); + // 更新或添加结果 + boolean found = false; + for (Map result : existingResults) { + if (currentFile.equals(result.get("file_path"))) { + result.put("process_time", LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)); + result.put("missing_keys", missingKeys); + result.put("extracted_data", extractedData); + found = true; + break; + } + } + if (!found) { + Map result = new HashMap<>(); + result.put("file_path", currentFile); + result.put("process_time", LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME)); + result.put("missing_keys", missingKeys); + result.put("extracted_data", extractedData); + existingResults.add(result); + } + // 更新missing_key_results.json + String jsonContent = configManager.getObjectMapper().writerWithDefaultPrettyPrinter() + .writeValueAsString(existingResults); + Files.write(missingKeyResultsPath, jsonContent.getBytes()); + currentRetryIndex++; // 移动到下一个文件 + } + + logger.info("本次重试完成,当前处理进度: {}/{}", currentRetryIndex + 1, failedFiles.size()); + } catch (Exception e) { + logger.error("重试识别过程发生错误", e); + currentRetryIndex++; // 发生错误时也移动到下一个文件 + } + } + + private FolderMonitor findMonitorForFile(Path filePath) { + String absolutePath = filePath.toAbsolutePath().toString(); + for (Map.Entry entry : folderMonitors.entrySet()) { + if (absolutePath.startsWith(entry.getKey())) { + return entry.getValue(); + } + } + return null; + } + public static void main(String[] args) { try { logger.info("OCR监控程序启动...");