建陵卫生院

This commit is contained in:
Flow 2025-05-15 09:55:57 +08:00
parent 937e90c621
commit dbac0cc964
4 changed files with 585 additions and 42 deletions

View File

@ -5,7 +5,7 @@ output:
processed_files: "./ocr_results/processed_files.txt" # 已处理文件列表
# 后端接口配置
# upload_url: https://pacs.gw12320.com/adminecg/admin-api/tblist/ecganalysisparas/parsePhotoCreateData
#upload_url: https://pacs.gw12320.com/adminecg/admin-api/tblist/ecganalysisparas/parsePhotoCreateData
upload_url: http://localhost:48080/admin-api/tblist/ecganalysisparas/parsePhotoCreateData
# 底部识别配置
@ -141,6 +141,20 @@ directories:
key_mapping:
"医师": "doctor"
"结论": "conclusion"
- path: "./建陵卫生院" # 需要旋转90度的图片目录
recognition_type: "rotate90" # 新增旋转类型
recognition_area: # 添加旋转后的识别区域
start_x: 0
start_y: 0
width: 100 # 宽度百分比
height: 25 # 高度百分比
key_mapping:
"ID": "examId"
"姓 名": "name"
"年 龄": "age"
"性 别": "gender"
"时 间": "collectionTime"
# OCR程序与语言包路径配置

View File

@ -71,7 +71,6 @@ public class ConfigManager {
// 检查是否是分块模式
if (dir.containsKey("recognition_area")) {
Map<String, Integer> recognitionArea = (Map<String, Integer>) dir.get("recognition_area");
List<Map<String, Object>> splitBlocks = (List<Map<String, Object>>) dir.get("split_blocks");
// 创建识别区域对象
DirectoryConfig.RecognitionArea area = new DirectoryConfig.RecognitionArea(
@ -81,48 +80,88 @@ public class ConfigManager {
recognitionArea.get("height")
);
// 创建切割块列表
List<DirectoryConfig.SplitBlock> blocks = new ArrayList<>();
int totalWidth = 0;
// 计算总宽度确保所有块的宽度总和为100%
for (Map<String, Object> block : splitBlocks) {
totalWidth += (Integer) block.get("width_percent");
}
if (totalWidth != 100) {
logger.warn("目录 {} 的分块宽度总和为 {}%不等于100%,将进行自动调整", path, totalWidth);
}
// 创建分块配置
totalWidth = 0;
for (int i = 0; i < splitBlocks.size(); i++) {
Map<String, Object> block = splitBlocks.get(i);
int widthPercent = (Integer) block.get("width_percent");
Map<String, String> keyMapping = (Map<String, String>) block.get("key_mapping");
// 如果是最后一个块且总宽度不足100%调整宽度
if (i == splitBlocks.size() - 1 && totalWidth + widthPercent != 100) {
widthPercent = 100 - totalWidth;
logger.info("调整最后一个分块的宽度为 {}%确保总宽度为100%", widthPercent);
// 根据识别类型处理
if ("rotate90".equals(recognitionType)) {
// 旋转类型不需要分块
Map<String, String> keyMapping = (Map<String, String>) dir.get("key_mapping");
DirectoryConfig dirConfig;
if (bottomKeyWords != null && bottomKeyMapping != null) {
dirConfig = new DirectoryConfig(path, keyMapping, bottomKeyWords, bottomKeyMapping);
} else if (bottomKeyWords != null) {
dirConfig = new DirectoryConfig(path, keyMapping, bottomKeyWords);
} else {
dirConfig = new DirectoryConfig(path, keyMapping);
}
dirConfig.setRecognitionType(recognitionType);
// 设置识别区域
dirConfig.setRecognitionArea(area);
configs.add(dirConfig);
logger.info("加载旋转识别目录配置: {}", path);
} else {
// 处理分块模式
List<Map<String, Object>> splitBlocks = (List<Map<String, Object>>) dir.get("split_blocks");
if (splitBlocks == null) {
logger.warn("目录 {} 配置了recognition_area但没有split_blocks将使用整个区域处理", path);
// 如果没有分块配置仍然使用普通模式
Map<String, String> keyMapping = (Map<String, String>) dir.get("key_mapping");
DirectoryConfig dirConfig;
if (bottomKeyWords != null && bottomKeyMapping != null) {
dirConfig = new DirectoryConfig(path, keyMapping, bottomKeyWords, bottomKeyMapping);
} else if (bottomKeyWords != null) {
dirConfig = new DirectoryConfig(path, keyMapping, bottomKeyWords);
} else {
dirConfig = new DirectoryConfig(path, keyMapping);
}
dirConfig.setRecognitionType(recognitionType);
dirConfig.setRecognitionArea(area);
configs.add(dirConfig);
logger.info("加载区域识别目录配置: {}", path);
continue;
}
blocks.add(new DirectoryConfig.SplitBlock(widthPercent, keyMapping));
totalWidth += widthPercent;
// 创建切割块列表
List<DirectoryConfig.SplitBlock> blocks = new ArrayList<>();
int totalWidth = 0;
// 计算总宽度确保所有块的宽度总和为100%
for (Map<String, Object> block : splitBlocks) {
totalWidth += (Integer) block.get("width_percent");
}
if (totalWidth != 100) {
logger.warn("目录 {} 的分块宽度总和为 {}%不等于100%,将进行自动调整", path, totalWidth);
}
// 创建分块配置
totalWidth = 0;
for (int i = 0; i < splitBlocks.size(); i++) {
Map<String, Object> block = splitBlocks.get(i);
int widthPercent = (Integer) block.get("width_percent");
Map<String, String> keyMapping = (Map<String, String>) block.get("key_mapping");
// 如果是最后一个块且总宽度不足100%调整宽度
if (i == splitBlocks.size() - 1 && totalWidth + widthPercent != 100) {
widthPercent = 100 - totalWidth;
logger.info("调整最后一个分块的宽度为 {}%确保总宽度为100%", widthPercent);
}
blocks.add(new DirectoryConfig.SplitBlock(widthPercent, keyMapping));
totalWidth += widthPercent;
}
// 创建目录配置带有或不带有底部关键字和映射
DirectoryConfig dirConfig;
if (bottomKeyWords != null && bottomKeyMapping != null) {
dirConfig = new DirectoryConfig(path, area, blocks, bottomKeyWords, bottomKeyMapping);
} else if (bottomKeyWords != null) {
dirConfig = new DirectoryConfig(path, area, blocks, bottomKeyWords);
} else {
dirConfig = new DirectoryConfig(path, area, blocks);
}
dirConfig.setRecognitionType(recognitionType);
configs.add(dirConfig);
logger.info("加载分块目录配置: {}, 分块数: {}", path, blocks.size());
}
// 创建目录配置带有或不带有底部关键字和映射
DirectoryConfig dirConfig;
if (bottomKeyWords != null && bottomKeyMapping != null) {
dirConfig = new DirectoryConfig(path, area, blocks, bottomKeyWords, bottomKeyMapping);
} else if (bottomKeyWords != null) {
dirConfig = new DirectoryConfig(path, area, blocks, bottomKeyWords);
} else {
dirConfig = new DirectoryConfig(path, area, blocks);
}
dirConfig.setRecognitionType(recognitionType);
configs.add(dirConfig);
logger.info("加载分块目录配置: {}, 分块数: {}", path, blocks.size());
} else {
Map<String, String> keyMapping = (Map<String, String>) dir.get("key_mapping");

View File

@ -105,6 +105,10 @@ public class DirectoryConfig {
return recognitionArea;
}
/**
 * Replaces the recognition area held by this directory configuration.
 *
 * @param recognitionArea the area to store; it is assigned as-is, without validation or copying
 */
public void setRecognitionArea(RecognitionArea recognitionArea) {
this.recognitionArea = recognitionArea;
}
/**
 * Returns the split blocks held by this directory configuration.
 *
 * @return the stored list itself (no defensive copy is made here)
 */
public List<SplitBlock> getSplitBlocks() {
return splitBlocks;
}

View File

@ -17,6 +17,8 @@ import org.apache.http.util.EntityUtils;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.awt.geom.AffineTransform;
import java.awt.Graphics2D;
import java.io.File;
import java.io.IOException;
import java.net.URI;
@ -87,6 +89,9 @@ public class FolderMonitor {
case "templateA":
extractedData = processImageWithTemplateA(imageFullPath);
break;
case "rotate90":
extractedData = processImageWithRotate90(imageFullPath);
break;
case "normal":
default:
extractedData = processImageNormal(imageFullPath);
@ -107,6 +112,9 @@ public class FolderMonitor {
logger.info("添加文件信息 - orgName: {}, ecgDataFilePath: {}", parentFolderName, fileName);
// 处理已提取的时间格式
processTimeFields(extractedData);
// 检查是否需要进行底部识别
Map<String, Object> config = configManager.getConfig();
if (config.containsKey("bottom_recognition")) {
@ -697,6 +705,484 @@ public class FolderMonitor {
return processImageNormal(imageFullPath);
}
/**
 * Rotates the image by a quarter turn in memory and runs OCR on the rotated copy.
 *
 * <p>The transform applied is {@code translate(height, 0)} followed by
 * {@code rotate(Math.PI / 2)}, which turns the picture 90 degrees clockwise as displayed
 * (image coordinates have the y axis pointing down). The file on disk is never modified.
 *
 * <p>NOTE(review): earlier documentation of this method described the turn as
 * counter-clockwise; the transform itself is unchanged here, confirm which direction the
 * scanned sheets actually need.
 *
 * @param imageFullPath full path of the image to recognise
 * @return values extracted from the OCR text, keyed by the mapped field names
 * @throws Exception if the file cannot be read or decoded, or if the OCR engine fails
 */
private Map<String, String> processImageWithRotate90(String imageFullPath) throws Exception {
    logger.info("使用旋转90度识别逻辑处理: {}", imageFullPath);

    BufferedImage originalImage = ImageIO.read(new File(imageFullPath));
    if (originalImage == null) {
        // ImageIO.read returns null when no registered reader can decode the file.
        throw new IOException("Unsupported or unreadable image file: " + imageFullPath);
    }

    int width = originalImage.getWidth();
    int height = originalImage.getHeight();

    // getType() may report TYPE_CUSTOM (0), which the BufferedImage constructor rejects.
    int imageType = originalImage.getType();
    if (imageType == BufferedImage.TYPE_CUSTOM) {
        imageType = BufferedImage.TYPE_INT_RGB;
    }

    // Width and height are swapped because the picture is turned by a quarter turn.
    BufferedImage rotatedImage = new BufferedImage(height, width, imageType);
    Graphics2D g2d = rotatedImage.createGraphics();
    try {
        g2d.translate(height, 0);
        g2d.rotate(Math.PI / 2);
        g2d.drawImage(originalImage, 0, 0, null);
    } finally {
        g2d.dispose();
    }

    // Recognise the rotated copy; passing the file path would OCR the unrotated picture.
    String result = tesseract.doOCR(rotatedImage);
    logger.info("旋转90度后OCR识别结果: {}", result);

    return processOcrResult(result, directoryConfig.getKeyMapping());
}
/**
 * Turns raw OCR text into a map of extracted fields, treating the first three lines
 * positionally and every later line generically.
 *
 * <p>Line 1 is scanned only for time-like fields, line 2 only for ID-like fields, and
 * line 3 for name, gender and age. Lines 4 and later are scanned with the full mapping.
 * Which mapping entries count as "time", "ID", "name", "gender" or "age" is decided by
 * substring checks on the target field name (the map value), not on the OCR keyword.
 *
 * @param ocrResult OCR output text; it is split on line breaks
 * @param keyMapping OCR keyword to target field name
 * @return the extracted values keyed by target field name
 */
private Map<String, String> processOcrResult(String ocrResult, Map<String, String> keyMapping) {
Map<String, String> extractedData = new HashMap<>();
String[] lines = ocrResult.split("\\r?\\n");
logger.info("OCR结果分割为 {} 行", lines.length);
// Line 1: processed as a whole, restricted to time-related fields.
if (lines.length > 0) {
String firstLine = lines[0].trim().replaceAll("\\s+", " ");
if (!firstLine.isEmpty()) {
logger.info("处理第一行(整行): {}", firstLine);
// Keep only the mapping entries whose target field name looks time-related.
Map<String, String> timeMapping = new HashMap<>();
for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
if (entry.getValue().contains("time") || entry.getValue().contains("Time") ||
entry.getValue().contains("日期") || entry.getValue().contains("collectionTime")) {
timeMapping.put(entry.getKey(), entry.getValue());
}
}
processLine(firstLine, timeMapping, extractedData);
}
}
// Line 2: restricted to ID-related fields.
if (lines.length > 1) {
String secondLine = lines[1].trim().replaceAll("\\s+", " ");
if (!secondLine.isEmpty()) {
logger.info("处理第二行(主要提取ID): {}", secondLine);
// Keep only the mapping entries whose target field name looks like an ID.
Map<String, String> idMapping = new HashMap<>();
for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
if (entry.getValue().contains("id") || entry.getValue().contains("Id") ||
entry.getValue().equals("ID") || entry.getValue().equals("examId")) {
idMapping.put(entry.getKey(), entry.getValue());
}
}
// Extract the ID from the line.
processLine(secondLine, idMapping, extractedData);
}
}
// Line 3: name, gender and age are all taken from this line.
if (lines.length > 2) {
String thirdLine = lines[2].trim().replaceAll("\\s+", " ");
if (!thirdLine.isEmpty()) {
logger.info("处理第三行(提取姓名、性别和年龄): {}", thirdLine);
// Find the target field name used for the patient name (first matching entry wins).
String nameField = null;
for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
if (entry.getValue().contains("name") || entry.getValue().contains("姓名")) {
nameField = entry.getValue();
break;
}
}
// NOTE(review): String.contains("") is always true, so only the nameField check
// guards this branch. The empty literals look like they lost their characters
// (presumably the two characters of the name label) - confirm against the repository.
if (nameField != null && thirdLine.contains("") && thirdLine.contains("")) {
// Regex-based extraction after the name label; the pattern tolerates quote
// characters before the name and captures 2-8 characters that are not digits
// and not one of the two characters of the age label.
String namePattern = "\\s*名\\s*[:]?\\s*[\"\\s]*([^\\d年龄]{2,8})";
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(namePattern);
java.util.regex.Matcher matcher = pattern.matcher(thirdLine);
if (matcher.find()) {
String name = matcher.group(1).replaceAll("\\s+", "");
// Strip quote characters from the captured name.
name = name.replaceAll("\"", "") // straight double quote
.replaceAll("\u201C", "") // left double quotation mark
.replaceAll("\u201D", ""); // right double quotation mark
extractedData.put(nameField, name);
logger.info("从第三行提取姓名: {}", name);
} else {
// Fallback: take the text between the name label and the age label.
int nameStart = thirdLine.indexOf("姓名");
if (nameStart < 0) nameStart = thirdLine.indexOf("姓 名");
int ageStart = thirdLine.indexOf("年龄");
if (ageStart < 0) ageStart = thirdLine.indexOf("年 龄");
if (nameStart >= 0 && ageStart > nameStart) {
// NOTE(review): the offset is always 2, so when the three-character spaced
// label matched, its last character stays at the start of nameSection.
String nameSection = thirdLine.substring(nameStart + 2, ageStart).trim();
// Remove colons and quote characters.
nameSection = nameSection.replaceAll("[:]", "").trim();
nameSection = nameSection.replaceAll("\"", "").trim();
nameSection = nameSection.replaceAll("\u201C", "").trim(); // left double quotation mark
nameSection = nameSection.replaceAll("\u201D", "").trim(); // right double quotation mark
// Remove all remaining whitespace.
nameSection = nameSection.replaceAll("\\s+", "");
if (!nameSection.isEmpty()) {
extractedData.put(nameField, nameSection);
logger.info("通过分割提取姓名: {}", nameSection);
}
}
}
}
// Build a mapping limited to the gender and age fields.
Map<String, String> genderAgeMapping = new HashMap<>();
for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
if (entry.getValue().contains("gender") || entry.getValue().contains("性别") ||
entry.getValue().contains("age") || entry.getValue().contains("年龄")) {
genderAgeMapping.put(entry.getKey(), entry.getValue());
}
}
// Extract gender and age from the same line.
processLine(thirdLine, genderAgeMapping, extractedData);
// Last resort when no name-like key has been stored yet.
if (!extractedData.keySet().stream().anyMatch(k -> k.contains("name") || k.contains("姓名"))) {
// Intended for a specific layout: name label, colon, quoted name, then the age.
// NOTE(review): all four contains("") checks are always true and indexOf("")
// returns 0, so as written this takes everything from index 1 up to the age
// label. The literals look truncated - confirm against the repository.
if (thirdLine.contains("") && thirdLine.contains("") && thirdLine.contains("") && thirdLine.contains("")) {
// Take the text between the name label and the age label.
int nameStart = thirdLine.indexOf("");
int ageStart = thirdLine.indexOf("年龄");
if (nameStart >= 0 && ageStart > nameStart) {
String nameSection = thirdLine.substring(nameStart + 1, ageStart).trim();
// Remove colons and straight double quotes.
nameSection = nameSection.replaceAll("[:\"]", "").trim();
// Remove all remaining whitespace.
nameSection = nameSection.replaceAll("\\s+", "");
if (!nameSection.isEmpty() && nameField != null) {
extractedData.put(nameField, nameSection);
logger.info("通过位置提取姓名: {}", nameSection);
}
}
}
}
}
}
// Remaining lines (4th onwards): scanned with the complete keyword mapping.
for (int i = 3; i < lines.length; i++) {
String line = lines[i].trim().replaceAll("\\s+", " ");
if (line.isEmpty()) continue;
logger.info("处理行 {}: {}", i+1, line);
processLine(line, keyMapping, extractedData);
}
// Log the final result before returning it.
logger.info("提取结果: {}", extractedData);
return extractedData;
}
/**
 * Scans one OCR text line for each configured keyword and stores the cleaned value that
 * follows it.
 *
 * <p>For every mapping entry whose keyword occurs in {@code line}, the text after the first
 * occurrence of the keyword (up to the next occurrence, if any) is taken as the raw value.
 * One leading colon is stripped and the value is normalised according to the target field
 * name: ID fields keep digits only, gender is reduced to a single character, age and
 * date/time values go through {@link #normalizeAgeValue} and {@link #normalizeDateValue},
 * and names lose their whitespace. Names that look like an institution or are longer than
 * ten characters are skipped. Everything else is written to {@code extractedData} under the
 * target field name, replacing any earlier value.
 *
 * @param line          text line to scan
 * @param keyMapping    OCR keyword to target field name
 * @param extractedData receives the extracted values, keyed by target field name
 */
private void processLine(String line, Map<String, String> keyMapping, Map<String, String> extractedData) {
    for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
        String key = entry.getKey();
        String value = entry.getValue();
        if (!line.contains(key)) {
            continue;
        }

        // The keyword is literal text; quoting keeps characters such as '(' or '.' in a
        // keyword from being interpreted as regular-expression syntax by String.split.
        String[] parts = line.split(java.util.regex.Pattern.quote(key));
        if (parts.length <= 1) {
            continue;
        }

        String extractedValue = parts[1].trim();
        // Strip one leading colon, either half-width ':' or full-width U+FF1A.
        if (extractedValue.startsWith(":") || extractedValue.startsWith("\uFF1A")) {
            extractedValue = extractedValue.substring(1).trim();
        }

        if (value.contains("id") || value.contains("Id") || value.equals("ID") || value.equals("examId")) {
            // IDs: keep digits only.
            extractedValue = extractedValue.replaceAll("[^0-9]", "");
        } else if (value.contains("gender") || value.contains("性别")) {
            // Gender: reduce to the single character U+7537 (male) or U+5973 (female);
            // anything else is stored unchanged.
            if (extractedValue.contains("\u7537")) {
                extractedValue = "\u7537";
            } else if (extractedValue.contains("\u5973")) {
                extractedValue = "\u5973";
            }
        } else if (value.contains("age") || value.contains("年龄")) {
            extractedValue = normalizeAgeValue(extractedValue);
        } else if (value.contains("collectionTime") || value.contains("exam_time")
                || value.contains("time") || value.contains("日期")) {
            extractedValue = normalizeDateValue(extractedValue);
        } else if (value.contains("name") || value.contains("姓名")) {
            extractedValue = extractedValue.replaceAll("\\s+", "");
            // An institution name here means the keyword matched the sheet header rather
            // than the patient line, so the value is not stored.
            if (extractedValue.contains("医院") || extractedValue.contains("卫生院") ||
                    extractedValue.contains("诊所") || extractedValue.contains("中心")) {
                logger.warn("疑似将机构名称误识别为姓名: {}", extractedValue);
                continue;
            }
            // Anything longer than ten characters is treated as a misread and dropped.
            if (extractedValue.length() > 10) {
                logger.warn("疑似姓名长度异常: {}", extractedValue);
                continue;
            }
        }

        extractedData.put(value, extractedValue);
        logger.info("找到匹配: {} = {}", value, extractedValue);
    }
}

/**
 * Reduces a raw age value to its number.
 *
 * <p>The first run of digits is parsed. A value between 1 and 119 is returned without
 * leading zeros. A value above 120 is assumed to have another number (such as a height)
 * fused onto it, so only its first two digits are kept. In every other case (no digits,
 * 0, exactly 120, or a number too large to parse) the input is returned unchanged.
 */
private String normalizeAgeValue(String rawAge) {
    java.util.regex.Matcher digits = java.util.regex.Pattern.compile("\\d+").matcher(rawAge);
    if (!digits.find()) {
        return rawAge;
    }
    String ageStr = digits.group();
    try {
        int age = Integer.parseInt(ageStr);
        if (age > 0 && age < 120) {
            return String.valueOf(age);
        }
        if (age > 120 && ageStr.length() >= 3) {
            String truncated = ageStr.substring(0, 2);
            logger.info("年龄值过大,截取前两位: {} -> {}", ageStr, truncated);
            return truncated;
        }
    } catch (NumberFormatException e) {
        logger.warn("年龄解析错误: {}", ageStr);
    }
    return rawAge;
}

/**
 * Normalises a dash-separated date (optionally followed by a time) to {@code yyyy-MM-dd}.
 *
 * <p>A year later than the current year is replaced by the current year, and a month above
 * 12 is replaced by its last two digits. If the result is still not a plausible date, or
 * the text cannot be parsed, the whitespace-collapsed input is returned instead.
 */
private String normalizeDateValue(String rawDate) {
    String collapsed = rawDate.replaceAll("\\s+", " ");
    String[] dateParts = collapsed.split("-");
    if (dateParts.length < 3) {
        logger.warn("日期格式不正确: {}", collapsed);
        return collapsed;
    }
    try {
        int year = Integer.parseInt(dateParts[0].trim());
        int currentYear = java.time.LocalDate.now().getYear();
        if (year > currentYear) {
            year = currentYear;
            logger.info("年份大于当前年份,使用当前年份: {}", year);
        }

        int month = Integer.parseInt(dateParts[1].trim());
        if (month > 12) {
            String monthStr = String.valueOf(month);
            month = Integer.parseInt(monthStr.substring(monthStr.length() - 2));
            logger.info("月份大于12取后两位: {}", month);
        }

        // The third part may carry a trailing time ("15 09:55:57"); keep the day only.
        String dayPart = dateParts[2].trim().split("\\s+")[0];
        int day = Integer.parseInt(dayPart);

        if (year > 1900 && year <= currentYear && month >= 1 && month <= 12 && day >= 1 && day <= 31) {
            String formatted = String.format("%04d-%02d-%02d", year, month, day);
            logger.info("格式化后的日期: {}", formatted);
            return formatted;
        }
        logger.warn("日期无效: 年={}, 月={}, 日={}", year, month, day);
    } catch (NumberFormatException e) {
        logger.warn("日期解析错误: {}", collapsed);
    }
    return collapsed;
}
/**
 * Normalises every date/time-like entry of {@code extractedData} to {@code yyyy-MM-dd}.
 *
 * <p>An entry is treated as a time field when its key contains "time", "Time",
 * "collection", "时间" or "日期". Its value is matched first as a full date-time (year, month,
 * day, hour, minute, second) and, failing that, as a date only. The time-of-day part is
 * parsed for logging only; the stored result contains the date alone. A month above 12 or a
 * day above 31 is repaired by keeping its last digit. Values that match neither pattern, or
 * that are still implausible after repair, are left untouched.
 *
 * <p>NOTE(review): the two match paths apply different year rules - a full date-time has a
 * future year replaced by the current year and must not exceed it, whereas a date-only value
 * is accepted up to 2099 without correction. This difference is kept as found; confirm
 * whether it is intentional.
 *
 * @param extractedData extracted fields; matching entries are rewritten in place
 */
private void processTimeFields(Map<String, String> extractedData) {
    logger.info("开始处理时间字段,当前提取数据: {}", extractedData);

    java.util.regex.Pattern dateTimePattern = java.util.regex.Pattern.compile(
            "(\\d{4})[-/]?(\\d{1,2})[-/]?(\\d{1,2})\\s*(\\d{1,2})[:\\s]?(\\d{1,2})[:\\s]?(\\d{1,2})");
    java.util.regex.Pattern datePattern = java.util.regex.Pattern.compile(
            "(\\d{4})[-/]?(\\d{1,2})[-/]?(\\d{1,2})");
    int currentYear = java.time.LocalDate.now().getYear();

    for (Map.Entry<String, String> entry : extractedData.entrySet()) {
        String key = entry.getKey();
        boolean timeField = key.contains("time") || key.contains("Time") || key.contains("collection")
                || key.contains("时间") || key.contains("日期");
        if (!timeField || entry.getValue() == null) {
            continue;
        }

        logger.info("处理时间字段: {} = {}", key, entry.getValue());
        String value = entry.getValue().replaceAll("\\s+", " ");

        java.util.regex.Matcher matcher = dateTimePattern.matcher(value);
        boolean fullDateTime = matcher.find();
        if (!fullDateTime) {
            logger.info("字段 {} 不符合完整日期时间格式,尝试仅匹配日期部分", key);
            matcher = datePattern.matcher(value);
            if (!matcher.find()) {
                logger.warn("字段 {} 完全不匹配日期格式,保留原值: {}", key, value);
                continue;
            }
        }

        // Every group is one to four ASCII digits, so Integer.parseInt cannot fail here.
        int year = Integer.parseInt(matcher.group(1));
        int month = Integer.parseInt(matcher.group(2));
        int day = Integer.parseInt(matcher.group(3));

        int maxYear;
        if (fullDateTime) {
            logger.info("原始日期时间值: 年={}, 月={}, 日={}, 时={}, 分={}, 秒={}",
                    year, month, day,
                    Integer.parseInt(matcher.group(4)),
                    Integer.parseInt(matcher.group(5)),
                    Integer.parseInt(matcher.group(6)));
            if (year > currentYear) {
                logger.info("年份 {} 大于当前年份 {},将使用当前年份", year, currentYear);
                year = currentYear;
            }
            maxYear = currentYear;
        } else {
            logger.info("仅日期部分匹配: 年={}, 月={}, 日={}", year, month, day);
            maxYear = 2099;
        }

        // The patterns capture at most two digits for month and day, so the only repair
        // that can change anything is keeping the last digit.
        if (month > 12) {
            int repairedMonth = month % 10;
            logger.info("取模10修正月份: {} -> {}", month, repairedMonth);
            month = repairedMonth;
        }
        if (day > 31) {
            int repairedDay = day % 10;
            logger.info("取模10修正日期: {} -> {}", day, repairedDay);
            day = repairedDay;
        }

        if (year > 1900 && year <= maxYear && month >= 1 && month <= 12 && day >= 1 && day <= 31) {
            String formattedDate = String.format("%04d-%02d-%02d", year, month, day);
            logger.info("字段 {} 的日期已修正: {} -> {}", key, value, formattedDate);
            // Write through the entry so the map is not modified structurally mid-iteration.
            entry.setValue(formattedDate);
        } else {
            logger.warn("字段 {} 的日期在有效性检查后仍然无效: 年={}, 月={}, 日={}",
                    key, year, month, day);
        }
    }

    logger.info("时间字段处理完成,处理后数据: {}", extractedData);
}
/**
* 关闭资源
*/
@ -709,4 +1195,4 @@ public class FolderMonitor {
logger.error("关闭HttpClient失败", e);
}
}
}
}