建陵卫生院
This commit is contained in:
parent
937e90c621
commit
dbac0cc964
16
config.yaml
16
config.yaml
@ -5,7 +5,7 @@ output:
|
||||
processed_files: "./ocr_results/processed_files.txt" # 已处理文件列表
|
||||
|
||||
# 后端接口配置
|
||||
# upload_url: https://pacs.gw12320.com/adminecg/admin-api/tblist/ecganalysisparas/parsePhotoCreateData
|
||||
#upload_url: https://pacs.gw12320.com/adminecg/admin-api/tblist/ecganalysisparas/parsePhotoCreateData
|
||||
upload_url: http://localhost:48080/admin-api/tblist/ecganalysisparas/parsePhotoCreateData
|
||||
|
||||
# 底部识别配置
|
||||
@ -141,6 +141,20 @@ directories:
|
||||
key_mapping:
|
||||
"医师": "doctor"
|
||||
"结论": "conclusion"
|
||||
|
||||
- path: "./建陵卫生院" # 需要旋转90度的图片目录
|
||||
recognition_type: "rotate90" # 新增旋转类型
|
||||
recognition_area: # 添加旋转后的识别区域
|
||||
start_x: 0
|
||||
start_y: 0
|
||||
width: 100 # 宽度百分比
|
||||
height: 25 # 高度百分比
|
||||
key_mapping:
|
||||
"ID": "examId"
|
||||
"姓 名": "name"
|
||||
"年 龄": "age"
|
||||
"性 别": "gender"
|
||||
"时 间": "collectionTime"
|
||||
|
||||
# OCR程序与语言包路径配置
|
||||
|
||||
|
@ -71,7 +71,6 @@ public class ConfigManager {
|
||||
// 检查是否是分块模式
|
||||
if (dir.containsKey("recognition_area")) {
|
||||
Map<String, Integer> recognitionArea = (Map<String, Integer>) dir.get("recognition_area");
|
||||
List<Map<String, Object>> splitBlocks = (List<Map<String, Object>>) dir.get("split_blocks");
|
||||
|
||||
// 创建识别区域对象
|
||||
DirectoryConfig.RecognitionArea area = new DirectoryConfig.RecognitionArea(
|
||||
@ -81,48 +80,88 @@ public class ConfigManager {
|
||||
recognitionArea.get("height")
|
||||
);
|
||||
|
||||
// 创建切割块列表
|
||||
List<DirectoryConfig.SplitBlock> blocks = new ArrayList<>();
|
||||
int totalWidth = 0;
|
||||
|
||||
// 计算总宽度,确保所有块的宽度总和为100%
|
||||
for (Map<String, Object> block : splitBlocks) {
|
||||
totalWidth += (Integer) block.get("width_percent");
|
||||
}
|
||||
|
||||
if (totalWidth != 100) {
|
||||
logger.warn("目录 {} 的分块宽度总和为 {}%,不等于100%,将进行自动调整", path, totalWidth);
|
||||
}
|
||||
|
||||
// 创建分块配置
|
||||
totalWidth = 0;
|
||||
for (int i = 0; i < splitBlocks.size(); i++) {
|
||||
Map<String, Object> block = splitBlocks.get(i);
|
||||
int widthPercent = (Integer) block.get("width_percent");
|
||||
Map<String, String> keyMapping = (Map<String, String>) block.get("key_mapping");
|
||||
|
||||
// 如果是最后一个块且总宽度不足100%,调整宽度
|
||||
if (i == splitBlocks.size() - 1 && totalWidth + widthPercent != 100) {
|
||||
widthPercent = 100 - totalWidth;
|
||||
logger.info("调整最后一个分块的宽度为 {}%,确保总宽度为100%", widthPercent);
|
||||
// 根据识别类型处理
|
||||
if ("rotate90".equals(recognitionType)) {
|
||||
// 旋转类型不需要分块
|
||||
Map<String, String> keyMapping = (Map<String, String>) dir.get("key_mapping");
|
||||
DirectoryConfig dirConfig;
|
||||
if (bottomKeyWords != null && bottomKeyMapping != null) {
|
||||
dirConfig = new DirectoryConfig(path, keyMapping, bottomKeyWords, bottomKeyMapping);
|
||||
} else if (bottomKeyWords != null) {
|
||||
dirConfig = new DirectoryConfig(path, keyMapping, bottomKeyWords);
|
||||
} else {
|
||||
dirConfig = new DirectoryConfig(path, keyMapping);
|
||||
}
|
||||
dirConfig.setRecognitionType(recognitionType);
|
||||
// 设置识别区域
|
||||
dirConfig.setRecognitionArea(area);
|
||||
configs.add(dirConfig);
|
||||
logger.info("加载旋转识别目录配置: {}", path);
|
||||
} else {
|
||||
// 处理分块模式
|
||||
List<Map<String, Object>> splitBlocks = (List<Map<String, Object>>) dir.get("split_blocks");
|
||||
if (splitBlocks == null) {
|
||||
logger.warn("目录 {} 配置了recognition_area但没有split_blocks,将使用整个区域处理", path);
|
||||
// 如果没有分块配置,仍然使用普通模式
|
||||
Map<String, String> keyMapping = (Map<String, String>) dir.get("key_mapping");
|
||||
DirectoryConfig dirConfig;
|
||||
if (bottomKeyWords != null && bottomKeyMapping != null) {
|
||||
dirConfig = new DirectoryConfig(path, keyMapping, bottomKeyWords, bottomKeyMapping);
|
||||
} else if (bottomKeyWords != null) {
|
||||
dirConfig = new DirectoryConfig(path, keyMapping, bottomKeyWords);
|
||||
} else {
|
||||
dirConfig = new DirectoryConfig(path, keyMapping);
|
||||
}
|
||||
dirConfig.setRecognitionType(recognitionType);
|
||||
dirConfig.setRecognitionArea(area);
|
||||
configs.add(dirConfig);
|
||||
logger.info("加载区域识别目录配置: {}", path);
|
||||
continue;
|
||||
}
|
||||
|
||||
blocks.add(new DirectoryConfig.SplitBlock(widthPercent, keyMapping));
|
||||
totalWidth += widthPercent;
|
||||
// 创建切割块列表
|
||||
List<DirectoryConfig.SplitBlock> blocks = new ArrayList<>();
|
||||
int totalWidth = 0;
|
||||
|
||||
// 计算总宽度,确保所有块的宽度总和为100%
|
||||
for (Map<String, Object> block : splitBlocks) {
|
||||
totalWidth += (Integer) block.get("width_percent");
|
||||
}
|
||||
|
||||
if (totalWidth != 100) {
|
||||
logger.warn("目录 {} 的分块宽度总和为 {}%,不等于100%,将进行自动调整", path, totalWidth);
|
||||
}
|
||||
|
||||
// 创建分块配置
|
||||
totalWidth = 0;
|
||||
for (int i = 0; i < splitBlocks.size(); i++) {
|
||||
Map<String, Object> block = splitBlocks.get(i);
|
||||
int widthPercent = (Integer) block.get("width_percent");
|
||||
Map<String, String> keyMapping = (Map<String, String>) block.get("key_mapping");
|
||||
|
||||
// 如果是最后一个块且总宽度不足100%,调整宽度
|
||||
if (i == splitBlocks.size() - 1 && totalWidth + widthPercent != 100) {
|
||||
widthPercent = 100 - totalWidth;
|
||||
logger.info("调整最后一个分块的宽度为 {}%,确保总宽度为100%", widthPercent);
|
||||
}
|
||||
|
||||
blocks.add(new DirectoryConfig.SplitBlock(widthPercent, keyMapping));
|
||||
totalWidth += widthPercent;
|
||||
}
|
||||
|
||||
// 创建目录配置(带有或不带有底部关键字和映射)
|
||||
DirectoryConfig dirConfig;
|
||||
if (bottomKeyWords != null && bottomKeyMapping != null) {
|
||||
dirConfig = new DirectoryConfig(path, area, blocks, bottomKeyWords, bottomKeyMapping);
|
||||
} else if (bottomKeyWords != null) {
|
||||
dirConfig = new DirectoryConfig(path, area, blocks, bottomKeyWords);
|
||||
} else {
|
||||
dirConfig = new DirectoryConfig(path, area, blocks);
|
||||
}
|
||||
dirConfig.setRecognitionType(recognitionType);
|
||||
configs.add(dirConfig);
|
||||
logger.info("加载分块目录配置: {}, 分块数: {}", path, blocks.size());
|
||||
}
|
||||
|
||||
// 创建目录配置(带有或不带有底部关键字和映射)
|
||||
DirectoryConfig dirConfig;
|
||||
if (bottomKeyWords != null && bottomKeyMapping != null) {
|
||||
dirConfig = new DirectoryConfig(path, area, blocks, bottomKeyWords, bottomKeyMapping);
|
||||
} else if (bottomKeyWords != null) {
|
||||
dirConfig = new DirectoryConfig(path, area, blocks, bottomKeyWords);
|
||||
} else {
|
||||
dirConfig = new DirectoryConfig(path, area, blocks);
|
||||
}
|
||||
dirConfig.setRecognitionType(recognitionType);
|
||||
configs.add(dirConfig);
|
||||
logger.info("加载分块目录配置: {}, 分块数: {}", path, blocks.size());
|
||||
} else {
|
||||
Map<String, String> keyMapping = (Map<String, String>) dir.get("key_mapping");
|
||||
|
||||
|
@ -105,6 +105,10 @@ public class DirectoryConfig {
|
||||
return recognitionArea;
|
||||
}
|
||||
|
||||
/**
 * Replaces the recognition area held by this directory configuration.
 *
 * @param recognitionArea the area to store; it is assigned as-is, without copying or validation
 */
public void setRecognitionArea(RecognitionArea recognitionArea) {
|
||||
this.recognitionArea = recognitionArea;
|
||||
}
|
||||
|
||||
/**
 * Returns the split-block list held by this directory configuration.
 *
 * @return the {@code splitBlocks} field itself (not a copy)
 */
public List<SplitBlock> getSplitBlocks() {
|
||||
return splitBlocks;
|
||||
}
|
||||
|
@ -17,6 +17,8 @@ import org.apache.http.util.EntityUtils;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.Graphics2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
@ -87,6 +89,9 @@ public class FolderMonitor {
|
||||
case "templateA":
|
||||
extractedData = processImageWithTemplateA(imageFullPath);
|
||||
break;
|
||||
case "rotate90":
|
||||
extractedData = processImageWithRotate90(imageFullPath);
|
||||
break;
|
||||
case "normal":
|
||||
default:
|
||||
extractedData = processImageNormal(imageFullPath);
|
||||
@ -107,6 +112,9 @@ public class FolderMonitor {
|
||||
|
||||
logger.info("添加文件信息 - orgName: {}, ecgDataFilePath: {}", parentFolderName, fileName);
|
||||
|
||||
// 处理已提取的时间格式
|
||||
processTimeFields(extractedData);
|
||||
|
||||
// 检查是否需要进行底部识别
|
||||
Map<String, Object> config = configManager.getConfig();
|
||||
if (config.containsKey("bottom_recognition")) {
|
||||
@ -697,6 +705,484 @@ public class FolderMonitor {
|
||||
return processImageNormal(imageFullPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* 处理图片,将图片逆时针旋转90度
|
||||
* @param imageFullPath 图片完整路径
|
||||
* @return 识别结果
|
||||
*/
|
||||
private Map<String, String> processImageWithRotate90(String imageFullPath) throws Exception {
|
||||
logger.info("使用旋转90度识别逻辑处理: {}", imageFullPath);
|
||||
|
||||
// 读取原始图像
|
||||
BufferedImage originalImage = ImageIO.read(new File(imageFullPath));
|
||||
|
||||
// 创建一个旋转后的图像(宽高交换)
|
||||
int width = originalImage.getWidth();
|
||||
int height = originalImage.getHeight();
|
||||
BufferedImage rotatedImage = new BufferedImage(height, width, originalImage.getType());
|
||||
|
||||
// 执行逆时针旋转90度操作
|
||||
Graphics2D g2d = rotatedImage.createGraphics();
|
||||
g2d.translate(height, 0);
|
||||
g2d.rotate(Math.PI / 2);
|
||||
g2d.drawImage(originalImage, 0, 0, null);
|
||||
g2d.dispose();
|
||||
|
||||
// 将旋转后的图片保存回原始位置
|
||||
// ImageIO.write(rotatedImage, "PNG", new File(imageFullPath));
|
||||
// logger.info("已将图片逆时针旋转90度并保存: {}", imageFullPath);
|
||||
|
||||
// 在旋转后的图片上执行OCR识别
|
||||
String result = tesseract.doOCR(new File(imageFullPath));
|
||||
logger.info("旋转90度后OCR识别结果: {}", result);
|
||||
|
||||
// 处理OCR结果
|
||||
return processOcrResult(result, directoryConfig.getKeyMapping());
|
||||
}
|
||||
|
||||
/**
|
||||
* 处理OCR识别结果
|
||||
* @param ocrResult OCR结果文本
|
||||
* @param keyMapping 关键词映射
|
||||
* @return 提取的数据
|
||||
*/
|
||||
private Map<String, String> processOcrResult(String ocrResult, Map<String, String> keyMapping) {
|
||||
Map<String, String> extractedData = new HashMap<>();
|
||||
String[] lines = ocrResult.split("\\r?\\n");
|
||||
logger.info("OCR结果分割为 {} 行", lines.length);
|
||||
|
||||
// 处理第一行 - 整行处理获取时间等信息
|
||||
if (lines.length > 0) {
|
||||
String firstLine = lines[0].trim().replaceAll("\\s+", " ");
|
||||
if (!firstLine.isEmpty()) {
|
||||
logger.info("处理第一行(整行): {}", firstLine);
|
||||
|
||||
// 创建时间相关映射
|
||||
Map<String, String> timeMapping = new HashMap<>();
|
||||
for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
|
||||
if (entry.getValue().contains("time") || entry.getValue().contains("Time") ||
|
||||
entry.getValue().contains("日期") || entry.getValue().contains("collectionTime")) {
|
||||
timeMapping.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
processLine(firstLine, timeMapping, extractedData);
|
||||
}
|
||||
}
|
||||
|
||||
// 处理第二行 - 主要提取ID
|
||||
if (lines.length > 1) {
|
||||
String secondLine = lines[1].trim().replaceAll("\\s+", " ");
|
||||
if (!secondLine.isEmpty()) {
|
||||
logger.info("处理第二行(主要提取ID): {}", secondLine);
|
||||
|
||||
// 针对ID创建映射
|
||||
Map<String, String> idMapping = new HashMap<>();
|
||||
for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
|
||||
if (entry.getValue().contains("id") || entry.getValue().contains("Id") ||
|
||||
entry.getValue().equals("ID") || entry.getValue().equals("examId")) {
|
||||
idMapping.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
// 处理ID提取
|
||||
processLine(secondLine, idMapping, extractedData);
|
||||
}
|
||||
}
|
||||
|
||||
// 处理第三行 - 同时提取姓名、性别和年龄
|
||||
if (lines.length > 2) {
|
||||
String thirdLine = lines[2].trim().replaceAll("\\s+", " ");
|
||||
if (!thirdLine.isEmpty()) {
|
||||
logger.info("处理第三行(提取姓名、性别和年龄): {}", thirdLine);
|
||||
|
||||
// 提取姓名
|
||||
String nameField = null;
|
||||
for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
|
||||
if (entry.getValue().contains("name") || entry.getValue().contains("姓名")) {
|
||||
nameField = entry.getValue();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (nameField != null && thirdLine.contains("姓") && thirdLine.contains("名")) {
|
||||
// 通过"姓名"标记提取姓名,修改正则表达式以匹配包含引号的情况
|
||||
String namePattern = "姓\\s*名\\s*[::]?\\s*[\"\\s]*([^\\d年龄]{2,8})";
|
||||
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(namePattern);
|
||||
java.util.regex.Matcher matcher = pattern.matcher(thirdLine);
|
||||
|
||||
if (matcher.find()) {
|
||||
String name = matcher.group(1).replaceAll("\\s+", "");
|
||||
// 去除名字中的引号字符
|
||||
name = name.replaceAll("\"", "") // 双引号
|
||||
.replaceAll("\u201C", "") // 左双引号
|
||||
.replaceAll("\u201D", ""); // 右双引号
|
||||
extractedData.put(nameField, name);
|
||||
logger.info("从第三行提取姓名: {}", name);
|
||||
} else {
|
||||
// 尝试使用另一种提取方法
|
||||
int nameStart = thirdLine.indexOf("姓名");
|
||||
if (nameStart < 0) nameStart = thirdLine.indexOf("姓 名");
|
||||
int ageStart = thirdLine.indexOf("年龄");
|
||||
if (ageStart < 0) ageStart = thirdLine.indexOf("年 龄");
|
||||
|
||||
if (nameStart >= 0 && ageStart > nameStart) {
|
||||
String nameSection = thirdLine.substring(nameStart + 2, ageStart).trim();
|
||||
// 清理冒号、引号等
|
||||
nameSection = nameSection.replaceAll("[::]", "").trim();
|
||||
nameSection = nameSection.replaceAll("\"", "").trim();
|
||||
nameSection = nameSection.replaceAll("\u201C", "").trim(); // 左双引号
|
||||
nameSection = nameSection.replaceAll("\u201D", "").trim(); // 右双引号
|
||||
// 移除空格
|
||||
nameSection = nameSection.replaceAll("\\s+", "");
|
||||
|
||||
if (!nameSection.isEmpty()) {
|
||||
extractedData.put(nameField, nameSection);
|
||||
logger.info("通过分割提取姓名: {}", nameSection);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 针对性别和年龄创建特定的映射
|
||||
Map<String, String> genderAgeMapping = new HashMap<>();
|
||||
for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
|
||||
if (entry.getValue().contains("gender") || entry.getValue().contains("性别") ||
|
||||
entry.getValue().contains("age") || entry.getValue().contains("年龄")) {
|
||||
genderAgeMapping.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
// 提取性别和年龄
|
||||
processLine(thirdLine, genderAgeMapping, extractedData);
|
||||
|
||||
// 如果姓名未提取成功,尝试手动提取
|
||||
if (!extractedData.keySet().stream().anyMatch(k -> k.contains("name") || k.contains("姓名"))) {
|
||||
// 针对特定的行结构:"姓 名 : " 秦 浣 彷 年 龄 : 27"
|
||||
if (thirdLine.contains("姓") && thirdLine.contains("名") && thirdLine.contains("年") && thirdLine.contains("龄")) {
|
||||
// 获取"姓名"和"年龄"之间的内容
|
||||
int nameStart = thirdLine.indexOf("名");
|
||||
int ageStart = thirdLine.indexOf("年龄");
|
||||
|
||||
if (nameStart >= 0 && ageStart > nameStart) {
|
||||
String nameSection = thirdLine.substring(nameStart + 1, ageStart).trim();
|
||||
// 清理冒号、引号等
|
||||
nameSection = nameSection.replaceAll("[::\"]", "").trim();
|
||||
// 移除空格
|
||||
nameSection = nameSection.replaceAll("\\s+", "");
|
||||
|
||||
if (!nameSection.isEmpty() && nameField != null) {
|
||||
extractedData.put(nameField, nameSection);
|
||||
logger.info("通过位置提取姓名: {}", nameSection);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 处理剩余行 - 使用完整的关键字映射
|
||||
for (int i = 3; i < lines.length; i++) {
|
||||
String line = lines[i].trim().replaceAll("\\s+", " ");
|
||||
if (line.isEmpty()) continue;
|
||||
|
||||
logger.info("处理行 {}: {}", i+1, line);
|
||||
processLine(line, keyMapping, extractedData);
|
||||
}
|
||||
|
||||
// 确保所有提取到的数据都被存入结果中
|
||||
logger.info("提取结果: {}", extractedData);
|
||||
|
||||
return extractedData;
|
||||
}
|
||||
|
||||
/**
|
||||
* 处理单行文本,提取关键字和对应的值
|
||||
* @param line 要处理的文本行
|
||||
* @param keyMapping 关键字映射
|
||||
* @param extractedData 用于存储提取的数据的Map
|
||||
*/
|
||||
private void processLine(String line, Map<String, String> keyMapping, Map<String, String> extractedData) {
|
||||
// 在一行中查找所有key
|
||||
for (Map.Entry<String, String> entry : keyMapping.entrySet()) {
|
||||
String key = entry.getKey();
|
||||
String value = entry.getValue();
|
||||
|
||||
if (line.contains(key)) {
|
||||
// 提取key后面的值
|
||||
String[] parts = line.split(key);
|
||||
if (parts.length > 1) {
|
||||
String extractedValue = parts[1].trim();
|
||||
|
||||
// 清除值前面的冒号和空格
|
||||
if (extractedValue.startsWith(":") || extractedValue.startsWith(":")) {
|
||||
extractedValue = extractedValue.substring(1).trim();
|
||||
}
|
||||
|
||||
// 根据字段类型进行特殊处理
|
||||
if (value.contains("id") || value.contains("Id") || value.equals("ID") || value.equals("examId")) {
|
||||
// 对ID进行处理,只保留数字
|
||||
extractedValue = extractedValue.replaceAll("[^0-9]", "");
|
||||
} else if (value.contains("gender") || value.contains("性别")) {
|
||||
// 处理性别信息,只保留"男"或"女"
|
||||
if (extractedValue.contains("男")) {
|
||||
extractedValue = "男";
|
||||
} else if (extractedValue.contains("女")) {
|
||||
extractedValue = "女";
|
||||
}
|
||||
} else if (value.contains("age") || value.contains("年龄")) {
|
||||
// 处理年龄,只保留数字,并确保是有效的年龄值(通常小于120)
|
||||
if (extractedValue.matches(".*\\d+.*")) {
|
||||
// 提取第一组连续数字(通常是年龄)
|
||||
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile("\\d+");
|
||||
java.util.regex.Matcher matcher = pattern.matcher(extractedValue);
|
||||
if (matcher.find()) {
|
||||
String ageStr = matcher.group();
|
||||
try {
|
||||
int age = Integer.parseInt(ageStr);
|
||||
// 判断是否是合理的年龄值
|
||||
if (age > 0 && age < 120) {
|
||||
extractedValue = String.valueOf(age);
|
||||
} else {
|
||||
// 如果不合理,可能误提取了其他数字,尝试进一步处理
|
||||
// 通常年龄是较小的数字,而身高是较大的数字
|
||||
if (age > 120 && ageStr.length() >= 3) {
|
||||
// 可能包含了身高,尝试只取前两位
|
||||
extractedValue = ageStr.substring(0, Math.min(2, ageStr.length()));
|
||||
logger.info("年龄值过大,截取前两位: {} -> {}", ageStr, extractedValue);
|
||||
}
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("年龄解析错误: {}", ageStr);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (value.contains("collectionTime") || value.contains("exam_time") || value.contains("time") || value.contains("日期")) {
|
||||
// 处理日期时间格式
|
||||
extractedValue = extractedValue.replaceAll("\\s+", " ");
|
||||
|
||||
// 使用-分割字符串
|
||||
String[] dateParts = extractedValue.split("-");
|
||||
if (dateParts.length >= 3) {
|
||||
try {
|
||||
// 处理年份
|
||||
int year = Integer.parseInt(dateParts[0]);
|
||||
int currentYear = java.time.LocalDate.now().getYear();
|
||||
if (year > currentYear) {
|
||||
year = currentYear;
|
||||
logger.info("年份大于当前年份,使用当前年份: {}", year);
|
||||
}
|
||||
|
||||
// 处理月份
|
||||
int month = Integer.parseInt(dateParts[1]);
|
||||
if (month > 12) {
|
||||
String monthStr = String.valueOf(month);
|
||||
if (monthStr.length() >= 2) {
|
||||
// 取最后两位
|
||||
month = Integer.parseInt(monthStr.substring(monthStr.length() - 2));
|
||||
logger.info("月份大于12,取后两位: {}", month);
|
||||
}
|
||||
}
|
||||
|
||||
// 处理日期(取第一部分)
|
||||
String dayPart = dateParts[2].split("\\s+")[0]; // 只取日期部分,去掉时间
|
||||
int day = Integer.parseInt(dayPart);
|
||||
|
||||
// 日期有效性检查
|
||||
if (year > 1900 && year <= currentYear && month >= 1 && month <= 12 && day >= 1 && day <= 31) {
|
||||
extractedValue = String.format("%04d-%02d-%02d", year, month, day);
|
||||
logger.info("格式化后的日期: {}", extractedValue);
|
||||
} else {
|
||||
logger.warn("日期无效: 年={}, 月={}, 日={}", year, month, day);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("日期解析错误: {}", extractedValue);
|
||||
}
|
||||
} else {
|
||||
logger.warn("日期格式不正确: {}", extractedValue);
|
||||
}
|
||||
} else if (value.contains("name") || value.contains("姓名")) {
|
||||
// 处理姓名,去除多余的空格,并验证是否为有效姓名
|
||||
extractedValue = extractedValue.replaceAll("\\s+", "");
|
||||
|
||||
// 检查姓名是否包含"医院"、"卫生院"等机构名称,如果包含则可能是误提取
|
||||
if (extractedValue.contains("医院") || extractedValue.contains("卫生院") ||
|
||||
extractedValue.contains("诊所") || extractedValue.contains("中心")) {
|
||||
// 这可能是医院名称而不是患者姓名,不进行存储
|
||||
logger.warn("疑似将机构名称误识别为姓名: {}", extractedValue);
|
||||
continue; // 跳过这个字段,不添加到结果中
|
||||
}
|
||||
|
||||
// 检查姓名长度,正常中文姓名长度为2-4个字符
|
||||
if (extractedValue.length() > 10) {
|
||||
logger.warn("疑似姓名长度异常: {}", extractedValue);
|
||||
continue; // 跳过这个字段,不添加到结果中
|
||||
}
|
||||
}
|
||||
|
||||
extractedData.put(value, extractedValue);
|
||||
logger.info("找到匹配: {} = {}", value, extractedValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 处理已提取的时间格式
|
||||
* @param extractedData 已提取的数据
|
||||
*/
|
||||
private void processTimeFields(Map<String, String> extractedData) {
|
||||
// 处理已提取的时间格式
|
||||
logger.info("开始处理时间字段,当前提取数据: {}", extractedData);
|
||||
|
||||
for (Map.Entry<String, String> entry : extractedData.entrySet()) {
|
||||
String key = entry.getKey();
|
||||
String value = entry.getValue();
|
||||
|
||||
if (key.contains("time") || key.contains("Time") || key.contains("collection") || key.contains("collectionTime") ||
|
||||
key.contains("时间") || key.contains("日期")) {
|
||||
logger.info("处理时间字段: {} = {}", key, value);
|
||||
// 处理日期时间格式
|
||||
value = value.replaceAll("\\s+", " ");
|
||||
|
||||
// 识别数字格式,尝试提取年月日和时分秒
|
||||
String dateTimePattern = "(\\d{4})[-/]?(\\d{1,2})[-/]?(\\d{1,2})\\s*(\\d{1,2})[:\\s]?(\\d{1,2})[:\\s]?(\\d{1,2})";
|
||||
java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(dateTimePattern);
|
||||
java.util.regex.Matcher matcher = pattern.matcher(value);
|
||||
|
||||
if (matcher.find()) {
|
||||
try {
|
||||
int year = Integer.parseInt(matcher.group(1));
|
||||
int month = Integer.parseInt(matcher.group(2));
|
||||
int day = Integer.parseInt(matcher.group(3));
|
||||
int hour = Integer.parseInt(matcher.group(4));
|
||||
int minute = Integer.parseInt(matcher.group(5));
|
||||
int second = Integer.parseInt(matcher.group(6));
|
||||
|
||||
logger.info("原始日期时间值: 年={}, 月={}, 日={}, 时={}, 分={}, 秒={}",
|
||||
year, month, day, hour, minute, second);
|
||||
|
||||
// 获取当前年份
|
||||
int currentYear = java.time.LocalDate.now().getYear();
|
||||
|
||||
// 修正明显错误的日期值
|
||||
// 如果年份大于当前年份,使用当前年份
|
||||
if (year > currentYear) {
|
||||
logger.info("年份 {} 大于当前年份 {},将使用当前年份", year, currentYear);
|
||||
year = currentYear;
|
||||
logger.info("修正日期中的年份为当前年份: {}", year);
|
||||
}
|
||||
|
||||
// 如果月份大于12,取后两位
|
||||
if (month > 12) {
|
||||
logger.info("月份 {} 无效,开始修正", month);
|
||||
String monthStr = String.valueOf(month);
|
||||
if (monthStr.length() >= 2) {
|
||||
// 取最后两位
|
||||
int newMonth = Integer.parseInt(monthStr.substring(monthStr.length() - 2));
|
||||
logger.info("尝试取月份后两位: {} -> {}", month, newMonth);
|
||||
month = newMonth;
|
||||
// 如果取两位后仍然大于12,则只取最后一位
|
||||
if (month > 12) {
|
||||
newMonth = Integer.parseInt(monthStr.substring(monthStr.length() - 1));
|
||||
logger.info("月份仍然无效,取最后一位: {} -> {}", month, newMonth);
|
||||
month = newMonth;
|
||||
}
|
||||
} else {
|
||||
int newMonth = month % 10;
|
||||
logger.info("对个位数月份取模: {} -> {}", month, newMonth);
|
||||
month = newMonth; // 对于个位数,使用取模方式
|
||||
}
|
||||
logger.info("修正后的月份: {}", month);
|
||||
}
|
||||
|
||||
if (day > 31) {
|
||||
logger.info("日期 {} 无效,开始修正", day);
|
||||
int newDay = day % 100; // 对于识别错误,尝试取后两位
|
||||
logger.info("尝试取模100修正日期: {} -> {}", day, newDay);
|
||||
day = newDay;
|
||||
|
||||
if (day > 31) {
|
||||
newDay = day % 10;
|
||||
logger.info("日期仍然无效,取模10: {} -> {}", day, newDay);
|
||||
day = newDay;
|
||||
}
|
||||
|
||||
logger.info("修正后的日期: {}", day);
|
||||
}
|
||||
|
||||
// 日期有效性检查
|
||||
if (year > 1900 && year <= currentYear && month >= 1 && month <= 12 && day >= 1 && day <= 31) {
|
||||
String formattedDate = String.format("%04d-%02d-%02d", year, month, day);
|
||||
logger.info("字段 {} 的日期已修正: {} -> {}", key, value, formattedDate);
|
||||
extractedData.put(key, formattedDate);
|
||||
} else {
|
||||
logger.warn("字段 {} 的日期在有效性检查后仍然无效: 年={}, 月={}, 日={}",
|
||||
key, year, month, day);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("日期时间解析错误: {}, 异常: {}", value, e.getMessage());
|
||||
}
|
||||
} else {
|
||||
logger.info("字段 {} 不符合完整日期时间格式,尝试仅匹配日期部分", key);
|
||||
// 如果无法匹配完整的日期时间格式,尝试仅匹配日期部分
|
||||
String datePattern = "(\\d{4})[-/]?(\\d{1,2})[-/]?(\\d{1,2})";
|
||||
pattern = java.util.regex.Pattern.compile(datePattern);
|
||||
matcher = pattern.matcher(value);
|
||||
|
||||
if (matcher.find()) {
|
||||
try {
|
||||
int year = Integer.parseInt(matcher.group(1));
|
||||
int month = Integer.parseInt(matcher.group(2));
|
||||
int day = Integer.parseInt(matcher.group(3));
|
||||
|
||||
logger.info("仅日期部分匹配: 年={}, 月={}, 日={}", year, month, day);
|
||||
|
||||
// 获取当前年份
|
||||
int currentYear = java.time.LocalDate.now().getYear();
|
||||
|
||||
// 修正明显错误的日期值
|
||||
if (month > 12) {
|
||||
logger.info("月份 {} 无效,尝试修正", month);
|
||||
int newMonth = month % 10; // 对于识别错误如"904",取最后一位数字"4"
|
||||
logger.info("取模10修正月份: {} -> {}", month, newMonth);
|
||||
month = newMonth;
|
||||
}
|
||||
if (day > 31) {
|
||||
logger.info("日期 {} 无效,尝试修正", day);
|
||||
int newDay = day % 100; // 对于识别错误,尝试取后两位
|
||||
logger.info("取模100修正日期: {} -> {}", day, newDay);
|
||||
day = newDay;
|
||||
|
||||
if (day > 31) {
|
||||
newDay = day % 10;
|
||||
logger.info("日期仍然无效,取模10: {} -> {}", day, newDay);
|
||||
day = newDay;
|
||||
}
|
||||
}
|
||||
|
||||
// 日期有效性检查
|
||||
if (year > 1900 && year < 2100 && month >= 1 && month <= 12 && day >= 1 && day <= 31) {
|
||||
String formattedDate = String.format("%04d-%02d-%02d", year, month, day);
|
||||
logger.info("字段 {} 的日期已修正: {} -> {}", key, value, formattedDate);
|
||||
extractedData.put(key, formattedDate);
|
||||
} else {
|
||||
logger.warn("字段 {} 的日期在有效性检查后仍然无效: 年={}, 月={}, 日={}",
|
||||
key, year, month, day);
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
logger.warn("日期解析错误: {}, 异常: {}", value, e.getMessage());
|
||||
}
|
||||
} else {
|
||||
logger.warn("字段 {} 完全不匹配日期格式,保留原值: {}", key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("时间字段处理完成,处理后数据: {}", extractedData);
|
||||
}
|
||||
|
||||
/**
|
||||
* 关闭资源
|
||||
*/
|
||||
@ -709,4 +1195,4 @@ public class FolderMonitor {
|
||||
logger.error("关闭HttpClient失败", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user