refactor: 收敛文档 OCR 配置并补齐 Office 解析兼容

- 统一 MinerU OCR 配置结构并移除分模块冗余属性类

- 补齐 JSON 字符串化结果拆包、XLSX 图片兼容与对应回归测试
This commit is contained in:
2026-04-18 13:01:17 +08:00
parent b66876d0fd
commit 56ee149e7c
15 changed files with 559 additions and 272 deletions

View File

@@ -1,6 +1,5 @@
package com.easyagents.document.core.mineru;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.exception.DocumentParseException;
@@ -16,6 +15,7 @@ import okhttp3.ResponseBody;
import java.io.IOException;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
@@ -115,7 +115,10 @@ public class MineruClient {
}
String contentType = response.header("Content-Type");
if (contentType != null && contentType.contains("application/json")) {
JSONObject jsonObject = JSON.parseObject(new String(responseBytes));
JSONObject jsonObject = MineruJsonSupport.parseObject(
new String(responseBytes, StandardCharsets.UTF_8),
"MinerU async result response: " + path
);
throw new DocumentParseException("MinerU async result is not ready: " + jsonObject.toJSONString());
}
if (responseBytes.length < 2 || responseBytes[0] != 'P' || responseBytes[1] != 'K') {
@@ -148,9 +151,13 @@ public class MineruClient {
ResponseBody body = response.body();
String bodyText = body == null ? "" : body.string();
if (!response.isSuccessful()) {
throw buildHttpException(path, response.code(), bodyText == null ? new byte[0] : bodyText.getBytes());
throw buildHttpException(
path,
response.code(),
bodyText == null ? new byte[0] : bodyText.getBytes(StandardCharsets.UTF_8)
);
}
return JSON.parseObject(bodyText);
return MineruJsonSupport.parseObject(bodyText, "MinerU response body: " + path);
} catch (IOException exception) {
throw new DocumentParseException("Failed to call MinerU endpoint: " + path, exception);
}
@@ -197,7 +204,7 @@ public class MineruClient {
}
private DocumentParseException buildHttpException(String path, int statusCode, byte[] bodyBytes) {
String bodyText = bodyBytes == null ? "" : new String(bodyBytes);
String bodyText = bodyBytes == null ? "" : new String(bodyBytes, StandardCharsets.UTF_8);
return new DocumentParseException(
"MinerU request failed: path=" + path + ", status=" + statusCode + ", body=" + bodyText
);

View File

@@ -0,0 +1,199 @@
package com.easyagents.document.core.mineru;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.exception.DocumentParseException;
/**
 * Compatibility helpers for parsing MinerU JSON payloads.
 *
 * <p>Some deployments wrap structured fields in an extra layer of JSON string
 * encoding, for example {@code middle_json="{}"}, or return the whole response
 * body as {@code "{\"results\":{...}}"}. This utility unwraps such values in one
 * place so that callers do not have to repeat the compatibility branches.</p>
 *
 * @author Codex
 * @since 2026-04-17
 */
final class MineruJsonSupport {

    /** Upper bound on how many nested string layers are unwrapped per value. */
    private static final int MAX_JSON_UNWRAP_DEPTH = 4;

    /** Maximum number of chars of a payload echoed into an error message. */
    private static final int PREVIEW_LIMIT = 160;

    private MineruJsonSupport() {
    }

    /**
     * Parses JSON text and requires the final (unwrapped) result to be an object.
     *
     * @param text    JSON text
     * @param context description of the payload, used in error messages
     * @return the parsed JSON object
     * @throws DocumentParseException if the text is empty, cannot be parsed,
     *                                or does not resolve to a {@link JSONObject}
     */
    static JSONObject parseObject(String text, String context) {
        Object value = parseValue(text, context);
        if (value instanceof JSONObject) {
            return (JSONObject) value;
        }
        throw unexpectedType(context, "JSONObject", value);
    }

    /**
     * Normalizes an arbitrary value into its JSON representation.
     *
     * <p>{@code null} and text that {@code StringUtil.hasText} rejects yield
     * {@code null}; {@link JSONObject} and {@link JSONArray} instances are returned
     * as-is; other character sequences are parsed as JSON text; any other value is
     * serialized and parsed back.</p>
     *
     * @param value   raw value
     * @param context description of the payload, used in error messages
     * @return the normalized JSON value, or {@code null}
     * @throws DocumentParseException if the value cannot be parsed or converted
     */
    static Object normalizeValue(Object value, String context) {
        if (value == null) {
            return null;
        }
        if (value instanceof JSONObject || value instanceof JSONArray) {
            return value;
        }
        if (value instanceof CharSequence) {
            String text = value.toString().trim();
            if (!StringUtil.hasText(text)) {
                return null;
            }
            return parseValue(text, context);
        }
        try {
            return unwrapNestedJsonString(JSON.parse(JSON.toJSONString(value)), context);
        } catch (DocumentParseException exception) {
            // Already carries context and a preview; rethrow unchanged, as parseValue does,
            // instead of wrapping it in a second DocumentParseException.
            throw exception;
        } catch (Exception exception) {
            throw new DocumentParseException(
                "Failed to normalize MinerU JSON value: " + context + ", preview=" + preview(String.valueOf(value)),
                exception
            );
        }
    }

    /**
     * Unwraps the value only when it looks like JSON; otherwise keeps the original value.
     *
     * <p>A character sequence is considered JSON-like when its trimmed form starts
     * with <code>{</code> or {@code [}, or when it is enclosed in double quotes.
     * Non-text values are always passed to {@link #normalizeValue(Object, String)}.</p>
     *
     * @param value   raw value
     * @param context description of the payload, used in error messages
     * @return the normalized value, or {@code value} itself when it is plain text
     * @throws DocumentParseException if a JSON-like value cannot be parsed
     */
    static Object normalizeValueIfJsonLike(Object value, String context) {
        if (!(value instanceof CharSequence)) {
            return normalizeValue(value, context);
        }
        String text = value.toString().trim();
        // A quoted JSON string needs an opening AND a closing quote. Without the length
        // check a lone '"' matches both startsWith and endsWith and would be sent to the
        // parser, failing instead of being preserved as plain text.
        boolean quoted = text.length() >= 2 && text.startsWith("\"") && text.endsWith("\"");
        if (!looksLikeJson(text) && !quoted) {
            return value;
        }
        return normalizeValue(value, context);
    }

    /**
     * Converts an arbitrary value to a JSON object.
     *
     * @param value   raw value
     * @param context description of the payload, used in error messages
     * @return the JSON object, or {@code null} when the value normalizes to {@code null}
     * @throws DocumentParseException if the normalized value is not a {@link JSONObject}
     */
    static JSONObject asObject(Object value, String context) {
        Object normalized = normalizeValue(value, context);
        if (normalized == null) {
            return null;
        }
        if (normalized instanceof JSONObject) {
            return (JSONObject) normalized;
        }
        throw unexpectedType(context, "JSONObject", normalized);
    }

    /**
     * Converts an arbitrary value to a JSON array.
     *
     * @param value   raw value
     * @param context description of the payload, used in error messages
     * @return the JSON array, or {@code null} when the value normalizes to {@code null}
     * @throws DocumentParseException if the normalized value is not a {@link JSONArray}
     */
    static JSONArray asArray(Object value, String context) {
        Object normalized = normalizeValue(value, context);
        if (normalized == null) {
            return null;
        }
        if (normalized instanceof JSONArray) {
            return (JSONArray) normalized;
        }
        throw unexpectedType(context, "JSONArray", normalized);
    }

    /**
     * Parses JSON text and automatically unwraps values that were string-encoded
     * more than once.
     *
     * @param text    JSON text
     * @param context description of the payload, used in error messages
     * @return the parsed value
     * @throws DocumentParseException if the text is empty or cannot be parsed
     */
    static Object parseValue(String text, String context) {
        String trimmed = text == null ? null : text.trim();
        if (!StringUtil.hasText(trimmed)) {
            throw new DocumentParseException("MinerU JSON payload is empty: " + context);
        }
        try {
            return unwrapNestedJsonString(JSON.parse(trimmed), context);
        } catch (DocumentParseException exception) {
            throw exception;
        } catch (Exception exception) {
            throw new DocumentParseException(
                "Failed to parse MinerU JSON payload: " + context + ", preview=" + preview(trimmed),
                exception
            );
        }
    }

    /**
     * Re-parses the value for as long as it is a string whose trimmed form starts
     * with <code>{</code> or {@code [}, for at most {@link #MAX_JSON_UNWRAP_DEPTH} rounds.
     *
     * @param value   parsed value that may still be a string-encoded JSON document
     * @param context description of the payload, used in error messages
     * @return the first non-string or non-JSON-like value reached, or whatever is held
     *         after the last allowed round
     * @throws DocumentParseException if a JSON-like string fails to parse
     */
    private static Object unwrapNestedJsonString(Object value, String context) {
        Object current = value;
        for (int depth = 0; depth < MAX_JSON_UNWRAP_DEPTH; depth++) {
            if (!(current instanceof String)) {
                return current;
            }
            String text = ((String) current).trim();
            if (!looksLikeJson(text)) {
                // Plain string content: hand back the untrimmed original.
                return current;
            }
            try {
                current = JSON.parse(text);
            } catch (Exception exception) {
                throw new DocumentParseException(
                    "Failed to unwrap MinerU nested JSON string: " + context + ", preview=" + preview(text),
                    exception
                );
            }
        }
        return current;
    }

    /**
     * Tells whether the text starts with an object or array opener.
     *
     * @param text candidate text, expected to be trimmed by the caller
     * @return {@code true} when the first char is <code>{</code> or {@code [}
     */
    private static boolean looksLikeJson(String text) {
        if (!StringUtil.hasText(text)) {
            return false;
        }
        char first = text.charAt(0);
        return first == '{' || first == '[';
    }

    /**
     * Builds the exception reported when a payload resolves to the wrong JSON type.
     *
     * @param context      description of the payload
     * @param expectedType simple name of the expected type
     * @param actualValue  the value actually obtained, may be {@code null}
     * @return the exception to throw
     */
    private static DocumentParseException unexpectedType(String context, String expectedType, Object actualValue) {
        String actualType = actualValue == null ? "null" : actualValue.getClass().getSimpleName();
        return new DocumentParseException(
            "MinerU JSON payload type mismatch: " + context + ", expected=" + expectedType + ", actual=" + actualType
        );
    }

    /**
     * Produces a single-line excerpt of the text for use in error messages.
     *
     * @param text source text, may be {@code null}
     * @return an empty string for {@code null}; otherwise the text with line breaks
     *         replaced by spaces, cut to about {@link #PREVIEW_LIMIT} chars and
     *         suffixed with {@code ...} when it was cut
     */
    private static String preview(String text) {
        if (text == null) {
            return "";
        }
        String normalized = text.replace('\n', ' ').replace('\r', ' ');
        if (normalized.length() <= PREVIEW_LIMIT) {
            return normalized;
        }
        int end = PREVIEW_LIMIT;
        // Do not cut between the two halves of a surrogate pair; that would leave an
        // unpaired high surrogate in the message.
        if (Character.isHighSurrogate(normalized.charAt(end - 1))) {
            end--;
        }
        return normalized.substring(0, end) + "...";
    }
}

View File

@@ -116,10 +116,16 @@ public class MineruMapper {
payload.setBackend(jsonObject.getString("backend"));
payload.setVersion(jsonObject.getString("version"));
Map<String, JSONObject> results = new LinkedHashMap<String, JSONObject>();
JSONObject resultJson = jsonObject.getJSONObject("results");
JSONObject resultJson = MineruJsonSupport.asObject(jsonObject.get("results"), "MinerU sync results");
if (resultJson != null) {
for (String key : resultJson.keySet()) {
results.put(key, resultJson.getJSONObject(key));
JSONObject result = MineruJsonSupport.asObject(
resultJson.get(key),
"MinerU sync result entry: " + key
);
if (result != null) {
results.put(key, result);
}
}
}
payload.setResults(results);
@@ -239,12 +245,24 @@ public class MineruMapper {
result.setPlainText(result.getMarkdown());
ParseArtifacts artifacts = new ParseArtifacts();
artifacts.setMiddleJson(fileResult.get("middle_json"));
artifacts.setContentList(fileResult.get("content_list"));
artifacts.setModelOutput(fileResult.get("model_output"));
artifacts.setMiddleJson(MineruJsonSupport.normalizeValue(
fileResult.get("middle_json"),
"MinerU result " + fileName + " middle_json"
));
artifacts.setContentList(MineruJsonSupport.normalizeValue(
fileResult.get("content_list"),
"MinerU result " + fileName + " content_list"
));
artifacts.setModelOutput(MineruJsonSupport.normalizeValueIfJsonLike(
fileResult.get("model_output"),
"MinerU result " + fileName + " model_output"
));
result.setArtifacts(artifacts);
Map<String, String> imageDataUrls = toStringMap(fileResult.getJSONObject("images"));
Map<String, String> imageDataUrls = toStringMap(MineruJsonSupport.asObject(
fileResult.get("images"),
"MinerU result " + fileName + " images"
));
Map<String, byte[]> imageContents = toBinaryMap(imageDataUrls);
applyStructuredArtifacts(result, imageDataUrls, imageContents);
if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) {
@@ -266,8 +284,8 @@ public class MineruMapper {
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
JSONObject middleJson = asObject(middleArtifact);
JSONArray contentList = asArray(contentListArtifact);
JSONObject middleJson = asObject(middleArtifact, "MinerU ZIP middle artifact: " + fileName);
JSONArray contentList = asArray(contentListArtifact, "MinerU ZIP content_list artifact: " + fileName);
Object modelOutput = modelOutputArtifact;
if (contentList == null && middleArtifact instanceof JSONArray) {
@@ -283,7 +301,10 @@ public class MineruMapper {
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
artifacts.setModelOutput(modelOutput);
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
JSONArray contentListV2 = asArray(
firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"),
"MinerU ZIP content_list_v2 artifact: " + fileName
);
if (contentListV2 != null) {
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
}
@@ -308,8 +329,8 @@ public class MineruMapper {
}
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls, Map<String, byte[]> imageContents) {
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson());
JSONArray contentList = asArray(result.getArtifacts().getContentList());
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson(), "MinerU middle_json artifact");
JSONArray contentList = asArray(result.getArtifacts().getContentList(), "MinerU content_list artifact");
if (middleJson != null) {
fillPages(result, middleJson);
@@ -576,34 +597,18 @@ public class MineruMapper {
if (!StringUtil.hasText(text)) {
return null;
}
try {
return JSON.parse(text);
} catch (Exception exception) {
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
}
return MineruJsonSupport.parseValue(text, "MinerU ZIP artifact " + suffix);
}
private JSONObject asObject(Object value) {
if (value instanceof JSONObject) {
return (JSONObject) value;
}
if (value == null) {
return null;
}
private JSONObject asObject(Object value, String context) {
if (value instanceof JSONArray) {
return null;
}
return JSON.parseObject(JSON.toJSONString(value));
return MineruJsonSupport.asObject(value, context);
}
private JSONArray asArray(Object value) {
if (value instanceof JSONArray) {
return (JSONArray) value;
}
if (value == null) {
return null;
}
return JSON.parseArray(JSON.toJSONString(value));
private JSONArray asArray(Object value, String context) {
return MineruJsonSupport.asArray(value, context);
}
private List<String> toStringList(JSONArray jsonArray) {