refactor: 收敛文档 OCR 配置并补齐 Office 解析兼容
- 统一 MinerU OCR 配置结构并移除分模块冗余属性类 - 补齐 JSON 字符串化结果拆包、XLSX 图片兼容与对应回归测试
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
@@ -16,6 +15,7 @@ import okhttp3.ResponseBody;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URLConnection;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
@@ -115,7 +115,10 @@ public class MineruClient {
|
||||
}
|
||||
String contentType = response.header("Content-Type");
|
||||
if (contentType != null && contentType.contains("application/json")) {
|
||||
JSONObject jsonObject = JSON.parseObject(new String(responseBytes));
|
||||
JSONObject jsonObject = MineruJsonSupport.parseObject(
|
||||
new String(responseBytes, StandardCharsets.UTF_8),
|
||||
"MinerU async result response: " + path
|
||||
);
|
||||
throw new DocumentParseException("MinerU async result is not ready: " + jsonObject.toJSONString());
|
||||
}
|
||||
if (responseBytes.length < 2 || responseBytes[0] != 'P' || responseBytes[1] != 'K') {
|
||||
@@ -148,9 +151,13 @@ public class MineruClient {
|
||||
ResponseBody body = response.body();
|
||||
String bodyText = body == null ? "" : body.string();
|
||||
if (!response.isSuccessful()) {
|
||||
throw buildHttpException(path, response.code(), bodyText == null ? new byte[0] : bodyText.getBytes());
|
||||
throw buildHttpException(
|
||||
path,
|
||||
response.code(),
|
||||
bodyText == null ? new byte[0] : bodyText.getBytes(StandardCharsets.UTF_8)
|
||||
);
|
||||
}
|
||||
return JSON.parseObject(bodyText);
|
||||
return MineruJsonSupport.parseObject(bodyText, "MinerU response body: " + path);
|
||||
} catch (IOException exception) {
|
||||
throw new DocumentParseException("Failed to call MinerU endpoint: " + path, exception);
|
||||
}
|
||||
@@ -197,7 +204,7 @@ public class MineruClient {
|
||||
}
|
||||
|
||||
private DocumentParseException buildHttpException(String path, int statusCode, byte[] bodyBytes) {
|
||||
String bodyText = bodyBytes == null ? "" : new String(bodyBytes);
|
||||
String bodyText = bodyBytes == null ? "" : new String(bodyBytes, StandardCharsets.UTF_8);
|
||||
return new DocumentParseException(
|
||||
"MinerU request failed: path=" + path + ", status=" + statusCode + ", body=" + bodyText
|
||||
);
|
||||
|
||||
@@ -0,0 +1,199 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONArray;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
|
||||
/**
|
||||
* MinerU JSON 兼容解析工具。
|
||||
*
|
||||
* <p>部分部署返回的结构化字段会被额外包装成 JSON 字符串,
|
||||
* 例如 {@code middle_json="{}"} 或整个响应体直接返回
|
||||
* {@code "{\"results\":{...}}"}。该工具负责统一拆包,
|
||||
* 让上层模块无需重复处理这些兼容分支。</p>
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-17
|
||||
*/
|
||||
final class MineruJsonSupport {
|
||||
|
||||
private static final int MAX_JSON_UNWRAP_DEPTH = 4;
|
||||
private static final int PREVIEW_LIMIT = 160;
|
||||
|
||||
private MineruJsonSupport() {
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析 JSON 文本并要求最终结果为对象。
|
||||
*
|
||||
* @param text JSON 文本
|
||||
* @param context 错误上下文
|
||||
* @return JSON 对象
|
||||
*/
|
||||
static JSONObject parseObject(String text, String context) {
|
||||
Object value = parseValue(text, context);
|
||||
if (value instanceof JSONObject) {
|
||||
return (JSONObject) value;
|
||||
}
|
||||
throw unexpectedType(context, "JSONObject", value);
|
||||
}
|
||||
|
||||
/**
|
||||
* 规范化任意 JSON 值。
|
||||
*
|
||||
* @param value 原始值
|
||||
* @param context 错误上下文
|
||||
* @return 规范化后的 JSON 值
|
||||
*/
|
||||
static Object normalizeValue(Object value, String context) {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
if (value instanceof JSONObject || value instanceof JSONArray) {
|
||||
return value;
|
||||
}
|
||||
if (value instanceof CharSequence) {
|
||||
String text = value.toString().trim();
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return null;
|
||||
}
|
||||
return parseValue(text, context);
|
||||
}
|
||||
try {
|
||||
return unwrapNestedJsonString(JSON.parse(JSON.toJSONString(value)), context);
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException(
|
||||
"Failed to normalize MinerU JSON value: " + context + ", preview=" + preview(String.valueOf(value)),
|
||||
exception
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 在值看起来像 JSON 时才尝试拆包,否则保留原始值。
|
||||
*
|
||||
* @param value 原始值
|
||||
* @param context 错误上下文
|
||||
* @return 规范化后的值或原值
|
||||
*/
|
||||
static Object normalizeValueIfJsonLike(Object value, String context) {
|
||||
if (!(value instanceof CharSequence)) {
|
||||
return normalizeValue(value, context);
|
||||
}
|
||||
String text = value.toString().trim();
|
||||
if (!looksLikeJson(text) && !(text.startsWith("\"") && text.endsWith("\""))) {
|
||||
return value;
|
||||
}
|
||||
return normalizeValue(value, context);
|
||||
}
|
||||
|
||||
/**
|
||||
* 将任意 JSON 值转换为对象。
|
||||
*
|
||||
* @param value 原始值
|
||||
* @param context 错误上下文
|
||||
* @return JSON 对象
|
||||
*/
|
||||
static JSONObject asObject(Object value, String context) {
|
||||
Object normalized = normalizeValue(value, context);
|
||||
if (normalized == null) {
|
||||
return null;
|
||||
}
|
||||
if (normalized instanceof JSONObject) {
|
||||
return (JSONObject) normalized;
|
||||
}
|
||||
throw unexpectedType(context, "JSONObject", normalized);
|
||||
}
|
||||
|
||||
/**
|
||||
* 将任意 JSON 值转换为数组。
|
||||
*
|
||||
* @param value 原始值
|
||||
* @param context 错误上下文
|
||||
* @return JSON 数组
|
||||
*/
|
||||
static JSONArray asArray(Object value, String context) {
|
||||
Object normalized = normalizeValue(value, context);
|
||||
if (normalized == null) {
|
||||
return null;
|
||||
}
|
||||
if (normalized instanceof JSONArray) {
|
||||
return (JSONArray) normalized;
|
||||
}
|
||||
throw unexpectedType(context, "JSONArray", normalized);
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析 JSON 文本并自动拆解被双层包装的字符串值。
|
||||
*
|
||||
* @param text JSON 文本
|
||||
* @param context 错误上下文
|
||||
* @return 解析结果
|
||||
*/
|
||||
static Object parseValue(String text, String context) {
|
||||
String trimmed = text == null ? null : text.trim();
|
||||
if (!StringUtil.hasText(trimmed)) {
|
||||
throw new DocumentParseException("MinerU JSON payload is empty: " + context);
|
||||
}
|
||||
try {
|
||||
return unwrapNestedJsonString(JSON.parse(trimmed), context);
|
||||
} catch (DocumentParseException exception) {
|
||||
throw exception;
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException(
|
||||
"Failed to parse MinerU JSON payload: " + context + ", preview=" + preview(trimmed),
|
||||
exception
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
private static Object unwrapNestedJsonString(Object value, String context) {
|
||||
Object current = value;
|
||||
for (int depth = 0; depth < MAX_JSON_UNWRAP_DEPTH; depth++) {
|
||||
if (!(current instanceof String)) {
|
||||
return current;
|
||||
}
|
||||
String text = ((String) current).trim();
|
||||
if (!looksLikeJson(text)) {
|
||||
return current;
|
||||
}
|
||||
try {
|
||||
current = JSON.parse(text);
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException(
|
||||
"Failed to unwrap MinerU nested JSON string: " + context + ", preview=" + preview(text),
|
||||
exception
|
||||
);
|
||||
}
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
private static boolean looksLikeJson(String text) {
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return false;
|
||||
}
|
||||
char first = text.charAt(0);
|
||||
return first == '{' || first == '[';
|
||||
}
|
||||
|
||||
private static DocumentParseException unexpectedType(String context, String expectedType, Object actualValue) {
|
||||
String actualType = actualValue == null ? "null" : actualValue.getClass().getSimpleName();
|
||||
return new DocumentParseException(
|
||||
"MinerU JSON payload type mismatch: " + context + ", expected=" + expectedType + ", actual=" + actualType
|
||||
);
|
||||
}
|
||||
|
||||
private static String preview(String text) {
|
||||
if (text == null) {
|
||||
return "";
|
||||
}
|
||||
String normalized = text.replace('\n', ' ').replace('\r', ' ');
|
||||
if (normalized.length() <= PREVIEW_LIMIT) {
|
||||
return normalized;
|
||||
}
|
||||
return normalized.substring(0, PREVIEW_LIMIT) + "...";
|
||||
}
|
||||
}
|
||||
@@ -116,10 +116,16 @@ public class MineruMapper {
|
||||
payload.setBackend(jsonObject.getString("backend"));
|
||||
payload.setVersion(jsonObject.getString("version"));
|
||||
Map<String, JSONObject> results = new LinkedHashMap<String, JSONObject>();
|
||||
JSONObject resultJson = jsonObject.getJSONObject("results");
|
||||
JSONObject resultJson = MineruJsonSupport.asObject(jsonObject.get("results"), "MinerU sync results");
|
||||
if (resultJson != null) {
|
||||
for (String key : resultJson.keySet()) {
|
||||
results.put(key, resultJson.getJSONObject(key));
|
||||
JSONObject result = MineruJsonSupport.asObject(
|
||||
resultJson.get(key),
|
||||
"MinerU sync result entry: " + key
|
||||
);
|
||||
if (result != null) {
|
||||
results.put(key, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
payload.setResults(results);
|
||||
@@ -239,12 +245,24 @@ public class MineruMapper {
|
||||
result.setPlainText(result.getMarkdown());
|
||||
|
||||
ParseArtifacts artifacts = new ParseArtifacts();
|
||||
artifacts.setMiddleJson(fileResult.get("middle_json"));
|
||||
artifacts.setContentList(fileResult.get("content_list"));
|
||||
artifacts.setModelOutput(fileResult.get("model_output"));
|
||||
artifacts.setMiddleJson(MineruJsonSupport.normalizeValue(
|
||||
fileResult.get("middle_json"),
|
||||
"MinerU result " + fileName + " middle_json"
|
||||
));
|
||||
artifacts.setContentList(MineruJsonSupport.normalizeValue(
|
||||
fileResult.get("content_list"),
|
||||
"MinerU result " + fileName + " content_list"
|
||||
));
|
||||
artifacts.setModelOutput(MineruJsonSupport.normalizeValueIfJsonLike(
|
||||
fileResult.get("model_output"),
|
||||
"MinerU result " + fileName + " model_output"
|
||||
));
|
||||
result.setArtifacts(artifacts);
|
||||
|
||||
Map<String, String> imageDataUrls = toStringMap(fileResult.getJSONObject("images"));
|
||||
Map<String, String> imageDataUrls = toStringMap(MineruJsonSupport.asObject(
|
||||
fileResult.get("images"),
|
||||
"MinerU result " + fileName + " images"
|
||||
));
|
||||
Map<String, byte[]> imageContents = toBinaryMap(imageDataUrls);
|
||||
applyStructuredArtifacts(result, imageDataUrls, imageContents);
|
||||
if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) {
|
||||
@@ -266,8 +284,8 @@ public class MineruMapper {
|
||||
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
|
||||
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
|
||||
|
||||
JSONObject middleJson = asObject(middleArtifact);
|
||||
JSONArray contentList = asArray(contentListArtifact);
|
||||
JSONObject middleJson = asObject(middleArtifact, "MinerU ZIP middle artifact: " + fileName);
|
||||
JSONArray contentList = asArray(contentListArtifact, "MinerU ZIP content_list artifact: " + fileName);
|
||||
Object modelOutput = modelOutputArtifact;
|
||||
|
||||
if (contentList == null && middleArtifact instanceof JSONArray) {
|
||||
@@ -283,7 +301,10 @@ public class MineruMapper {
|
||||
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
|
||||
artifacts.setModelOutput(modelOutput);
|
||||
|
||||
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
|
||||
JSONArray contentListV2 = asArray(
|
||||
firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"),
|
||||
"MinerU ZIP content_list_v2 artifact: " + fileName
|
||||
);
|
||||
if (contentListV2 != null) {
|
||||
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
|
||||
}
|
||||
@@ -308,8 +329,8 @@ public class MineruMapper {
|
||||
}
|
||||
|
||||
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls, Map<String, byte[]> imageContents) {
|
||||
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson());
|
||||
JSONArray contentList = asArray(result.getArtifacts().getContentList());
|
||||
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson(), "MinerU middle_json artifact");
|
||||
JSONArray contentList = asArray(result.getArtifacts().getContentList(), "MinerU content_list artifact");
|
||||
|
||||
if (middleJson != null) {
|
||||
fillPages(result, middleJson);
|
||||
@@ -576,34 +597,18 @@ public class MineruMapper {
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return JSON.parse(text);
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
|
||||
}
|
||||
return MineruJsonSupport.parseValue(text, "MinerU ZIP artifact " + suffix);
|
||||
}
|
||||
|
||||
private JSONObject asObject(Object value) {
|
||||
if (value instanceof JSONObject) {
|
||||
return (JSONObject) value;
|
||||
}
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
private JSONObject asObject(Object value, String context) {
|
||||
if (value instanceof JSONArray) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseObject(JSON.toJSONString(value));
|
||||
return MineruJsonSupport.asObject(value, context);
|
||||
}
|
||||
|
||||
private JSONArray asArray(Object value) {
|
||||
if (value instanceof JSONArray) {
|
||||
return (JSONArray) value;
|
||||
}
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseArray(JSON.toJSONString(value));
|
||||
private JSONArray asArray(Object value, String context) {
|
||||
return MineruJsonSupport.asArray(value, context);
|
||||
}
|
||||
|
||||
private List<String> toStringList(JSONArray jsonArray) {
|
||||
|
||||
Reference in New Issue
Block a user