refactor: 收敛文档 OCR 配置并补齐 Office 解析兼容

- 统一 MinerU OCR 配置结构并移除分模块冗余属性类

- 补齐 JSON 字符串化结果拆包、XLSX 图片兼容与对应回归测试
This commit is contained in:
2026-04-18 13:01:17 +08:00
parent b66876d0fd
commit 56ee149e7c
15 changed files with 559 additions and 272 deletions

View File

@@ -1,6 +1,5 @@
package com.easyagents.document.core.mineru;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.exception.DocumentParseException;
@@ -16,6 +15,7 @@ import okhttp3.ResponseBody;
import java.io.IOException;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
@@ -115,7 +115,10 @@ public class MineruClient {
}
String contentType = response.header("Content-Type");
if (contentType != null && contentType.contains("application/json")) {
JSONObject jsonObject = JSON.parseObject(new String(responseBytes));
JSONObject jsonObject = MineruJsonSupport.parseObject(
new String(responseBytes, StandardCharsets.UTF_8),
"MinerU async result response: " + path
);
throw new DocumentParseException("MinerU async result is not ready: " + jsonObject.toJSONString());
}
if (responseBytes.length < 2 || responseBytes[0] != 'P' || responseBytes[1] != 'K') {
@@ -148,9 +151,13 @@ public class MineruClient {
ResponseBody body = response.body();
String bodyText = body == null ? "" : body.string();
if (!response.isSuccessful()) {
throw buildHttpException(path, response.code(), bodyText == null ? new byte[0] : bodyText.getBytes());
throw buildHttpException(
path,
response.code(),
bodyText == null ? new byte[0] : bodyText.getBytes(StandardCharsets.UTF_8)
);
}
return JSON.parseObject(bodyText);
return MineruJsonSupport.parseObject(bodyText, "MinerU response body: " + path);
} catch (IOException exception) {
throw new DocumentParseException("Failed to call MinerU endpoint: " + path, exception);
}
@@ -197,7 +204,7 @@ public class MineruClient {
}
private DocumentParseException buildHttpException(String path, int statusCode, byte[] bodyBytes) {
String bodyText = bodyBytes == null ? "" : new String(bodyBytes);
String bodyText = bodyBytes == null ? "" : new String(bodyBytes, StandardCharsets.UTF_8);
return new DocumentParseException(
"MinerU request failed: path=" + path + ", status=" + statusCode + ", body=" + bodyText
);

View File

@@ -0,0 +1,199 @@
package com.easyagents.document.core.mineru;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.exception.DocumentParseException;
/**
 * Compatibility helpers for parsing MinerU JSON payloads.
 *
 * <p>Some deployments wrap structured fields in an extra layer of JSON string
 * encoding, for example {@code middle_json="{}"}, or return the whole response
 * body as {@code "{\"results\":{...}}"}. This utility unwraps such values in one
 * place so that callers do not have to repeat these compatibility branches.</p>
 *
 * @author Codex
 * @since 2026-04-17
 */
final class MineruJsonSupport {

    /** Maximum number of string-encoded JSON layers peeled off a single value. */
    private static final int MAX_JSON_UNWRAP_DEPTH = 4;

    /** Maximum number of payload characters echoed into an error message. */
    private static final int PREVIEW_LIMIT = 160;

    private MineruJsonSupport() {
    }

    /**
     * Parses JSON text and requires the final (unwrapped) result to be an object.
     *
     * @param text    JSON text
     * @param context description of the payload, used in error messages
     * @return the parsed JSON object
     * @throws DocumentParseException if the text is blank, cannot be parsed,
     *                                or does not resolve to a JSON object
     */
    static JSONObject parseObject(String text, String context) {
        Object value = parseValue(text, context);
        if (value instanceof JSONObject) {
            return (JSONObject) value;
        }
        throw unexpectedType(context, "JSONObject", value);
    }

    /**
     * Normalizes an arbitrary value into its JSON representation.
     *
     * <p>{@code null} and blank text yield {@code null}; {@link JSONObject} and
     * {@link JSONArray} instances are returned as-is; any other character sequence
     * is parsed as JSON text; every other value is serialized and parsed again,
     * after which string-encoded JSON layers are unwrapped.</p>
     *
     * @param value   raw value
     * @param context description of the payload, used in error messages
     * @return the normalized JSON value, or {@code null}
     * @throws DocumentParseException if the value cannot be parsed or converted
     */
    static Object normalizeValue(Object value, String context) {
        if (value == null) {
            return null;
        }
        if (value instanceof JSONObject || value instanceof JSONArray) {
            return value;
        }
        if (value instanceof CharSequence) {
            String text = value.toString().trim();
            if (!StringUtil.hasText(text)) {
                return null;
            }
            return parseValue(text, context);
        }
        try {
            return unwrapNestedJsonString(JSON.parse(JSON.toJSONString(value)), context);
        } catch (DocumentParseException exception) {
            // Already carries a precise message; rethrow unchanged, as parseValue does,
            // instead of burying it under a second generic wrapper.
            throw exception;
        } catch (Exception exception) {
            throw new DocumentParseException(
                "Failed to normalize MinerU JSON value: " + context + ", preview=" + preview(String.valueOf(value)),
                exception
            );
        }
    }

    /**
     * Unwraps the value only when it looks like JSON; otherwise keeps the original value.
     *
     * <p>Text counts as JSON-like when, after trimming, it starts with <code>{</code>
     * or {@code [}, or is enclosed in a pair of double quotes.</p>
     *
     * @param value   raw value
     * @param context description of the payload, used in error messages
     * @return the normalized value, or the original value when it is plain text
     * @throws DocumentParseException if JSON-like text fails to parse
     */
    static Object normalizeValueIfJsonLike(Object value, String context) {
        if (!(value instanceof CharSequence)) {
            return normalizeValue(value, context);
        }
        String text = value.toString().trim();
        // A quoted JSON string needs an opening and a closing quote. Without the length
        // check a lone '"' satisfies both startsWith and endsWith and would be sent to
        // the parser, failing instead of being preserved as plain text.
        boolean quotedString = text.length() >= 2 && text.startsWith("\"") && text.endsWith("\"");
        if (!looksLikeJson(text) && !quotedString) {
            return value;
        }
        return normalizeValue(value, context);
    }

    /**
     * Converts an arbitrary value into a JSON object.
     *
     * @param value   raw value
     * @param context description of the payload, used in error messages
     * @return the JSON object, or {@code null} when the value normalizes to {@code null}
     * @throws DocumentParseException if the normalized value is not a JSON object
     */
    static JSONObject asObject(Object value, String context) {
        Object normalized = normalizeValue(value, context);
        if (normalized == null) {
            return null;
        }
        if (normalized instanceof JSONObject) {
            return (JSONObject) normalized;
        }
        throw unexpectedType(context, "JSONObject", normalized);
    }

    /**
     * Converts an arbitrary value into a JSON array.
     *
     * @param value   raw value
     * @param context description of the payload, used in error messages
     * @return the JSON array, or {@code null} when the value normalizes to {@code null}
     * @throws DocumentParseException if the normalized value is not a JSON array
     */
    static JSONArray asArray(Object value, String context) {
        Object normalized = normalizeValue(value, context);
        if (normalized == null) {
            return null;
        }
        if (normalized instanceof JSONArray) {
            return (JSONArray) normalized;
        }
        throw unexpectedType(context, "JSONArray", normalized);
    }

    /**
     * Parses JSON text and unwraps string values that themselves contain JSON.
     *
     * @param text    JSON text
     * @param context description of the payload, used in error messages
     * @return the parsed (and unwrapped) value
     * @throws DocumentParseException if the text is {@code null}, blank, or not valid JSON
     */
    static Object parseValue(String text, String context) {
        String trimmed = text == null ? null : text.trim();
        if (!StringUtil.hasText(trimmed)) {
            throw new DocumentParseException("MinerU JSON payload is empty: " + context);
        }
        try {
            return unwrapNestedJsonString(JSON.parse(trimmed), context);
        } catch (DocumentParseException exception) {
            // Raised by the unwrap step with its own message; do not wrap it again.
            throw exception;
        } catch (Exception exception) {
            throw new DocumentParseException(
                "Failed to parse MinerU JSON payload: " + context + ", preview=" + preview(trimmed),
                exception
            );
        }
    }

    /**
     * Repeatedly parses a string value while it looks like a JSON object or array,
     * up to {@link #MAX_JSON_UNWRAP_DEPTH} times. Non-string values and strings that
     * do not look like JSON are returned unchanged.
     */
    private static Object unwrapNestedJsonString(Object value, String context) {
        Object current = value;
        for (int depth = 0; depth < MAX_JSON_UNWRAP_DEPTH; depth++) {
            if (!(current instanceof String)) {
                return current;
            }
            String text = ((String) current).trim();
            if (!looksLikeJson(text)) {
                return current;
            }
            try {
                current = JSON.parse(text);
            } catch (Exception exception) {
                throw new DocumentParseException(
                    "Failed to unwrap MinerU nested JSON string: " + context + ", preview=" + preview(text),
                    exception
                );
            }
        }
        // Depth limit reached: hand back whatever the last parse produced.
        return current;
    }

    /**
     * Reports whether the text starts with an object or array opener. The text is
     * inspected as given, so callers are expected to pass it already trimmed.
     */
    private static boolean looksLikeJson(String text) {
        if (!StringUtil.hasText(text)) {
            return false;
        }
        char first = text.charAt(0);
        return first == '{' || first == '[';
    }

    /** Builds the exception raised when a payload resolves to an unexpected JSON type. */
    private static DocumentParseException unexpectedType(String context, String expectedType, Object actualValue) {
        String actualType = actualValue == null ? "null" : actualValue.getClass().getSimpleName();
        return new DocumentParseException(
            "MinerU JSON payload type mismatch: " + context + ", expected=" + expectedType + ", actual=" + actualType
        );
    }

    /**
     * Produces a single-line excerpt of at most {@link #PREVIEW_LIMIT} characters
     * for error messages, followed by {@code "..."} when the text was truncated.
     */
    private static String preview(String text) {
        if (text == null) {
            return "";
        }
        String normalized = text.replace('\n', ' ').replace('\r', ' ');
        if (normalized.length() <= PREVIEW_LIMIT) {
            return normalized;
        }
        int end = PREVIEW_LIMIT;
        // Do not cut a surrogate pair in half: an unpaired high surrogate would leave
        // a malformed character at the end of the error message.
        if (Character.isHighSurrogate(normalized.charAt(end - 1))) {
            end--;
        }
        return normalized.substring(0, end) + "...";
    }
}

View File

@@ -116,10 +116,16 @@ public class MineruMapper {
payload.setBackend(jsonObject.getString("backend"));
payload.setVersion(jsonObject.getString("version"));
Map<String, JSONObject> results = new LinkedHashMap<String, JSONObject>();
JSONObject resultJson = jsonObject.getJSONObject("results");
JSONObject resultJson = MineruJsonSupport.asObject(jsonObject.get("results"), "MinerU sync results");
if (resultJson != null) {
for (String key : resultJson.keySet()) {
results.put(key, resultJson.getJSONObject(key));
JSONObject result = MineruJsonSupport.asObject(
resultJson.get(key),
"MinerU sync result entry: " + key
);
if (result != null) {
results.put(key, result);
}
}
}
payload.setResults(results);
@@ -239,12 +245,24 @@ public class MineruMapper {
result.setPlainText(result.getMarkdown());
ParseArtifacts artifacts = new ParseArtifacts();
artifacts.setMiddleJson(fileResult.get("middle_json"));
artifacts.setContentList(fileResult.get("content_list"));
artifacts.setModelOutput(fileResult.get("model_output"));
artifacts.setMiddleJson(MineruJsonSupport.normalizeValue(
fileResult.get("middle_json"),
"MinerU result " + fileName + " middle_json"
));
artifacts.setContentList(MineruJsonSupport.normalizeValue(
fileResult.get("content_list"),
"MinerU result " + fileName + " content_list"
));
artifacts.setModelOutput(MineruJsonSupport.normalizeValueIfJsonLike(
fileResult.get("model_output"),
"MinerU result " + fileName + " model_output"
));
result.setArtifacts(artifacts);
Map<String, String> imageDataUrls = toStringMap(fileResult.getJSONObject("images"));
Map<String, String> imageDataUrls = toStringMap(MineruJsonSupport.asObject(
fileResult.get("images"),
"MinerU result " + fileName + " images"
));
Map<String, byte[]> imageContents = toBinaryMap(imageDataUrls);
applyStructuredArtifacts(result, imageDataUrls, imageContents);
if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) {
@@ -266,8 +284,8 @@ public class MineruMapper {
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
JSONObject middleJson = asObject(middleArtifact);
JSONArray contentList = asArray(contentListArtifact);
JSONObject middleJson = asObject(middleArtifact, "MinerU ZIP middle artifact: " + fileName);
JSONArray contentList = asArray(contentListArtifact, "MinerU ZIP content_list artifact: " + fileName);
Object modelOutput = modelOutputArtifact;
if (contentList == null && middleArtifact instanceof JSONArray) {
@@ -283,7 +301,10 @@ public class MineruMapper {
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
artifacts.setModelOutput(modelOutput);
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
JSONArray contentListV2 = asArray(
firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"),
"MinerU ZIP content_list_v2 artifact: " + fileName
);
if (contentListV2 != null) {
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
}
@@ -308,8 +329,8 @@ public class MineruMapper {
}
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls, Map<String, byte[]> imageContents) {
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson());
JSONArray contentList = asArray(result.getArtifacts().getContentList());
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson(), "MinerU middle_json artifact");
JSONArray contentList = asArray(result.getArtifacts().getContentList(), "MinerU content_list artifact");
if (middleJson != null) {
fillPages(result, middleJson);
@@ -576,34 +597,18 @@ public class MineruMapper {
if (!StringUtil.hasText(text)) {
return null;
}
try {
return JSON.parse(text);
} catch (Exception exception) {
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
}
return MineruJsonSupport.parseValue(text, "MinerU ZIP artifact " + suffix);
}
private JSONObject asObject(Object value) {
if (value instanceof JSONObject) {
return (JSONObject) value;
}
if (value == null) {
return null;
}
private JSONObject asObject(Object value, String context) {
if (value instanceof JSONArray) {
return null;
}
return JSON.parseObject(JSON.toJSONString(value));
return MineruJsonSupport.asObject(value, context);
}
private JSONArray asArray(Object value) {
if (value instanceof JSONArray) {
return (JSONArray) value;
}
if (value == null) {
return null;
}
return JSON.parseArray(JSON.toJSONString(value));
private JSONArray asArray(Object value, String context) {
return MineruJsonSupport.asArray(value, context);
}
private List<String> toStringList(JSONArray jsonArray) {

View File

@@ -49,6 +49,25 @@ public class MineruMapperTest {
Assert.assertNotNull(result.getArtifacts().getContentList());
}
// A sync payload whose result entry, middle_json, content_list and images are delivered
// as JSON-encoded strings must map like a structured payload, while a plain-text
// model_output is kept as the original string.
@Test
public void shouldMapStringifiedSyncArtifacts() {
    MineruMapper stringAwareMapper = new MineruMapper(defaultProperties());
    JSONObject rawPayload = syncPayloadWithStringifiedArtifacts();

    MineruResultPayload resultPayload = stringAwareMapper.toResultPayload(rawPayload);
    ParseResponse parsed = stringAwareMapper.toParseResponse(resultPayload);

    // Exactly one file result is expected.
    Assert.assertEquals(1, parsed.getResults().size());
    ParseResult mapped = parsed.getResults().get(0);

    // Content derived from the unwrapped artifacts.
    Assert.assertEquals("# title", mapped.getMarkdown());
    Assert.assertFalse(mapped.getBlocks().isEmpty());
    Assert.assertEquals(1, mapped.getTables().size());
    Assert.assertEquals(2, mapped.getImages().size());
    Assert.assertNotNull(mapped.getImages().get(0).getContent());

    // Artifacts are exposed as structured JSON; model_output stays plain text.
    Assert.assertTrue(mapped.getArtifacts().getMiddleJson() instanceof JSONObject);
    Assert.assertTrue(mapped.getArtifacts().getContentList() instanceof JSONArray);
    Assert.assertEquals("plain-model-output", mapped.getArtifacts().getModelOutput());
}
@Test
public void shouldMapZipResponse() throws IOException {
MineruMapper mapper = new MineruMapper(defaultProperties());
@@ -292,6 +311,28 @@ public class MineruMapperTest {
return payload;
}
// Builds a sync payload in which the per-file result, middle_json, content_list and
// images are all serialized to JSON strings; model_output is a plain string.
private JSONObject syncPayloadWithStringifiedArtifacts() {
    // Two data-URL images, later embedded as a JSON string.
    JSONObject imageDataUrls = new JSONObject();
    imageDataUrls.put("figure.png", "data:image/png;base64,ZmFrZQ==");
    imageDataUrls.put("table.png", "data:image/png;base64,ZmFrZQ==");

    // Per-file result: every structured field is stringified.
    JSONObject fileEntry = new JSONObject();
    fileEntry.put("md_content", "# title");
    fileEntry.put("middle_json", middleJson().toJSONString());
    fileEntry.put("content_list", contentList().toJSONString());
    fileEntry.put("model_output", "plain-model-output");
    fileEntry.put("images", imageDataUrls.toJSONString());

    // The result entry itself is stored as a JSON string under its file key.
    JSONObject resultsByFile = new JSONObject();
    resultsByFile.put("demo", fileEntry.toJSONString());

    JSONObject root = new JSONObject();
    root.put("backend", "vlm-http-client");
    root.put("version", "3.0.9");
    root.put("results", resultsByFile);
    return root;
}
private JSONObject middleBlock(String type, String imagePath) {
JSONObject block = new JSONObject();
block.put("type", type);

View File

@@ -81,6 +81,29 @@ public class MineruPptxDocumentParseServiceTest {
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
}
// The PPTX service must handle a client that returns slide results, middle_json and
// content_list as JSON-encoded strings (RecordingClient created with stringifyArtifacts=true).
@Test
public void shouldSupportStringifiedMineruSlideArtifacts() throws IOException {
    RecordingClient stringifyingClient = new RecordingClient(defaultProperties(), true);
    MineruMapper slideMapper = new MineruMapper(defaultProperties());
    MineruPptxDocumentParseService pptxService = new MineruPptxDocumentParseService(
        defaultProperties(),
        stringifyingClient,
        slideMapper,
        new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
    );

    PptxParseRequest pptxRequest = new PptxParseRequest();
    pptxRequest.addFile(ParseFile.of("demo.pptx", buildPptxBytes()));

    ParseResponse parsed = pptxService.parse(pptxRequest);

    Assert.assertEquals(1, parsed.getResults().size());
    ParseResult deckResult = parsed.getResults().get(0);
    Assert.assertTrue(deckResult.getMarkdown().contains("slide-ocr-1"));
    Assert.assertFalse(deckResult.getBlocks().isEmpty());
    Assert.assertEquals(2, deckResult.getImages().size());
}
private byte[] buildPptxBytes() throws IOException {
XMLSlideShow slideShow = new XMLSlideShow();
slideShow.setPageSize(new java.awt.Dimension(640, 360));
@@ -117,9 +140,15 @@ public class MineruPptxDocumentParseServiceTest {
private static class RecordingClient extends MineruClient {
private int parseCount;
private final boolean stringifyArtifacts;
private RecordingClient(MineruProperties properties) {
this(properties, false);
}
private RecordingClient(MineruProperties properties, boolean stringifyArtifacts) {
super(properties, new MineruMapper(properties));
this.stringifyArtifacts = stringifyArtifacts;
}
@Override
@@ -134,10 +163,10 @@ public class MineruPptxDocumentParseServiceTest {
payload.put("version", "3.0.9");
JSONObject result = new JSONObject();
result.put("md_content", "slide-ocr-" + index);
result.put("middle_json", middleJson());
result.put("content_list", contentList(index));
result.put("middle_json", stringifyArtifacts ? middleJson().toJSONString() : middleJson());
result.put("content_list", stringifyArtifacts ? contentList(index).toJSONString() : contentList(index));
JSONObject results = new JSONObject();
results.put("slide-" + index, result);
results.put("slide-" + index, stringifyArtifacts ? result.toJSONString() : result);
payload.put("results", results);
return payload;
}

View File

@@ -253,10 +253,11 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
appendSheetHeader(extraction.markdown, sheet.getSheetName());
if (maxRow < 0 || maxCol <= 0) {
extraction.markdown.append("_empty sheet_");
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
if (!imageArtifacts.isEmpty()) {
appendImageOnlySheet(extraction.markdown, sheet.getSheetName(), request, imageArtifacts);
return extraction;
}
extraction.markdown.append("_empty sheet_");
return extraction;
}
@@ -488,6 +489,29 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
}
}
/**
 * Renders a sheet for which only image artifacts are available.
 *
 * <p>Writes a heading and one {@code [IMG:key]} placeholder per image. When the
 * request enables the image appendix, the appendix is appended; otherwise each
 * image is emitted as an inline Markdown image reference.</p>
 *
 * @param markdownBuilder target Markdown buffer
 * @param sheetName       name of the sheet being rendered
 * @param request         parse request carrying the image appendix flag
 * @param imageArtifacts  image artifacts of the sheet
 */
private void appendImageOnlySheet(StringBuilder markdownBuilder,
                                  String sheetName,
                                  XlsxParseRequest request,
                                  List<XlsxCellImageArtifact> imageArtifacts) {
    markdownBuilder.append("## ");
    markdownBuilder.append(sheetName);
    markdownBuilder.append(" 图片内容\n\n");

    // Placeholders are emitted regardless of the appendix setting.
    for (XlsxCellImageArtifact placeholder : imageArtifacts) {
        markdownBuilder.append("[IMG:");
        markdownBuilder.append(placeholder.getReferenceKey());
        markdownBuilder.append("]\n\n");
    }

    boolean appendixRequested = Boolean.TRUE.equals(request.getIncludeImageAppendix());
    if (appendixRequested) {
        appendImageAppendix(markdownBuilder, sheetName, imageArtifacts);
    } else {
        // No appendix: keep the images reachable through inline Markdown references.
        for (XlsxCellImageArtifact inlineImage : imageArtifacts) {
            markdownBuilder.append("![");
            markdownBuilder.append(inlineImage.getReferenceKey());
            markdownBuilder.append("](");
            markdownBuilder.append(inlineImage.getSourcePath());
            markdownBuilder.append(")\n\n");
        }
    }
}
private List<String> extractMergedRanges(XSSFSheet sheet) {
List<String> mergedRanges = new ArrayList<String>();
for (int index = 0; index < sheet.getNumMergedRegions(); index++) {

View File

@@ -138,7 +138,9 @@ public class MineruXlsxDocumentParseServiceTest {
XlsxParseArtifact artifact = extractXlsxArtifact(result);
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
Assert.assertTrue(result.getMarkdown().contains("_empty sheet_"));
Assert.assertFalse(result.getMarkdown().contains("_empty sheet_"));
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片内容"));
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明"));
Assert.assertTrue(result.getMarkdown().contains("![sheet1-r2c2-001](images/sheet1/r2c2-001.png)"));
Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]"));
@@ -147,6 +149,30 @@ public class MineruXlsxDocumentParseServiceTest {
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
}
// With the image appendix disabled, an image-only sheet must still show the image
// section, the placeholder and an inline Markdown image reference, but no appendix heading.
@Test
public void shouldKeepMarkdownImageReferenceWhenImageAppendixDisabled() throws Exception {
    RecordingClient recordingClient = new RecordingClient(defaultProperties());
    MineruMapper xlsxMapper = new MineruMapper(defaultProperties());
    MineruXlsxDocumentParseService xlsxService = new MineruXlsxDocumentParseService(
        defaultProperties(),
        recordingClient,
        xlsxMapper,
        new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
    );

    XlsxParseRequest appendixDisabledRequest = new XlsxParseRequest();
    appendixDisabledRequest.setIncludeImageAppendix(Boolean.FALSE);
    appendixDisabledRequest.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet()));

    ParseResponse parsed = xlsxService.parse(appendixDisabledRequest);
    ParseResult sheetResult = parsed.getResults().get(0);

    // Image section, placeholder and inline reference are present.
    Assert.assertTrue(sheetResult.getMarkdown().contains("## Sheet1 图片内容"));
    Assert.assertTrue(sheetResult.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
    Assert.assertTrue(sheetResult.getMarkdown().contains("![sheet1-r2c2-001](images/sheet1/r2c2-001.png)"));
    // The appendix heading must not be rendered.
    Assert.assertFalse(sheetResult.getMarkdown().contains("## Sheet1 图片说明"));
}
@Test
public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception {
RecordingClient client = new RecordingClient(defaultProperties());