refactor: 收敛文档 OCR 配置并补齐 Office 解析兼容

- 统一 MinerU OCR 配置结构并移除分模块冗余属性类

- 补齐 JSON 字符串化结果拆包、XLSX 图片兼容与对应回归测试
This commit is contained in:
2026-04-18 13:01:17 +08:00
parent b66876d0fd
commit 56ee149e7c
15 changed files with 559 additions and 272 deletions

View File

@@ -1,6 +1,5 @@
package com.easyagents.document.core.mineru; package com.easyagents.document.core.mineru;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject; import com.alibaba.fastjson2.JSONObject;
import com.easyagents.core.util.StringUtil; import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.exception.DocumentParseException; import com.easyagents.document.core.exception.DocumentParseException;
@@ -16,6 +15,7 @@ import okhttp3.ResponseBody;
import java.io.IOException; import java.io.IOException;
import java.net.URLConnection; import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@@ -115,7 +115,10 @@ public class MineruClient {
} }
String contentType = response.header("Content-Type"); String contentType = response.header("Content-Type");
if (contentType != null && contentType.contains("application/json")) { if (contentType != null && contentType.contains("application/json")) {
JSONObject jsonObject = JSON.parseObject(new String(responseBytes)); JSONObject jsonObject = MineruJsonSupport.parseObject(
new String(responseBytes, StandardCharsets.UTF_8),
"MinerU async result response: " + path
);
throw new DocumentParseException("MinerU async result is not ready: " + jsonObject.toJSONString()); throw new DocumentParseException("MinerU async result is not ready: " + jsonObject.toJSONString());
} }
if (responseBytes.length < 2 || responseBytes[0] != 'P' || responseBytes[1] != 'K') { if (responseBytes.length < 2 || responseBytes[0] != 'P' || responseBytes[1] != 'K') {
@@ -148,9 +151,13 @@ public class MineruClient {
ResponseBody body = response.body(); ResponseBody body = response.body();
String bodyText = body == null ? "" : body.string(); String bodyText = body == null ? "" : body.string();
if (!response.isSuccessful()) { if (!response.isSuccessful()) {
throw buildHttpException(path, response.code(), bodyText == null ? new byte[0] : bodyText.getBytes()); throw buildHttpException(
path,
response.code(),
bodyText == null ? new byte[0] : bodyText.getBytes(StandardCharsets.UTF_8)
);
} }
return JSON.parseObject(bodyText); return MineruJsonSupport.parseObject(bodyText, "MinerU response body: " + path);
} catch (IOException exception) { } catch (IOException exception) {
throw new DocumentParseException("Failed to call MinerU endpoint: " + path, exception); throw new DocumentParseException("Failed to call MinerU endpoint: " + path, exception);
} }
@@ -197,7 +204,7 @@ public class MineruClient {
} }
private DocumentParseException buildHttpException(String path, int statusCode, byte[] bodyBytes) { private DocumentParseException buildHttpException(String path, int statusCode, byte[] bodyBytes) {
String bodyText = bodyBytes == null ? "" : new String(bodyBytes); String bodyText = bodyBytes == null ? "" : new String(bodyBytes, StandardCharsets.UTF_8);
return new DocumentParseException( return new DocumentParseException(
"MinerU request failed: path=" + path + ", status=" + statusCode + ", body=" + bodyText "MinerU request failed: path=" + path + ", status=" + statusCode + ", body=" + bodyText
); );

View File

@@ -0,0 +1,199 @@
package com.easyagents.document.core.mineru;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.exception.DocumentParseException;
/**
 * Compatibility helpers for parsing MinerU JSON payloads.
 *
 * <p>Some deployments wrap structured fields in an extra JSON string layer,
 * for example {@code middle_json="{}"}, or return the whole response body as
 * {@code "{\"results\":{...}}"}. This utility unwraps those layers in one place
 * so that callers do not have to repeat the compatibility branches.</p>
 *
 * @author Codex
 * @since 2026-04-17
 */
final class MineruJsonSupport {
    /** Upper bound on the number of nested string layers unwrapped for a single value. */
    private static final int MAX_JSON_UNWRAP_DEPTH = 4;
    /** Maximum number of payload characters echoed back in error messages. */
    private static final int PREVIEW_LIMIT = 160;

    private MineruJsonSupport() {
    }

    /**
     * Parses JSON text and requires the final (unwrapped) result to be an object.
     *
     * @param text    JSON text
     * @param context description of the payload, used in error messages
     * @return the parsed JSON object
     * @throws DocumentParseException if the text is empty, cannot be parsed, or does not
     *                                resolve to a {@link JSONObject}
     */
    static JSONObject parseObject(String text, String context) {
        Object value = parseValue(text, context);
        if (value instanceof JSONObject) {
            return (JSONObject) value;
        }
        throw unexpectedType(context, "JSONObject", value);
    }

    /**
     * Normalizes an arbitrary value into its JSON representation.
     *
     * <p>{@code null} and text without content yield {@code null}; JSON objects and arrays
     * are returned as-is; other text is parsed via {@link #parseValue(String, String)};
     * any other value is serialized and re-parsed.</p>
     *
     * @param value   raw value
     * @param context description of the value, used in error messages
     * @return the normalized JSON value, or {@code null} when there is nothing to normalize
     * @throws DocumentParseException if the value cannot be parsed or converted
     */
    static Object normalizeValue(Object value, String context) {
        if (value == null) {
            return null;
        }
        if (value instanceof JSONObject || value instanceof JSONArray) {
            return value;
        }
        if (value instanceof CharSequence) {
            String text = value.toString().trim();
            if (!StringUtil.hasText(text)) {
                return null;
            }
            return parseValue(text, context);
        }
        try {
            return unwrapNestedJsonString(JSON.parse(JSON.toJSONString(value)), context);
        } catch (DocumentParseException exception) {
            // Already carries a precise message; rethrow untouched, as parseValue does,
            // instead of wrapping it in a second DocumentParseException.
            throw exception;
        } catch (Exception exception) {
            throw new DocumentParseException(
                    "Failed to normalize MinerU JSON value: " + context + ", preview=" + preview(String.valueOf(value)),
                    exception
            );
        }
    }

    /**
     * Unwraps the value only when it looks like JSON; otherwise keeps the original value.
     *
     * <p>Text counts as JSON-like when it starts with <code>{</code> or {@code [}, or when
     * it is a double-quoted string of at least two characters. Non-text values are always
     * passed to {@link #normalizeValue(Object, String)}.</p>
     *
     * @param value   raw value
     * @param context description of the value, used in error messages
     * @return the normalized value, or {@code value} itself when it is plain text
     * @throws DocumentParseException if JSON-like text cannot be parsed
     */
    static Object normalizeValueIfJsonLike(Object value, String context) {
        if (!(value instanceof CharSequence)) {
            return normalizeValue(value, context);
        }
        String text = value.toString().trim();
        if (!looksLikeJson(text) && !isQuotedString(text)) {
            return value;
        }
        return normalizeValue(value, context);
    }

    /**
     * Converts an arbitrary value into a JSON object.
     *
     * @param value   raw value
     * @param context description of the value, used in error messages
     * @return the JSON object, or {@code null} when the value normalizes to {@code null}
     * @throws DocumentParseException if the normalized value is not a {@link JSONObject}
     */
    static JSONObject asObject(Object value, String context) {
        Object normalized = normalizeValue(value, context);
        if (normalized == null) {
            return null;
        }
        if (normalized instanceof JSONObject) {
            return (JSONObject) normalized;
        }
        throw unexpectedType(context, "JSONObject", normalized);
    }

    /**
     * Converts an arbitrary value into a JSON array.
     *
     * @param value   raw value
     * @param context description of the value, used in error messages
     * @return the JSON array, or {@code null} when the value normalizes to {@code null}
     * @throws DocumentParseException if the normalized value is not a {@link JSONArray}
     */
    static JSONArray asArray(Object value, String context) {
        Object normalized = normalizeValue(value, context);
        if (normalized == null) {
            return null;
        }
        if (normalized instanceof JSONArray) {
            return (JSONArray) normalized;
        }
        throw unexpectedType(context, "JSONArray", normalized);
    }

    /**
     * Parses JSON text and automatically unwraps string values that themselves contain JSON.
     *
     * @param text    JSON text
     * @param context description of the payload, used in error messages
     * @return the parsed (and unwrapped) value
     * @throws DocumentParseException if the text is {@code null}, has no content, or cannot be parsed
     */
    static Object parseValue(String text, String context) {
        String trimmed = text == null ? null : text.trim();
        if (!StringUtil.hasText(trimmed)) {
            throw new DocumentParseException("MinerU JSON payload is empty: " + context);
        }
        try {
            return unwrapNestedJsonString(JSON.parse(trimmed), context);
        } catch (DocumentParseException exception) {
            throw exception;
        } catch (Exception exception) {
            throw new DocumentParseException(
                    "Failed to parse MinerU JSON payload: " + context + ", preview=" + preview(trimmed),
                    exception
            );
        }
    }

    /**
     * Repeatedly parses {@code value} while it is a string that starts like a JSON object
     * or array, giving up after {@link #MAX_JSON_UNWRAP_DEPTH} rounds. The value reached at
     * that point is returned even if it is still a string.
     */
    private static Object unwrapNestedJsonString(Object value, String context) {
        Object current = value;
        for (int depth = 0; depth < MAX_JSON_UNWRAP_DEPTH; depth++) {
            if (!(current instanceof String)) {
                return current;
            }
            String text = ((String) current).trim();
            if (!looksLikeJson(text)) {
                return current;
            }
            try {
                current = JSON.parse(text);
            } catch (Exception exception) {
                throw new DocumentParseException(
                        "Failed to unwrap MinerU nested JSON string: " + context + ", preview=" + preview(text),
                        exception
                );
            }
        }
        return current;
    }

    /** Returns whether the text starts with the opening character of a JSON object or array. */
    private static boolean looksLikeJson(String text) {
        if (!StringUtil.hasText(text)) {
            return false;
        }
        char first = text.charAt(0);
        return first == '{' || first == '[';
    }

    /**
     * Returns whether the text is wrapped in double quotes. At least two characters are
     * required so that a lone {@code "} is not mistaken for a quoted JSON string.
     */
    private static boolean isQuotedString(String text) {
        return text.length() >= 2 && text.startsWith("\"") && text.endsWith("\"");
    }

    /** Builds the exception reported when a payload resolves to an unexpected JSON type. */
    private static DocumentParseException unexpectedType(String context, String expectedType, Object actualValue) {
        String actualType = actualValue == null ? "null" : actualValue.getClass().getSimpleName();
        return new DocumentParseException(
                "MinerU JSON payload type mismatch: " + context + ", expected=" + expectedType + ", actual=" + actualType
        );
    }

    /** Produces a single-line excerpt of at most {@link #PREVIEW_LIMIT} characters for error messages. */
    private static String preview(String text) {
        if (text == null) {
            return "";
        }
        String normalized = text.replace('\n', ' ').replace('\r', ' ');
        if (normalized.length() <= PREVIEW_LIMIT) {
            return normalized;
        }
        return normalized.substring(0, PREVIEW_LIMIT) + "...";
    }
}

View File

@@ -116,10 +116,16 @@ public class MineruMapper {
payload.setBackend(jsonObject.getString("backend")); payload.setBackend(jsonObject.getString("backend"));
payload.setVersion(jsonObject.getString("version")); payload.setVersion(jsonObject.getString("version"));
Map<String, JSONObject> results = new LinkedHashMap<String, JSONObject>(); Map<String, JSONObject> results = new LinkedHashMap<String, JSONObject>();
JSONObject resultJson = jsonObject.getJSONObject("results"); JSONObject resultJson = MineruJsonSupport.asObject(jsonObject.get("results"), "MinerU sync results");
if (resultJson != null) { if (resultJson != null) {
for (String key : resultJson.keySet()) { for (String key : resultJson.keySet()) {
results.put(key, resultJson.getJSONObject(key)); JSONObject result = MineruJsonSupport.asObject(
resultJson.get(key),
"MinerU sync result entry: " + key
);
if (result != null) {
results.put(key, result);
}
} }
} }
payload.setResults(results); payload.setResults(results);
@@ -239,12 +245,24 @@ public class MineruMapper {
result.setPlainText(result.getMarkdown()); result.setPlainText(result.getMarkdown());
ParseArtifacts artifacts = new ParseArtifacts(); ParseArtifacts artifacts = new ParseArtifacts();
artifacts.setMiddleJson(fileResult.get("middle_json")); artifacts.setMiddleJson(MineruJsonSupport.normalizeValue(
artifacts.setContentList(fileResult.get("content_list")); fileResult.get("middle_json"),
artifacts.setModelOutput(fileResult.get("model_output")); "MinerU result " + fileName + " middle_json"
));
artifacts.setContentList(MineruJsonSupport.normalizeValue(
fileResult.get("content_list"),
"MinerU result " + fileName + " content_list"
));
artifacts.setModelOutput(MineruJsonSupport.normalizeValueIfJsonLike(
fileResult.get("model_output"),
"MinerU result " + fileName + " model_output"
));
result.setArtifacts(artifacts); result.setArtifacts(artifacts);
Map<String, String> imageDataUrls = toStringMap(fileResult.getJSONObject("images")); Map<String, String> imageDataUrls = toStringMap(MineruJsonSupport.asObject(
fileResult.get("images"),
"MinerU result " + fileName + " images"
));
Map<String, byte[]> imageContents = toBinaryMap(imageDataUrls); Map<String, byte[]> imageContents = toBinaryMap(imageDataUrls);
applyStructuredArtifacts(result, imageDataUrls, imageContents); applyStructuredArtifacts(result, imageDataUrls, imageContents);
if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) { if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) {
@@ -266,8 +284,8 @@ public class MineruMapper {
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json"); Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json"); Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
JSONObject middleJson = asObject(middleArtifact); JSONObject middleJson = asObject(middleArtifact, "MinerU ZIP middle artifact: " + fileName);
JSONArray contentList = asArray(contentListArtifact); JSONArray contentList = asArray(contentListArtifact, "MinerU ZIP content_list artifact: " + fileName);
Object modelOutput = modelOutputArtifact; Object modelOutput = modelOutputArtifact;
if (contentList == null && middleArtifact instanceof JSONArray) { if (contentList == null && middleArtifact instanceof JSONArray) {
@@ -283,7 +301,10 @@ public class MineruMapper {
artifacts.setContentList(contentList == null ? contentListArtifact : contentList); artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
artifacts.setModelOutput(modelOutput); artifacts.setModelOutput(modelOutput);
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json")); JSONArray contentListV2 = asArray(
firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"),
"MinerU ZIP content_list_v2 artifact: " + fileName
);
if (contentListV2 != null) { if (contentListV2 != null) {
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2); artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
} }
@@ -308,8 +329,8 @@ public class MineruMapper {
} }
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls, Map<String, byte[]> imageContents) { private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls, Map<String, byte[]> imageContents) {
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson()); JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson(), "MinerU middle_json artifact");
JSONArray contentList = asArray(result.getArtifacts().getContentList()); JSONArray contentList = asArray(result.getArtifacts().getContentList(), "MinerU content_list artifact");
if (middleJson != null) { if (middleJson != null) {
fillPages(result, middleJson); fillPages(result, middleJson);
@@ -576,34 +597,18 @@ public class MineruMapper {
if (!StringUtil.hasText(text)) { if (!StringUtil.hasText(text)) {
return null; return null;
} }
try { return MineruJsonSupport.parseValue(text, "MinerU ZIP artifact " + suffix);
return JSON.parse(text);
} catch (Exception exception) {
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
}
} }
private JSONObject asObject(Object value) { private JSONObject asObject(Object value, String context) {
if (value instanceof JSONObject) {
return (JSONObject) value;
}
if (value == null) {
return null;
}
if (value instanceof JSONArray) { if (value instanceof JSONArray) {
return null; return null;
} }
return JSON.parseObject(JSON.toJSONString(value)); return MineruJsonSupport.asObject(value, context);
} }
private JSONArray asArray(Object value) { private JSONArray asArray(Object value, String context) {
if (value instanceof JSONArray) { return MineruJsonSupport.asArray(value, context);
return (JSONArray) value;
}
if (value == null) {
return null;
}
return JSON.parseArray(JSON.toJSONString(value));
} }
private List<String> toStringList(JSONArray jsonArray) { private List<String> toStringList(JSONArray jsonArray) {

View File

@@ -49,6 +49,25 @@ public class MineruMapperTest {
Assert.assertNotNull(result.getArtifacts().getContentList()); Assert.assertNotNull(result.getArtifacts().getContentList());
} }
/**
 * A sync payload whose result entry, artifacts and image map are delivered as JSON
 * strings must be mapped to structured artifacts, while plain-text model output
 * stays untouched.
 */
@Test
public void shouldMapStringifiedSyncArtifacts() {
    MineruMapper stringAwareMapper = new MineruMapper(defaultProperties());

    // Run the fixture through both mapping stages in one go.
    ParseResponse mapped = stringAwareMapper.toParseResponse(
            stringAwareMapper.toResultPayload(syncPayloadWithStringifiedArtifacts())
    );

    Assert.assertEquals(1, mapped.getResults().size());
    ParseResult first = mapped.getResults().get(0);

    // Content extracted from the stringified result entry.
    Assert.assertEquals("# title", first.getMarkdown());
    Assert.assertFalse(first.getBlocks().isEmpty());
    Assert.assertEquals(1, first.getTables().size());
    Assert.assertEquals(2, first.getImages().size());
    Assert.assertNotNull(first.getImages().get(0).getContent());

    // Artifacts must have been unwrapped into real JSON types.
    Assert.assertTrue(first.getArtifacts().getMiddleJson() instanceof JSONObject);
    Assert.assertTrue(first.getArtifacts().getContentList() instanceof JSONArray);
    Assert.assertEquals("plain-model-output", first.getArtifacts().getModelOutput());
}
@Test @Test
public void shouldMapZipResponse() throws IOException { public void shouldMapZipResponse() throws IOException {
MineruMapper mapper = new MineruMapper(defaultProperties()); MineruMapper mapper = new MineruMapper(defaultProperties());
@@ -292,6 +311,28 @@ public class MineruMapperTest {
return payload; return payload;
} }
/**
 * Builds a sync payload in which the result entry, its structured artifacts and its
 * image map are all serialized to JSON strings instead of nested JSON values.
 */
private JSONObject syncPayloadWithStringifiedArtifacts() {
    // Image map, later embedded as a JSON string.
    JSONObject imageMap = new JSONObject();
    imageMap.put("figure.png", "data:image/png;base64,ZmFrZQ==");
    imageMap.put("table.png", "data:image/png;base64,ZmFrZQ==");

    // Single file result; structured artifacts are stringified on purpose.
    JSONObject fileResult = new JSONObject();
    fileResult.put("md_content", "# title");
    fileResult.put("middle_json", middleJson().toJSONString());
    fileResult.put("content_list", contentList().toJSONString());
    fileResult.put("model_output", "plain-model-output");
    fileResult.put("images", imageMap.toJSONString());

    // The result entry itself is wrapped in a string as well.
    JSONObject resultsByName = new JSONObject();
    resultsByName.put("demo", fileResult.toJSONString());

    JSONObject root = new JSONObject();
    root.put("backend", "vlm-http-client");
    root.put("version", "3.0.9");
    root.put("results", resultsByName);
    return root;
}
private JSONObject middleBlock(String type, String imagePath) { private JSONObject middleBlock(String type, String imagePath) {
JSONObject block = new JSONObject(); JSONObject block = new JSONObject();
block.put("type", type); block.put("type", type);

View File

@@ -81,6 +81,29 @@ public class MineruPptxDocumentParseServiceTest {
Assert.assertEquals(1, taskInfo.getResult().getResults().size()); Assert.assertEquals(1, taskInfo.getResult().getResults().size());
} }
/**
 * PPTX parsing must still produce markdown, blocks and images when the recording
 * client is created with its stringify flag enabled.
 */
@Test
public void shouldSupportStringifiedMineruSlideArtifacts() throws IOException {
    RecordingClient stringifyingClient = new RecordingClient(defaultProperties(), true);
    MineruMapper slideMapper = new MineruMapper(defaultProperties());
    MineruPptxDocumentParseService pptxService = new MineruPptxDocumentParseService(
            defaultProperties(),
            stringifyingClient,
            slideMapper,
            new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
    );

    PptxParseRequest parseRequest = new PptxParseRequest();
    parseRequest.addFile(ParseFile.of("demo.pptx", buildPptxBytes()));

    ParseResponse parsed = pptxService.parse(parseRequest);

    Assert.assertEquals(1, parsed.getResults().size());
    ParseResult deck = parsed.getResults().get(0);
    Assert.assertTrue(deck.getMarkdown().contains("slide-ocr-1"));
    Assert.assertFalse(deck.getBlocks().isEmpty());
    Assert.assertEquals(2, deck.getImages().size());
}
private byte[] buildPptxBytes() throws IOException { private byte[] buildPptxBytes() throws IOException {
XMLSlideShow slideShow = new XMLSlideShow(); XMLSlideShow slideShow = new XMLSlideShow();
slideShow.setPageSize(new java.awt.Dimension(640, 360)); slideShow.setPageSize(new java.awt.Dimension(640, 360));
@@ -117,9 +140,15 @@ public class MineruPptxDocumentParseServiceTest {
private static class RecordingClient extends MineruClient { private static class RecordingClient extends MineruClient {
private int parseCount; private int parseCount;
private final boolean stringifyArtifacts;
private RecordingClient(MineruProperties properties) { private RecordingClient(MineruProperties properties) {
this(properties, false);
}
private RecordingClient(MineruProperties properties, boolean stringifyArtifacts) {
super(properties, new MineruMapper(properties)); super(properties, new MineruMapper(properties));
this.stringifyArtifacts = stringifyArtifacts;
} }
@Override @Override
@@ -134,10 +163,10 @@ public class MineruPptxDocumentParseServiceTest {
payload.put("version", "3.0.9"); payload.put("version", "3.0.9");
JSONObject result = new JSONObject(); JSONObject result = new JSONObject();
result.put("md_content", "slide-ocr-" + index); result.put("md_content", "slide-ocr-" + index);
result.put("middle_json", middleJson()); result.put("middle_json", stringifyArtifacts ? middleJson().toJSONString() : middleJson());
result.put("content_list", contentList(index)); result.put("content_list", stringifyArtifacts ? contentList(index).toJSONString() : contentList(index));
JSONObject results = new JSONObject(); JSONObject results = new JSONObject();
results.put("slide-" + index, result); results.put("slide-" + index, stringifyArtifacts ? result.toJSONString() : result);
payload.put("results", results); payload.put("results", results);
return payload; return payload;
} }

View File

@@ -253,10 +253,11 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
appendSheetHeader(extraction.markdown, sheet.getSheetName()); appendSheetHeader(extraction.markdown, sheet.getSheetName());
if (maxRow < 0 || maxCol <= 0) { if (maxRow < 0 || maxCol <= 0) {
extraction.markdown.append("_empty sheet_"); if (!imageArtifacts.isEmpty()) {
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) { appendImageOnlySheet(extraction.markdown, sheet.getSheetName(), request, imageArtifacts);
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts); return extraction;
} }
extraction.markdown.append("_empty sheet_");
return extraction; return extraction;
} }
@@ -488,6 +489,29 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
} }
} }
/**
 * Renders a sheet that has images but no cell content.
 *
 * <p>Writes a section heading and one {@code [IMG:key]} placeholder per image. When the
 * request enables the image appendix, the appendix is appended; otherwise each image is
 * emitted as a markdown image reference pointing at its source path.</p>
 *
 * @param markdownBuilder target markdown buffer
 * @param sheetName       name of the sheet being rendered
 * @param request         parse request carrying the image appendix switch
 * @param imageArtifacts  images found on the sheet
 */
private void appendImageOnlySheet(StringBuilder markdownBuilder,
                                  String sheetName,
                                  XlsxParseRequest request,
                                  List<XlsxCellImageArtifact> imageArtifacts) {
    markdownBuilder.append("## ");
    markdownBuilder.append(sheetName);
    markdownBuilder.append(" 图片内容\n\n");

    // Placeholders are always written, regardless of the appendix switch.
    for (XlsxCellImageArtifact placeholderSource : imageArtifacts) {
        markdownBuilder.append("[IMG:");
        markdownBuilder.append(placeholderSource.getReferenceKey());
        markdownBuilder.append("]\n\n");
    }

    boolean appendixEnabled = Boolean.TRUE.equals(request.getIncludeImageAppendix());
    if (appendixEnabled) {
        appendImageAppendix(markdownBuilder, sheetName, imageArtifacts);
    } else {
        // Without the appendix, keep a direct markdown reference to every image.
        for (XlsxCellImageArtifact linkedImage : imageArtifacts) {
            markdownBuilder.append("![");
            markdownBuilder.append(linkedImage.getReferenceKey());
            markdownBuilder.append("](");
            markdownBuilder.append(linkedImage.getSourcePath());
            markdownBuilder.append(")\n\n");
        }
    }
}
private List<String> extractMergedRanges(XSSFSheet sheet) { private List<String> extractMergedRanges(XSSFSheet sheet) {
List<String> mergedRanges = new ArrayList<String>(); List<String> mergedRanges = new ArrayList<String>();
for (int index = 0; index < sheet.getNumMergedRegions(); index++) { for (int index = 0; index < sheet.getNumMergedRegions(); index++) {

View File

@@ -138,7 +138,9 @@ public class MineruXlsxDocumentParseServiceTest {
XlsxParseArtifact artifact = extractXlsxArtifact(result); XlsxParseArtifact artifact = extractXlsxArtifact(result);
Assert.assertTrue(result.getMarkdown().contains("# Sheet1")); Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
Assert.assertTrue(result.getMarkdown().contains("_empty sheet_")); Assert.assertFalse(result.getMarkdown().contains("_empty sheet_"));
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片内容"));
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明")); Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明"));
Assert.assertTrue(result.getMarkdown().contains("![sheet1-r2c2-001](images/sheet1/r2c2-001.png)")); Assert.assertTrue(result.getMarkdown().contains("![sheet1-r2c2-001](images/sheet1/r2c2-001.png)"));
Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]")); Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]"));
@@ -147,6 +149,30 @@ public class MineruXlsxDocumentParseServiceTest {
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0)); Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
} }
/**
 * With the image appendix disabled, an image-only sheet must still expose the image
 * section, the placeholder and a markdown image reference, but no appendix section.
 */
@Test
public void shouldKeepMarkdownImageReferenceWhenImageAppendixDisabled() throws Exception {
    RecordingClient recordingClient = new RecordingClient(defaultProperties());
    MineruMapper sheetMapper = new MineruMapper(defaultProperties());
    MineruXlsxDocumentParseService xlsxService = new MineruXlsxDocumentParseService(
            defaultProperties(),
            recordingClient,
            sheetMapper,
            new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
    );

    XlsxParseRequest parseRequest = new XlsxParseRequest();
    parseRequest.setIncludeImageAppendix(Boolean.FALSE);
    parseRequest.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet()));

    ParseResponse parsed = xlsxService.parse(parseRequest);
    ParseResult workbook = parsed.getResults().get(0);

    // Image section, placeholder and inline reference are present.
    Assert.assertTrue(workbook.getMarkdown().contains("## Sheet1 图片内容"));
    Assert.assertTrue(workbook.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
    Assert.assertTrue(workbook.getMarkdown().contains("![sheet1-r2c2-001](images/sheet1/r2c2-001.png)"));
    // The appendix section must be absent.
    Assert.assertFalse(workbook.getMarkdown().contains("## Sheet1 图片说明"));
}
@Test @Test
public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception { public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception {
RecordingClient client = new RecordingClient(defaultProperties()); RecordingClient client = new RecordingClient(defaultProperties());

View File

@@ -12,7 +12,7 @@ import java.util.List;
* @author Codex * @author Codex
* @since 2026-04-16 * @since 2026-04-16
*/ */
@ConfigurationProperties(prefix = "easy-agents.document.mineru") @ConfigurationProperties(prefix = "easy-agents.document.ocr.mineru")
public class CommonMineruDocumentProperties { public class CommonMineruDocumentProperties {
private String baseUrl; private String baseUrl;

View File

@@ -1,119 +0,0 @@
package com.easyagents.spring.boot.document.pdf.mineru;
import org.springframework.boot.context.properties.ConfigurationProperties;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Spring Boot configuration properties for MinerU, bound under the
 * {@code easy-agents.document.pdf.mineru} prefix.
 *
 * @author Codex
 * @since 2026-04-14
 */
@ConfigurationProperties(prefix = "easy-agents.document.pdf.mineru")
public class MineruDocumentProperties {

    // Endpoint; no default value.
    private String baseUrl;

    // Timeouts and intervals, in milliseconds as the field names indicate.
    private Integer connectTimeoutMs = 3000;
    private Integer readTimeoutMs = 600000;
    private Integer writeTimeoutMs = 600000;
    private Integer pollIntervalMs = 1000;
    private Integer resultTimeoutMs = 1800000;

    // Defaults applied to parse requests.
    private String defaultBackend = "vlm-http-client";
    private String defaultParseMethod = "auto";
    private List<String> defaultLangList = newDefaultLangList();
    private Boolean defaultFormulaEnable = Boolean.TRUE;
    private Boolean defaultTableEnable = Boolean.TRUE;

    /** Creates a fresh mutable list holding the default language code. */
    private static List<String> newDefaultLangList() {
        return new ArrayList<String>(Arrays.asList("ch"));
    }

    public String getBaseUrl() {
        return this.baseUrl;
    }

    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    public Integer getConnectTimeoutMs() {
        return this.connectTimeoutMs;
    }

    public void setConnectTimeoutMs(Integer connectTimeoutMs) {
        this.connectTimeoutMs = connectTimeoutMs;
    }

    public Integer getReadTimeoutMs() {
        return this.readTimeoutMs;
    }

    public void setReadTimeoutMs(Integer readTimeoutMs) {
        this.readTimeoutMs = readTimeoutMs;
    }

    public Integer getWriteTimeoutMs() {
        return this.writeTimeoutMs;
    }

    public void setWriteTimeoutMs(Integer writeTimeoutMs) {
        this.writeTimeoutMs = writeTimeoutMs;
    }

    public Integer getPollIntervalMs() {
        return this.pollIntervalMs;
    }

    public void setPollIntervalMs(Integer pollIntervalMs) {
        this.pollIntervalMs = pollIntervalMs;
    }

    public Integer getResultTimeoutMs() {
        return this.resultTimeoutMs;
    }

    public void setResultTimeoutMs(Integer resultTimeoutMs) {
        this.resultTimeoutMs = resultTimeoutMs;
    }

    public String getDefaultBackend() {
        return this.defaultBackend;
    }

    public void setDefaultBackend(String defaultBackend) {
        this.defaultBackend = defaultBackend;
    }

    public String getDefaultParseMethod() {
        return this.defaultParseMethod;
    }

    public void setDefaultParseMethod(String defaultParseMethod) {
        this.defaultParseMethod = defaultParseMethod;
    }

    public List<String> getDefaultLangList() {
        return this.defaultLangList;
    }

    /**
     * Sets the default language list; a {@code null} argument restores the built-in default.
     *
     * @param defaultLangList language codes, or {@code null}
     */
    public void setDefaultLangList(List<String> defaultLangList) {
        if (defaultLangList == null) {
            this.defaultLangList = newDefaultLangList();
        } else {
            this.defaultLangList = defaultLangList;
        }
    }

    public Boolean getDefaultFormulaEnable() {
        return this.defaultFormulaEnable;
    }

    public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) {
        this.defaultFormulaEnable = defaultFormulaEnable;
    }

    public Boolean getDefaultTableEnable() {
        return this.defaultTableEnable;
    }

    public void setDefaultTableEnable(Boolean defaultTableEnable) {
        this.defaultTableEnable = defaultTableEnable;
    }
}

View File

@@ -10,8 +10,11 @@ import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.EnableConfigurationProperties; import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.beans.factory.config.BeanFactoryPostProcessor;
import org.springframework.beans.factory.support.BeanDefinitionRegistry;
import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Primary;
/** /**
* MinerU PDF 文档解析自动装配。 * MinerU PDF 文档解析自动装配。
@@ -21,50 +24,94 @@ import org.springframework.context.annotation.Configuration;
*/ */
@Configuration(proxyBeanMethods = false) @Configuration(proxyBeanMethods = false)
@ConditionalOnClass(MineruPdfDocumentParseService.class) @ConditionalOnClass(MineruPdfDocumentParseService.class)
@ConditionalOnProperty(prefix = "easy-agents.document.pdf", name = "provider", havingValue = "mineru") @ConditionalOnProperty(prefix = "easy-agents.document.ocr", name = "provider", havingValue = "mineru")
@EnableConfigurationProperties({MineruDocumentProperties.class, CommonMineruDocumentProperties.class}) @EnableConfigurationProperties(CommonMineruDocumentProperties.class)
public class MineruPdfAutoConfiguration { public class MineruPdfAutoConfiguration {
public static final String DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME = "documentParseService";
/** /**
* 注册统一 PDF 解析服务。 * 注册统一 PDF 解析服务。
* *
* @param properties Spring Boot 配置 * @param commonProperties Spring Boot 配置
* @return PDF 解析服务 * @return PDF 解析服务
*/ */
@Bean @Bean
@Primary
@ConditionalOnMissingBean(PdfDocumentParseService.class) @ConditionalOnMissingBean(PdfDocumentParseService.class)
public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties, public PdfDocumentParseService pdfDocumentParseService(CommonMineruDocumentProperties commonProperties) {
CommonMineruDocumentProperties commonProperties) { return new MineruPdfDocumentParseService(toMineruProperties(commonProperties));
return new MineruPdfDocumentParseService(toMineruProperties(properties, commonProperties));
} }
/** /**
* 将 PDF 服务以统一文档解析服务类型暴露,便于调用方直接按抽象注入 * 将默认文档解析服务名注册为 PDF 服务别名,避免重复创建同类型 Bean
* 这里显式走 alias而不是第二个 {@link DocumentParseService} Bean
* 这样既能保持默认契约,也不会破坏按 {@link PdfDocumentParseService} 类型的唯一注入。
* *
* @param pdfDocumentParseService PDF 解析服务 * @return BeanFactory 后置处理器
* @return 统一文档解析服务
*/ */
@Bean @Bean
@ConditionalOnMissingBean(DocumentParseService.class) public static BeanFactoryPostProcessor defaultDocumentParseServiceAliasPostProcessor() {
public DocumentParseService documentParseService(PdfDocumentParseService pdfDocumentParseService) { return beanFactory -> {
return pdfDocumentParseService; if (!(beanFactory instanceof BeanDefinitionRegistry)) {
return;
}
BeanDefinitionRegistry registry = (BeanDefinitionRegistry) beanFactory;
String aliasTarget = resolveAliasTarget(beanFactory, registry);
if (!StringUtil.hasText(aliasTarget)) {
return;
}
if (registry.containsBeanDefinition(DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME)
|| registry.isAlias(DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME)) {
return;
}
if (registry.containsBeanDefinition(aliasTarget)
&& !registry.getBeanDefinition(aliasTarget).isPrimary()) {
registry.getBeanDefinition(aliasTarget).setPrimary(true);
}
registry.registerAlias(aliasTarget, DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME);
};
} }
private MineruProperties toMineruProperties(MineruDocumentProperties properties, private static String resolveAliasTarget(org.springframework.beans.factory.config.ConfigurableListableBeanFactory beanFactory,
CommonMineruDocumentProperties commonProperties) { BeanDefinitionRegistry registry) {
String[] candidateNames = beanFactory.getBeanNamesForType(PdfDocumentParseService.class, true, false);
if (candidateNames == null || candidateNames.length == 0) {
return null;
}
if (candidateNames.length == 1) {
return candidateNames[0];
}
String primaryBeanName = null;
for (String candidateName : candidateNames) {
if (!registry.containsBeanDefinition(candidateName)) {
continue;
}
if (!registry.getBeanDefinition(candidateName).isPrimary()) {
continue;
}
if (primaryBeanName != null) {
return null;
}
primaryBeanName = candidateName;
}
return primaryBeanName;
}
private MineruProperties toMineruProperties(CommonMineruDocumentProperties commonProperties) {
MineruProperties mineruProperties = new MineruProperties(); MineruProperties mineruProperties = new MineruProperties();
boolean useCommon = commonProperties != null && StringUtil.hasText(commonProperties.getBaseUrl()); boolean useCommon = commonProperties != null && StringUtil.hasText(commonProperties.getBaseUrl());
mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : properties.getBaseUrl()); mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : null);
mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : properties.getConnectTimeoutMs()); mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : null);
mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : properties.getReadTimeoutMs()); mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : null);
mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : properties.getWriteTimeoutMs()); mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : null);
mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : properties.getPollIntervalMs()); mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : null);
mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : properties.getResultTimeoutMs()); mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : null);
mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : properties.getDefaultBackend()); mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : null);
mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : properties.getDefaultParseMethod()); mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : null);
mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : properties.getDefaultLangList()); mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : null);
mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : properties.getDefaultFormulaEnable()); mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : null);
mineruProperties.setDefaultTableEnable(useCommon ? commonProperties.getDefaultTableEnable() : properties.getDefaultTableEnable()); mineruProperties.setDefaultTableEnable(useCommon ? commonProperties.getDefaultTableEnable() : null);
return mineruProperties; return mineruProperties;
} }
} }

View File

@@ -6,6 +6,7 @@ import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.pptx.PptxDocumentParseService; import com.easyagents.document.pptx.PptxDocumentParseService;
import com.easyagents.document.pptx.mineru.MineruPptxDocumentParseService; import com.easyagents.document.pptx.mineru.MineruPptxDocumentParseService;
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties; import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass; import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
@@ -24,21 +25,24 @@ import java.util.concurrent.Executors;
*/ */
@Configuration(proxyBeanMethods = false) @Configuration(proxyBeanMethods = false)
@ConditionalOnClass(MineruPptxDocumentParseService.class) @ConditionalOnClass(MineruPptxDocumentParseService.class)
@ConditionalOnProperty(prefix = "easy-agents.document.pptx", name = "enabled", havingValue = "true") @ConditionalOnProperty(prefix = "easy-agents.document.ocr", name = "provider", havingValue = "mineru")
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, PptxDocumentProperties.class}) @EnableConfigurationProperties(CommonMineruDocumentProperties.class)
public class MineruPptxAutoConfiguration { public class MineruPptxAutoConfiguration {
/** Bean name of the async task manager that is injected (by qualifier) into the PPTX parse service. */
public static final String PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME = "pptxDocumentAsyncTaskManager";
/** Size of the fixed thread pool created for the PPTX async task manager. */
private static final int DEFAULT_ASYNC_THREADS = 2;
@Bean @Bean
@ConditionalOnMissingBean(name = "pptxDocumentAsyncTaskManager") @ConditionalOnMissingBean(name = PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager(PptxDocumentProperties properties) { public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager() {
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads(); ExecutorService executorService = Executors.newFixedThreadPool(DEFAULT_ASYNC_THREADS);
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService); return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
} }
@Bean @Bean
@ConditionalOnMissingBean(PptxDocumentParseService.class) @ConditionalOnMissingBean(PptxDocumentParseService.class)
public PptxDocumentParseService pptxDocumentParseService(CommonMineruDocumentProperties commonProperties, public PptxDocumentParseService pptxDocumentParseService(CommonMineruDocumentProperties commonProperties,
@Qualifier(PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
DocumentAsyncTaskManager pptxDocumentAsyncTaskManager) { DocumentAsyncTaskManager pptxDocumentAsyncTaskManager) {
return new MineruPptxDocumentParseService(toMineruProperties(commonProperties), pptxDocumentAsyncTaskManager); return new MineruPptxDocumentParseService(toMineruProperties(commonProperties), pptxDocumentAsyncTaskManager);
} }

View File

@@ -1,32 +0,0 @@
package com.easyagents.spring.boot.document.pptx;
import org.springframework.boot.context.properties.ConfigurationProperties;
/**
 * PPTX document configuration.
 *
 * <p>Bound from configuration properties under the {@code easy-agents.document.pptx} prefix.
 *
 * @author Codex
 * @since 2026-04-16
 */
@ConfigurationProperties(prefix = "easy-agents.document.pptx")
public class PptxDocumentProperties {
/** Value of the {@code enabled} property; defaults to {@code false}. */
private Boolean enabled = false;
/** Value of the {@code asyncThreads} property; defaults to {@code 2}. */
private Integer asyncThreads = 2;
/**
 * @return the current {@code enabled} value
 */
public Boolean getEnabled() {
return enabled;
}
/**
 * @param enabled the new {@code enabled} value
 */
public void setEnabled(Boolean enabled) {
this.enabled = enabled;
}
/**
 * @return the current {@code asyncThreads} value
 */
public Integer getAsyncThreads() {
return asyncThreads;
}
/**
 * @param asyncThreads the new {@code asyncThreads} value
 */
public void setAsyncThreads(Integer asyncThreads) {
this.asyncThreads = asyncThreads;
}
}

View File

@@ -6,6 +6,7 @@ import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.xlsx.XlsxDocumentParseService; import com.easyagents.document.xlsx.XlsxDocumentParseService;
import com.easyagents.document.xlsx.mineru.MineruXlsxDocumentParseService; import com.easyagents.document.xlsx.mineru.MineruXlsxDocumentParseService;
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties; import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass; import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
@@ -24,21 +25,24 @@ import java.util.concurrent.Executors;
*/ */
@Configuration(proxyBeanMethods = false) @Configuration(proxyBeanMethods = false)
@ConditionalOnClass(MineruXlsxDocumentParseService.class) @ConditionalOnClass(MineruXlsxDocumentParseService.class)
@ConditionalOnProperty(prefix = "easy-agents.document.xlsx", name = "enabled", havingValue = "true") @ConditionalOnProperty(prefix = "easy-agents.document.ocr", name = "provider", havingValue = "mineru")
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, XlsxDocumentProperties.class}) @EnableConfigurationProperties(CommonMineruDocumentProperties.class)
public class MineruXlsxAutoConfiguration { public class MineruXlsxAutoConfiguration {
/** Bean name of the async task manager that is injected (by qualifier) into the XLSX parse service. */
public static final String XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME = "xlsxDocumentAsyncTaskManager";
/** Size of the fixed thread pool created for the XLSX async task manager. */
private static final int DEFAULT_ASYNC_THREADS = 2;
@Bean @Bean
@ConditionalOnMissingBean(name = "xlsxDocumentAsyncTaskManager") @ConditionalOnMissingBean(name = XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager(XlsxDocumentProperties properties) { public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager() {
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads(); ExecutorService executorService = Executors.newFixedThreadPool(DEFAULT_ASYNC_THREADS);
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService); return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
} }
@Bean @Bean
@ConditionalOnMissingBean(XlsxDocumentParseService.class) @ConditionalOnMissingBean(XlsxDocumentParseService.class)
public XlsxDocumentParseService xlsxDocumentParseService(CommonMineruDocumentProperties commonProperties, public XlsxDocumentParseService xlsxDocumentParseService(CommonMineruDocumentProperties commonProperties,
@Qualifier(XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager) { DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager) {
return new MineruXlsxDocumentParseService(toMineruProperties(commonProperties), xlsxDocumentAsyncTaskManager); return new MineruXlsxDocumentParseService(toMineruProperties(commonProperties), xlsxDocumentAsyncTaskManager);
} }

View File

@@ -1,32 +0,0 @@
package com.easyagents.spring.boot.document.xlsx;
import org.springframework.boot.context.properties.ConfigurationProperties;
/**
 * XLSX document configuration.
 *
 * <p>Bound from configuration properties under the {@code easy-agents.document.xlsx} prefix.
 *
 * @author Codex
 * @since 2026-04-16
 */
@ConfigurationProperties(prefix = "easy-agents.document.xlsx")
public class XlsxDocumentProperties {
/** Value of the {@code enabled} property; defaults to {@code false}. */
private Boolean enabled = false;
/** Value of the {@code asyncThreads} property; defaults to {@code 2}. */
private Integer asyncThreads = 2;
/**
 * @return the current {@code enabled} value
 */
public Boolean getEnabled() {
return enabled;
}
/**
 * @param enabled the new {@code enabled} value
 */
public void setEnabled(Boolean enabled) {
this.enabled = enabled;
}
/**
 * @return the current {@code asyncThreads} value
 */
public Integer getAsyncThreads() {
return asyncThreads;
}
/**
 * @param asyncThreads the new {@code asyncThreads} value
 */
public void setAsyncThreads(Integer asyncThreads) {
this.asyncThreads = asyncThreads;
}
}

View File

@@ -1,6 +1,10 @@
package com.easyagents.spring.boot.autoconfigure; package com.easyagents.spring.boot.autoconfigure;
import com.easyagents.document.core.DocumentParseService; import com.easyagents.document.core.DocumentParseService;
import com.easyagents.document.core.entity.ParseRequest;
import com.easyagents.document.core.entity.ParseResponse;
import com.easyagents.document.core.entity.ParseTaskInfo;
import com.easyagents.document.core.entity.ParseTaskStatus;
import com.easyagents.document.pdf.PdfDocumentParseService; import com.easyagents.document.pdf.PdfDocumentParseService;
import com.easyagents.document.pptx.PptxDocumentParseService; import com.easyagents.document.pptx.PptxDocumentParseService;
import com.easyagents.document.xlsx.XlsxDocumentParseService; import com.easyagents.document.xlsx.XlsxDocumentParseService;
@@ -13,6 +17,8 @@ import com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration;
import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration; import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.boot.test.context.runner.ApplicationContextRunner; import org.springframework.boot.test.context.runner.ApplicationContextRunner;
public class StarterConditionalAutoConfigurationTest { public class StarterConditionalAutoConfigurationTest {
@@ -49,27 +55,105 @@ public class StarterConditionalAutoConfigurationTest {
public void shouldCreateMineruDocumentBeansWhenConfigured() { public void shouldCreateMineruDocumentBeansWhenConfigured() {
contextRunner contextRunner
.withPropertyValues( .withPropertyValues(
"easy-agents.document.pdf.provider=mineru", "easy-agents.document.ocr.provider=mineru",
"easy-agents.document.pdf.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api" "easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
) )
.run(context -> { .run(context -> {
Assert.assertNotNull(context.getBean(PdfDocumentParseService.class)); Assert.assertNotNull(context.getBean(PdfDocumentParseService.class));
Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
Assert.assertNotNull(context.getBean(DocumentParseService.class)); Assert.assertNotNull(context.getBean(DocumentParseService.class));
}); });
} }
@Test @Test
public void shouldCreatePptxAndXlsxBeansWhenEnabled() { public void shouldCreatePptxAndXlsxBeansWhenMineruOcrConfigured() {
contextRunner contextRunner
.withPropertyValues( .withPropertyValues(
"easy-agents.document.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api", "easy-agents.document.ocr.provider=mineru",
"easy-agents.document.pptx.enabled=true", "easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
"easy-agents.document.xlsx.enabled=true"
) )
.run(context -> { .run(context -> {
Assert.assertNotNull(context.getBean(PptxDocumentParseService.class)); Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class)); Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
Assert.assertFalse(context.containsBean("documentParseService")); Assert.assertNotNull(context.getBean(DocumentParseService.class));
}); });
} }
/**
 * With the MinerU OCR provider configured, the PDF, PPTX and XLSX services must all be
 * present, and the PDF service must be the instance returned both for the
 * {@code "documentParseService"} bean name and for the {@link DocumentParseService} type.
 */
@Test
public void shouldKeepPdfAsDefaultDocumentParseServiceWhenMineruOcrConfigured() {
    String[] mineruOcrProperties = {
            "easy-agents.document.ocr.provider=mineru",
            "easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
    };
    contextRunner.withPropertyValues(mineruOcrProperties).run(ctx -> {
        PdfDocumentParseService expectedDefault = ctx.getBean(PdfDocumentParseService.class);
        Assert.assertNotNull(expectedDefault);
        Assert.assertNotNull(ctx.getBean(PptxDocumentParseService.class));
        Assert.assertNotNull(ctx.getBean(XlsxDocumentParseService.class));
        // Lookup by name and lookup by abstract type must both resolve to the PDF service.
        Assert.assertSame(expectedDefault, ctx.getBean("documentParseService"));
        Assert.assertSame(expectedDefault, ctx.getBean(DocumentParseService.class));
    });
}
/**
 * When the user supplies a {@link PdfDocumentParseService} under a custom bean name, that
 * same instance must be returned for the {@code "documentParseService"} bean name and for
 * the {@link DocumentParseService} type.
 */
@Test
public void shouldAliasCustomNamedPdfServiceAsDefaultDocumentParseService() {
    // The user-defined PDF service is registered first, then the configurations under test.
    ApplicationContextRunner runner = new ApplicationContextRunner()
            .withUserConfiguration(CustomPdfParseServiceConfiguration.class);
    runner = runner.withUserConfiguration(
            RagIngestionAutoConfiguration.class,
            OllamaAutoConfiguration.class,
            OpenSearchAutoConfiguration.class,
            MineruPdfAutoConfiguration.class,
            MineruPptxAutoConfiguration.class,
            MineruXlsxAutoConfiguration.class
    );
    runner = runner.withPropertyValues(
            "easy-agents.document.ocr.provider=mineru",
            "easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
    );
    runner.run(ctx -> {
        PdfDocumentParseService customService = ctx.getBean(PdfDocumentParseService.class);
        Assert.assertSame(customService, ctx.getBean("documentParseService"));
        Assert.assertSame(customService, ctx.getBean(DocumentParseService.class));
    });
}
/**
 * Test configuration that registers a {@link PdfDocumentParseService} under the
 * custom bean name {@code "customPdfService"}.
 */
@Configuration(proxyBeanMethods = false)
static class CustomPdfParseServiceConfiguration {
/**
 * @return a new {@link NoopPdfDocumentParseService}
 */
@Bean("customPdfService")
PdfDocumentParseService customPdfService() {
return new NoopPdfDocumentParseService();
}
}
/**
 * Stub {@link PdfDocumentParseService} for tests: every method ignores its argument
 * and returns a newly default-constructed result object.
 */
static class NoopPdfDocumentParseService implements PdfDocumentParseService {
/** Returns a new default-constructed {@link ParseResponse}; the request is ignored. */
@Override
public ParseResponse parse(ParseRequest request) {
return new ParseResponse();
}
/** Returns a new default-constructed {@link ParseTaskStatus}; the request is ignored. */
@Override
public ParseTaskStatus submit(ParseRequest request) {
return new ParseTaskStatus();
}
/** Returns a new default-constructed {@link ParseTaskStatus}; the task id is ignored. */
@Override
public ParseTaskStatus queryTask(String taskId) {
return new ParseTaskStatus();
}
/** Returns a new default-constructed {@link ParseResponse}; the task id is ignored. */
@Override
public ParseResponse queryResult(String taskId) {
return new ParseResponse();
}
/** Returns a new default-constructed {@link ParseTaskInfo}; the task id is ignored. */
@Override
public ParseTaskInfo queryTaskInfo(String taskId) {
return new ParseTaskInfo();
}
}
} }