refactor: 收敛文档 OCR 配置并补齐 Office 解析兼容
- 统一 MinerU OCR 配置结构并移除分模块冗余属性类
- 补齐 JSON 字符串化结果拆包、XLSX 图片兼容与对应回归测试
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
@@ -16,6 +15,7 @@ import okhttp3.ResponseBody;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URLConnection;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
@@ -115,7 +115,10 @@ public class MineruClient {
|
||||
}
|
||||
String contentType = response.header("Content-Type");
|
||||
if (contentType != null && contentType.contains("application/json")) {
|
||||
JSONObject jsonObject = JSON.parseObject(new String(responseBytes));
|
||||
JSONObject jsonObject = MineruJsonSupport.parseObject(
|
||||
new String(responseBytes, StandardCharsets.UTF_8),
|
||||
"MinerU async result response: " + path
|
||||
);
|
||||
throw new DocumentParseException("MinerU async result is not ready: " + jsonObject.toJSONString());
|
||||
}
|
||||
if (responseBytes.length < 2 || responseBytes[0] != 'P' || responseBytes[1] != 'K') {
|
||||
@@ -148,9 +151,13 @@ public class MineruClient {
|
||||
ResponseBody body = response.body();
|
||||
String bodyText = body == null ? "" : body.string();
|
||||
if (!response.isSuccessful()) {
|
||||
throw buildHttpException(path, response.code(), bodyText == null ? new byte[0] : bodyText.getBytes());
|
||||
throw buildHttpException(
|
||||
path,
|
||||
response.code(),
|
||||
bodyText == null ? new byte[0] : bodyText.getBytes(StandardCharsets.UTF_8)
|
||||
);
|
||||
}
|
||||
return JSON.parseObject(bodyText);
|
||||
return MineruJsonSupport.parseObject(bodyText, "MinerU response body: " + path);
|
||||
} catch (IOException exception) {
|
||||
throw new DocumentParseException("Failed to call MinerU endpoint: " + path, exception);
|
||||
}
|
||||
@@ -197,7 +204,7 @@ public class MineruClient {
|
||||
}
|
||||
|
||||
private DocumentParseException buildHttpException(String path, int statusCode, byte[] bodyBytes) {
|
||||
String bodyText = bodyBytes == null ? "" : new String(bodyBytes);
|
||||
String bodyText = bodyBytes == null ? "" : new String(bodyBytes, StandardCharsets.UTF_8);
|
||||
return new DocumentParseException(
|
||||
"MinerU request failed: path=" + path + ", status=" + statusCode + ", body=" + bodyText
|
||||
);
|
||||
|
||||
@@ -0,0 +1,199 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONArray;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
|
||||
/**
|
||||
* MinerU JSON 兼容解析工具。
|
||||
*
|
||||
* <p>部分部署返回的结构化字段会被额外包装成 JSON 字符串,
|
||||
* 例如 {@code middle_json="{}"} 或整个响应体直接返回
|
||||
* {@code "{\"results\":{...}}"}。该工具负责统一拆包,
|
||||
* 让上层模块无需重复处理这些兼容分支。</p>
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-17
|
||||
*/
|
||||
final class MineruJsonSupport {
|
||||
|
||||
private static final int MAX_JSON_UNWRAP_DEPTH = 4;
|
||||
private static final int PREVIEW_LIMIT = 160;
|
||||
|
||||
private MineruJsonSupport() {
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析 JSON 文本并要求最终结果为对象。
|
||||
*
|
||||
* @param text JSON 文本
|
||||
* @param context 错误上下文
|
||||
* @return JSON 对象
|
||||
*/
|
||||
static JSONObject parseObject(String text, String context) {
|
||||
Object value = parseValue(text, context);
|
||||
if (value instanceof JSONObject) {
|
||||
return (JSONObject) value;
|
||||
}
|
||||
throw unexpectedType(context, "JSONObject", value);
|
||||
}
|
||||
|
||||
/**
|
||||
* 规范化任意 JSON 值。
|
||||
*
|
||||
* @param value 原始值
|
||||
* @param context 错误上下文
|
||||
* @return 规范化后的 JSON 值
|
||||
*/
|
||||
static Object normalizeValue(Object value, String context) {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
if (value instanceof JSONObject || value instanceof JSONArray) {
|
||||
return value;
|
||||
}
|
||||
if (value instanceof CharSequence) {
|
||||
String text = value.toString().trim();
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return null;
|
||||
}
|
||||
return parseValue(text, context);
|
||||
}
|
||||
try {
|
||||
return unwrapNestedJsonString(JSON.parse(JSON.toJSONString(value)), context);
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException(
|
||||
"Failed to normalize MinerU JSON value: " + context + ", preview=" + preview(String.valueOf(value)),
|
||||
exception
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 在值看起来像 JSON 时才尝试拆包,否则保留原始值。
|
||||
*
|
||||
* @param value 原始值
|
||||
* @param context 错误上下文
|
||||
* @return 规范化后的值或原值
|
||||
*/
|
||||
static Object normalizeValueIfJsonLike(Object value, String context) {
|
||||
if (!(value instanceof CharSequence)) {
|
||||
return normalizeValue(value, context);
|
||||
}
|
||||
String text = value.toString().trim();
|
||||
if (!looksLikeJson(text) && !(text.startsWith("\"") && text.endsWith("\""))) {
|
||||
return value;
|
||||
}
|
||||
return normalizeValue(value, context);
|
||||
}
|
||||
|
||||
/**
|
||||
* 将任意 JSON 值转换为对象。
|
||||
*
|
||||
* @param value 原始值
|
||||
* @param context 错误上下文
|
||||
* @return JSON 对象
|
||||
*/
|
||||
static JSONObject asObject(Object value, String context) {
|
||||
Object normalized = normalizeValue(value, context);
|
||||
if (normalized == null) {
|
||||
return null;
|
||||
}
|
||||
if (normalized instanceof JSONObject) {
|
||||
return (JSONObject) normalized;
|
||||
}
|
||||
throw unexpectedType(context, "JSONObject", normalized);
|
||||
}
|
||||
|
||||
/**
|
||||
* 将任意 JSON 值转换为数组。
|
||||
*
|
||||
* @param value 原始值
|
||||
* @param context 错误上下文
|
||||
* @return JSON 数组
|
||||
*/
|
||||
static JSONArray asArray(Object value, String context) {
|
||||
Object normalized = normalizeValue(value, context);
|
||||
if (normalized == null) {
|
||||
return null;
|
||||
}
|
||||
if (normalized instanceof JSONArray) {
|
||||
return (JSONArray) normalized;
|
||||
}
|
||||
throw unexpectedType(context, "JSONArray", normalized);
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析 JSON 文本并自动拆解被双层包装的字符串值。
|
||||
*
|
||||
* @param text JSON 文本
|
||||
* @param context 错误上下文
|
||||
* @return 解析结果
|
||||
*/
|
||||
static Object parseValue(String text, String context) {
|
||||
String trimmed = text == null ? null : text.trim();
|
||||
if (!StringUtil.hasText(trimmed)) {
|
||||
throw new DocumentParseException("MinerU JSON payload is empty: " + context);
|
||||
}
|
||||
try {
|
||||
return unwrapNestedJsonString(JSON.parse(trimmed), context);
|
||||
} catch (DocumentParseException exception) {
|
||||
throw exception;
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException(
|
||||
"Failed to parse MinerU JSON payload: " + context + ", preview=" + preview(trimmed),
|
||||
exception
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
private static Object unwrapNestedJsonString(Object value, String context) {
|
||||
Object current = value;
|
||||
for (int depth = 0; depth < MAX_JSON_UNWRAP_DEPTH; depth++) {
|
||||
if (!(current instanceof String)) {
|
||||
return current;
|
||||
}
|
||||
String text = ((String) current).trim();
|
||||
if (!looksLikeJson(text)) {
|
||||
return current;
|
||||
}
|
||||
try {
|
||||
current = JSON.parse(text);
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException(
|
||||
"Failed to unwrap MinerU nested JSON string: " + context + ", preview=" + preview(text),
|
||||
exception
|
||||
);
|
||||
}
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
private static boolean looksLikeJson(String text) {
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return false;
|
||||
}
|
||||
char first = text.charAt(0);
|
||||
return first == '{' || first == '[';
|
||||
}
|
||||
|
||||
private static DocumentParseException unexpectedType(String context, String expectedType, Object actualValue) {
|
||||
String actualType = actualValue == null ? "null" : actualValue.getClass().getSimpleName();
|
||||
return new DocumentParseException(
|
||||
"MinerU JSON payload type mismatch: " + context + ", expected=" + expectedType + ", actual=" + actualType
|
||||
);
|
||||
}
|
||||
|
||||
private static String preview(String text) {
|
||||
if (text == null) {
|
||||
return "";
|
||||
}
|
||||
String normalized = text.replace('\n', ' ').replace('\r', ' ');
|
||||
if (normalized.length() <= PREVIEW_LIMIT) {
|
||||
return normalized;
|
||||
}
|
||||
return normalized.substring(0, PREVIEW_LIMIT) + "...";
|
||||
}
|
||||
}
|
||||
@@ -116,10 +116,16 @@ public class MineruMapper {
|
||||
payload.setBackend(jsonObject.getString("backend"));
|
||||
payload.setVersion(jsonObject.getString("version"));
|
||||
Map<String, JSONObject> results = new LinkedHashMap<String, JSONObject>();
|
||||
JSONObject resultJson = jsonObject.getJSONObject("results");
|
||||
JSONObject resultJson = MineruJsonSupport.asObject(jsonObject.get("results"), "MinerU sync results");
|
||||
if (resultJson != null) {
|
||||
for (String key : resultJson.keySet()) {
|
||||
results.put(key, resultJson.getJSONObject(key));
|
||||
JSONObject result = MineruJsonSupport.asObject(
|
||||
resultJson.get(key),
|
||||
"MinerU sync result entry: " + key
|
||||
);
|
||||
if (result != null) {
|
||||
results.put(key, result);
|
||||
}
|
||||
}
|
||||
}
|
||||
payload.setResults(results);
|
||||
@@ -239,12 +245,24 @@ public class MineruMapper {
|
||||
result.setPlainText(result.getMarkdown());
|
||||
|
||||
ParseArtifacts artifacts = new ParseArtifacts();
|
||||
artifacts.setMiddleJson(fileResult.get("middle_json"));
|
||||
artifacts.setContentList(fileResult.get("content_list"));
|
||||
artifacts.setModelOutput(fileResult.get("model_output"));
|
||||
artifacts.setMiddleJson(MineruJsonSupport.normalizeValue(
|
||||
fileResult.get("middle_json"),
|
||||
"MinerU result " + fileName + " middle_json"
|
||||
));
|
||||
artifacts.setContentList(MineruJsonSupport.normalizeValue(
|
||||
fileResult.get("content_list"),
|
||||
"MinerU result " + fileName + " content_list"
|
||||
));
|
||||
artifacts.setModelOutput(MineruJsonSupport.normalizeValueIfJsonLike(
|
||||
fileResult.get("model_output"),
|
||||
"MinerU result " + fileName + " model_output"
|
||||
));
|
||||
result.setArtifacts(artifacts);
|
||||
|
||||
Map<String, String> imageDataUrls = toStringMap(fileResult.getJSONObject("images"));
|
||||
Map<String, String> imageDataUrls = toStringMap(MineruJsonSupport.asObject(
|
||||
fileResult.get("images"),
|
||||
"MinerU result " + fileName + " images"
|
||||
));
|
||||
Map<String, byte[]> imageContents = toBinaryMap(imageDataUrls);
|
||||
applyStructuredArtifacts(result, imageDataUrls, imageContents);
|
||||
if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) {
|
||||
@@ -266,8 +284,8 @@ public class MineruMapper {
|
||||
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
|
||||
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
|
||||
|
||||
JSONObject middleJson = asObject(middleArtifact);
|
||||
JSONArray contentList = asArray(contentListArtifact);
|
||||
JSONObject middleJson = asObject(middleArtifact, "MinerU ZIP middle artifact: " + fileName);
|
||||
JSONArray contentList = asArray(contentListArtifact, "MinerU ZIP content_list artifact: " + fileName);
|
||||
Object modelOutput = modelOutputArtifact;
|
||||
|
||||
if (contentList == null && middleArtifact instanceof JSONArray) {
|
||||
@@ -283,7 +301,10 @@ public class MineruMapper {
|
||||
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
|
||||
artifacts.setModelOutput(modelOutput);
|
||||
|
||||
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
|
||||
JSONArray contentListV2 = asArray(
|
||||
firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"),
|
||||
"MinerU ZIP content_list_v2 artifact: " + fileName
|
||||
);
|
||||
if (contentListV2 != null) {
|
||||
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
|
||||
}
|
||||
@@ -308,8 +329,8 @@ public class MineruMapper {
|
||||
}
|
||||
|
||||
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls, Map<String, byte[]> imageContents) {
|
||||
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson());
|
||||
JSONArray contentList = asArray(result.getArtifacts().getContentList());
|
||||
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson(), "MinerU middle_json artifact");
|
||||
JSONArray contentList = asArray(result.getArtifacts().getContentList(), "MinerU content_list artifact");
|
||||
|
||||
if (middleJson != null) {
|
||||
fillPages(result, middleJson);
|
||||
@@ -576,34 +597,18 @@ public class MineruMapper {
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return JSON.parse(text);
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
|
||||
}
|
||||
return MineruJsonSupport.parseValue(text, "MinerU ZIP artifact " + suffix);
|
||||
}
|
||||
|
||||
private JSONObject asObject(Object value) {
|
||||
if (value instanceof JSONObject) {
|
||||
return (JSONObject) value;
|
||||
}
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
private JSONObject asObject(Object value, String context) {
|
||||
if (value instanceof JSONArray) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseObject(JSON.toJSONString(value));
|
||||
return MineruJsonSupport.asObject(value, context);
|
||||
}
|
||||
|
||||
private JSONArray asArray(Object value) {
|
||||
if (value instanceof JSONArray) {
|
||||
return (JSONArray) value;
|
||||
}
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseArray(JSON.toJSONString(value));
|
||||
private JSONArray asArray(Object value, String context) {
|
||||
return MineruJsonSupport.asArray(value, context);
|
||||
}
|
||||
|
||||
private List<String> toStringList(JSONArray jsonArray) {
|
||||
|
||||
@@ -49,6 +49,25 @@ public class MineruMapperTest {
|
||||
Assert.assertNotNull(result.getArtifacts().getContentList());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldMapStringifiedSyncArtifacts() {
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruResultPayload payload = mapper.toResultPayload(syncPayloadWithStringifiedArtifacts());
|
||||
|
||||
ParseResponse response = mapper.toParseResponse(payload);
|
||||
|
||||
Assert.assertEquals(1, response.getResults().size());
|
||||
ParseResult result = response.getResults().get(0);
|
||||
Assert.assertEquals("# title", result.getMarkdown());
|
||||
Assert.assertFalse(result.getBlocks().isEmpty());
|
||||
Assert.assertEquals(1, result.getTables().size());
|
||||
Assert.assertEquals(2, result.getImages().size());
|
||||
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||
Assert.assertTrue(result.getArtifacts().getMiddleJson() instanceof JSONObject);
|
||||
Assert.assertTrue(result.getArtifacts().getContentList() instanceof JSONArray);
|
||||
Assert.assertEquals("plain-model-output", result.getArtifacts().getModelOutput());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldMapZipResponse() throws IOException {
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
@@ -292,6 +311,28 @@ public class MineruMapperTest {
|
||||
return payload;
|
||||
}
|
||||
|
||||
private JSONObject syncPayloadWithStringifiedArtifacts() {
|
||||
JSONObject payload = new JSONObject();
|
||||
payload.put("backend", "vlm-http-client");
|
||||
payload.put("version", "3.0.9");
|
||||
|
||||
JSONObject result = new JSONObject();
|
||||
result.put("md_content", "# title");
|
||||
result.put("middle_json", middleJson().toJSONString());
|
||||
result.put("content_list", contentList().toJSONString());
|
||||
result.put("model_output", "plain-model-output");
|
||||
|
||||
JSONObject images = new JSONObject();
|
||||
images.put("figure.png", "data:image/png;base64,ZmFrZQ==");
|
||||
images.put("table.png", "data:image/png;base64,ZmFrZQ==");
|
||||
result.put("images", images.toJSONString());
|
||||
|
||||
JSONObject results = new JSONObject();
|
||||
results.put("demo", result.toJSONString());
|
||||
payload.put("results", results);
|
||||
return payload;
|
||||
}
|
||||
|
||||
private JSONObject middleBlock(String type, String imagePath) {
|
||||
JSONObject block = new JSONObject();
|
||||
block.put("type", type);
|
||||
|
||||
@@ -81,6 +81,29 @@ public class MineruPptxDocumentParseServiceTest {
|
||||
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldSupportStringifiedMineruSlideArtifacts() throws IOException {
|
||||
RecordingClient client = new RecordingClient(defaultProperties(), true);
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruPptxDocumentParseService service = new MineruPptxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
PptxParseRequest request = new PptxParseRequest();
|
||||
request.addFile(ParseFile.of("demo.pptx", buildPptxBytes()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
|
||||
Assert.assertEquals(1, response.getResults().size());
|
||||
ParseResult result = response.getResults().get(0);
|
||||
Assert.assertTrue(result.getMarkdown().contains("slide-ocr-1"));
|
||||
Assert.assertFalse(result.getBlocks().isEmpty());
|
||||
Assert.assertEquals(2, result.getImages().size());
|
||||
}
|
||||
|
||||
private byte[] buildPptxBytes() throws IOException {
|
||||
XMLSlideShow slideShow = new XMLSlideShow();
|
||||
slideShow.setPageSize(new java.awt.Dimension(640, 360));
|
||||
@@ -117,9 +140,15 @@ public class MineruPptxDocumentParseServiceTest {
|
||||
private static class RecordingClient extends MineruClient {
|
||||
|
||||
private int parseCount;
|
||||
private final boolean stringifyArtifacts;
|
||||
|
||||
private RecordingClient(MineruProperties properties) {
|
||||
this(properties, false);
|
||||
}
|
||||
|
||||
private RecordingClient(MineruProperties properties, boolean stringifyArtifacts) {
|
||||
super(properties, new MineruMapper(properties));
|
||||
this.stringifyArtifacts = stringifyArtifacts;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -134,10 +163,10 @@ public class MineruPptxDocumentParseServiceTest {
|
||||
payload.put("version", "3.0.9");
|
||||
JSONObject result = new JSONObject();
|
||||
result.put("md_content", "slide-ocr-" + index);
|
||||
result.put("middle_json", middleJson());
|
||||
result.put("content_list", contentList(index));
|
||||
result.put("middle_json", stringifyArtifacts ? middleJson().toJSONString() : middleJson());
|
||||
result.put("content_list", stringifyArtifacts ? contentList(index).toJSONString() : contentList(index));
|
||||
JSONObject results = new JSONObject();
|
||||
results.put("slide-" + index, result);
|
||||
results.put("slide-" + index, stringifyArtifacts ? result.toJSONString() : result);
|
||||
payload.put("results", results);
|
||||
return payload;
|
||||
}
|
||||
|
||||
@@ -253,10 +253,11 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
|
||||
appendSheetHeader(extraction.markdown, sheet.getSheetName());
|
||||
|
||||
if (maxRow < 0 || maxCol <= 0) {
|
||||
extraction.markdown.append("_empty sheet_");
|
||||
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
|
||||
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
|
||||
if (!imageArtifacts.isEmpty()) {
|
||||
appendImageOnlySheet(extraction.markdown, sheet.getSheetName(), request, imageArtifacts);
|
||||
return extraction;
|
||||
}
|
||||
extraction.markdown.append("_empty sheet_");
|
||||
return extraction;
|
||||
}
|
||||
|
||||
@@ -488,6 +489,29 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
|
||||
}
|
||||
}
|
||||
|
||||
private void appendImageOnlySheet(StringBuilder markdownBuilder,
|
||||
String sheetName,
|
||||
XlsxParseRequest request,
|
||||
List<XlsxCellImageArtifact> imageArtifacts) {
|
||||
markdownBuilder.append("## ").append(sheetName).append(" 图片内容\n\n");
|
||||
for (XlsxCellImageArtifact imageArtifact : imageArtifacts) {
|
||||
markdownBuilder.append("[IMG:")
|
||||
.append(imageArtifact.getReferenceKey())
|
||||
.append("]\n\n");
|
||||
}
|
||||
if (Boolean.TRUE.equals(request.getIncludeImageAppendix())) {
|
||||
appendImageAppendix(markdownBuilder, sheetName, imageArtifacts);
|
||||
return;
|
||||
}
|
||||
for (XlsxCellImageArtifact imageArtifact : imageArtifacts) {
|
||||
markdownBuilder.append("
|
||||
.append(imageArtifact.getSourcePath())
|
||||
.append(")\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> extractMergedRanges(XSSFSheet sheet) {
|
||||
List<String> mergedRanges = new ArrayList<String>();
|
||||
for (int index = 0; index < sheet.getNumMergedRegions(); index++) {
|
||||
|
||||
@@ -138,7 +138,9 @@ public class MineruXlsxDocumentParseServiceTest {
|
||||
XlsxParseArtifact artifact = extractXlsxArtifact(result);
|
||||
|
||||
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("_empty sheet_"));
|
||||
Assert.assertFalse(result.getMarkdown().contains("_empty sheet_"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片内容"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明"));
|
||||
Assert.assertTrue(result.getMarkdown().contains(""));
|
||||
Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]"));
|
||||
@@ -147,6 +149,30 @@ public class MineruXlsxDocumentParseServiceTest {
|
||||
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldKeepMarkdownImageReferenceWhenImageAppendixDisabled() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.setIncludeImageAppendix(Boolean.FALSE);
|
||||
request.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
ParseResult result = response.getResults().get(0);
|
||||
|
||||
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片内容"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||
Assert.assertTrue(result.getMarkdown().contains(""));
|
||||
Assert.assertFalse(result.getMarkdown().contains("## Sheet1 图片说明"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
|
||||
@@ -12,7 +12,7 @@ import java.util.List;
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
@ConfigurationProperties(prefix = "easy-agents.document.mineru")
|
||||
@ConfigurationProperties(prefix = "easy-agents.document.ocr.mineru")
|
||||
public class CommonMineruDocumentProperties {
|
||||
|
||||
private String baseUrl;
|
||||
|
||||
@@ -1,119 +0,0 @@
|
||||
package com.easyagents.spring.boot.document.pdf.mineru;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* MinerU Spring Boot 配置。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
*/
|
||||
@ConfigurationProperties(prefix = "easy-agents.document.pdf.mineru")
|
||||
public class MineruDocumentProperties {
|
||||
|
||||
private String baseUrl;
|
||||
private Integer connectTimeoutMs = 3000;
|
||||
private Integer readTimeoutMs = 600000;
|
||||
private Integer writeTimeoutMs = 600000;
|
||||
private Integer pollIntervalMs = 1000;
|
||||
private Integer resultTimeoutMs = 1800000;
|
||||
private String defaultBackend = "vlm-http-client";
|
||||
private String defaultParseMethod = "auto";
|
||||
private List<String> defaultLangList = new ArrayList<String>(Arrays.asList("ch"));
|
||||
private Boolean defaultFormulaEnable = true;
|
||||
private Boolean defaultTableEnable = true;
|
||||
|
||||
public String getBaseUrl() {
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
public void setBaseUrl(String baseUrl) {
|
||||
this.baseUrl = baseUrl;
|
||||
}
|
||||
|
||||
public Integer getConnectTimeoutMs() {
|
||||
return connectTimeoutMs;
|
||||
}
|
||||
|
||||
public void setConnectTimeoutMs(Integer connectTimeoutMs) {
|
||||
this.connectTimeoutMs = connectTimeoutMs;
|
||||
}
|
||||
|
||||
public Integer getReadTimeoutMs() {
|
||||
return readTimeoutMs;
|
||||
}
|
||||
|
||||
public void setReadTimeoutMs(Integer readTimeoutMs) {
|
||||
this.readTimeoutMs = readTimeoutMs;
|
||||
}
|
||||
|
||||
public Integer getWriteTimeoutMs() {
|
||||
return writeTimeoutMs;
|
||||
}
|
||||
|
||||
public void setWriteTimeoutMs(Integer writeTimeoutMs) {
|
||||
this.writeTimeoutMs = writeTimeoutMs;
|
||||
}
|
||||
|
||||
public Integer getPollIntervalMs() {
|
||||
return pollIntervalMs;
|
||||
}
|
||||
|
||||
public void setPollIntervalMs(Integer pollIntervalMs) {
|
||||
this.pollIntervalMs = pollIntervalMs;
|
||||
}
|
||||
|
||||
public Integer getResultTimeoutMs() {
|
||||
return resultTimeoutMs;
|
||||
}
|
||||
|
||||
public void setResultTimeoutMs(Integer resultTimeoutMs) {
|
||||
this.resultTimeoutMs = resultTimeoutMs;
|
||||
}
|
||||
|
||||
public String getDefaultBackend() {
|
||||
return defaultBackend;
|
||||
}
|
||||
|
||||
public void setDefaultBackend(String defaultBackend) {
|
||||
this.defaultBackend = defaultBackend;
|
||||
}
|
||||
|
||||
public String getDefaultParseMethod() {
|
||||
return defaultParseMethod;
|
||||
}
|
||||
|
||||
public void setDefaultParseMethod(String defaultParseMethod) {
|
||||
this.defaultParseMethod = defaultParseMethod;
|
||||
}
|
||||
|
||||
public List<String> getDefaultLangList() {
|
||||
return defaultLangList;
|
||||
}
|
||||
|
||||
public void setDefaultLangList(List<String> defaultLangList) {
|
||||
this.defaultLangList = defaultLangList == null
|
||||
? new ArrayList<String>(Arrays.asList("ch"))
|
||||
: defaultLangList;
|
||||
}
|
||||
|
||||
public Boolean getDefaultFormulaEnable() {
|
||||
return defaultFormulaEnable;
|
||||
}
|
||||
|
||||
public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) {
|
||||
this.defaultFormulaEnable = defaultFormulaEnable;
|
||||
}
|
||||
|
||||
public Boolean getDefaultTableEnable() {
|
||||
return defaultTableEnable;
|
||||
}
|
||||
|
||||
public void setDefaultTableEnable(Boolean defaultTableEnable) {
|
||||
this.defaultTableEnable = defaultTableEnable;
|
||||
}
|
||||
}
|
||||
@@ -10,8 +10,11 @@ import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.beans.factory.config.BeanFactoryPostProcessor;
|
||||
import org.springframework.beans.factory.support.BeanDefinitionRegistry;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.context.annotation.Primary;
|
||||
|
||||
/**
|
||||
* MinerU PDF 文档解析自动装配。
|
||||
@@ -21,50 +24,94 @@ import org.springframework.context.annotation.Configuration;
|
||||
*/
|
||||
@Configuration(proxyBeanMethods = false)
|
||||
@ConditionalOnClass(MineruPdfDocumentParseService.class)
|
||||
@ConditionalOnProperty(prefix = "easy-agents.document.pdf", name = "provider", havingValue = "mineru")
|
||||
@EnableConfigurationProperties({MineruDocumentProperties.class, CommonMineruDocumentProperties.class})
|
||||
@ConditionalOnProperty(prefix = "easy-agents.document.ocr", name = "provider", havingValue = "mineru")
|
||||
@EnableConfigurationProperties(CommonMineruDocumentProperties.class)
|
||||
public class MineruPdfAutoConfiguration {
|
||||
|
||||
public static final String DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME = "documentParseService";
|
||||
|
||||
/**
|
||||
* 注册统一 PDF 解析服务。
|
||||
*
|
||||
* @param properties Spring Boot 配置
|
||||
* @param commonProperties Spring Boot 配置
|
||||
* @return PDF 解析服务
|
||||
*/
|
||||
@Bean
|
||||
@Primary
|
||||
@ConditionalOnMissingBean(PdfDocumentParseService.class)
|
||||
public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties,
|
||||
CommonMineruDocumentProperties commonProperties) {
|
||||
return new MineruPdfDocumentParseService(toMineruProperties(properties, commonProperties));
|
||||
public PdfDocumentParseService pdfDocumentParseService(CommonMineruDocumentProperties commonProperties) {
|
||||
return new MineruPdfDocumentParseService(toMineruProperties(commonProperties));
|
||||
}
|
||||
|
||||
/**
|
||||
* 将 PDF 服务以统一文档解析服务类型暴露,便于调用方直接按抽象注入。
|
||||
* 将默认文档解析服务名注册为 PDF 服务别名,避免重复创建同类型 Bean。
|
||||
* 这里显式走 alias,而不是第二个 {@link DocumentParseService} Bean,
|
||||
* 这样既能保持默认契约,也不会破坏按 {@link PdfDocumentParseService} 类型的唯一注入。
|
||||
*
|
||||
* @param pdfDocumentParseService PDF 解析服务
|
||||
* @return 统一文档解析服务
|
||||
* @return BeanFactory 后置处理器
|
||||
*/
|
||||
@Bean
|
||||
@ConditionalOnMissingBean(DocumentParseService.class)
|
||||
public DocumentParseService documentParseService(PdfDocumentParseService pdfDocumentParseService) {
|
||||
return pdfDocumentParseService;
|
||||
public static BeanFactoryPostProcessor defaultDocumentParseServiceAliasPostProcessor() {
|
||||
return beanFactory -> {
|
||||
if (!(beanFactory instanceof BeanDefinitionRegistry)) {
|
||||
return;
|
||||
}
|
||||
BeanDefinitionRegistry registry = (BeanDefinitionRegistry) beanFactory;
|
||||
String aliasTarget = resolveAliasTarget(beanFactory, registry);
|
||||
if (!StringUtil.hasText(aliasTarget)) {
|
||||
return;
|
||||
}
|
||||
if (registry.containsBeanDefinition(DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME)
|
||||
|| registry.isAlias(DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME)) {
|
||||
return;
|
||||
}
|
||||
if (registry.containsBeanDefinition(aliasTarget)
|
||||
&& !registry.getBeanDefinition(aliasTarget).isPrimary()) {
|
||||
registry.getBeanDefinition(aliasTarget).setPrimary(true);
|
||||
}
|
||||
registry.registerAlias(aliasTarget, DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME);
|
||||
};
|
||||
}
|
||||
|
||||
private MineruProperties toMineruProperties(MineruDocumentProperties properties,
|
||||
CommonMineruDocumentProperties commonProperties) {
|
||||
// Picks the bean name that the default document parse service alias should point to.
// Returns null when there is no PdfDocumentParseService candidate, or when there are several
// candidates and not exactly one of them has a bean definition marked primary.
private static String resolveAliasTarget(org.springframework.beans.factory.config.ConfigurableListableBeanFactory beanFactory,
|
||||
BeanDefinitionRegistry registry) {
|
||||
// Arguments: includeNonSingletons = true, allowEagerInit = false.
String[] candidateNames = beanFactory.getBeanNamesForType(PdfDocumentParseService.class, true, false);
|
||||
if (candidateNames == null || candidateNames.length == 0) {
|
||||
return null;
|
||||
}
|
||||
// A single candidate is used as-is, whether or not it is primary.
if (candidateNames.length == 1) {
|
||||
return candidateNames[0];
|
||||
}
|
||||
// Several candidates: accept only an unambiguous primary one.
String primaryBeanName = null;
|
||||
for (String candidateName : candidateNames) {
|
||||
// Names without a bean definition in this registry cannot be checked for the primary flag.
if (!registry.containsBeanDefinition(candidateName)) {
|
||||
continue;
|
||||
}
|
||||
if (!registry.getBeanDefinition(candidateName).isPrimary()) {
|
||||
continue;
|
||||
}
|
||||
// A second primary candidate makes the choice ambiguous; give up.
if (primaryBeanName != null) {
|
||||
return null;
|
||||
}
|
||||
primaryBeanName = candidateName;
|
||||
}
|
||||
// Still null here when no candidate was primary.
return primaryBeanName;
|
||||
}
|
||||
|
||||
private MineruProperties toMineruProperties(CommonMineruDocumentProperties commonProperties) {
|
||||
MineruProperties mineruProperties = new MineruProperties();
|
||||
boolean useCommon = commonProperties != null && StringUtil.hasText(commonProperties.getBaseUrl());
|
||||
mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : properties.getBaseUrl());
|
||||
mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : properties.getConnectTimeoutMs());
|
||||
mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : properties.getReadTimeoutMs());
|
||||
mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : properties.getWriteTimeoutMs());
|
||||
mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : properties.getPollIntervalMs());
|
||||
mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : properties.getResultTimeoutMs());
|
||||
mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : properties.getDefaultBackend());
|
||||
mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : properties.getDefaultParseMethod());
|
||||
mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : properties.getDefaultLangList());
|
||||
mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : properties.getDefaultFormulaEnable());
|
||||
mineruProperties.setDefaultTableEnable(useCommon ? commonProperties.getDefaultTableEnable() : properties.getDefaultTableEnable());
|
||||
mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : null);
|
||||
mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : null);
|
||||
mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : null);
|
||||
mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : null);
|
||||
mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : null);
|
||||
mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : null);
|
||||
mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : null);
|
||||
mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : null);
|
||||
mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : null);
|
||||
mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : null);
|
||||
mineruProperties.setDefaultTableEnable(useCommon ? commonProperties.getDefaultTableEnable() : null);
|
||||
return mineruProperties;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.pptx.PptxDocumentParseService;
|
||||
import com.easyagents.document.pptx.mineru.MineruPptxDocumentParseService;
|
||||
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
|
||||
import org.springframework.beans.factory.annotation.Qualifier;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
@@ -24,21 +25,24 @@ import java.util.concurrent.Executors;
|
||||
*/
|
||||
@Configuration(proxyBeanMethods = false)
|
||||
@ConditionalOnClass(MineruPptxDocumentParseService.class)
|
||||
@ConditionalOnProperty(prefix = "easy-agents.document.pptx", name = "enabled", havingValue = "true")
|
||||
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, PptxDocumentProperties.class})
|
||||
@ConditionalOnProperty(prefix = "easy-agents.document.ocr", name = "provider", havingValue = "mineru")
|
||||
@EnableConfigurationProperties(CommonMineruDocumentProperties.class)
|
||||
public class MineruPptxAutoConfiguration {
|
||||
|
||||
public static final String PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME = "pptxDocumentAsyncTaskManager";
|
||||
private static final int DEFAULT_ASYNC_THREADS = 2;
|
||||
|
||||
@Bean
|
||||
@ConditionalOnMissingBean(name = "pptxDocumentAsyncTaskManager")
|
||||
public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager(PptxDocumentProperties properties) {
|
||||
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads();
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
|
||||
@ConditionalOnMissingBean(name = PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
|
||||
public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager() {
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(DEFAULT_ASYNC_THREADS);
|
||||
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
|
||||
}
|
||||
|
||||
// Registers the MinerU-backed PPTX parse service; skipped when a PptxDocumentParseService bean already exists.
@Bean
|
||||
@ConditionalOnMissingBean(PptxDocumentParseService.class)
|
||||
public PptxDocumentParseService pptxDocumentParseService(CommonMineruDocumentProperties commonProperties,
|
||||
// Selects the task manager bean by name (PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME).
@Qualifier(PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
|
||||
DocumentAsyncTaskManager pptxDocumentAsyncTaskManager) {
|
||||
// The common MinerU properties are converted to MineruProperties before being handed to the service.
return new MineruPptxDocumentParseService(toMineruProperties(commonProperties), pptxDocumentAsyncTaskManager);
|
||||
}
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
package com.easyagents.spring.boot.document.pptx;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
/**
|
||||
* PPTX document configuration, bound from the {@code easy-agents.document.pptx} prefix.
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
@ConfigurationProperties(prefix = "easy-agents.document.pptx")
|
||||
public class PptxDocumentProperties {
|
||||
|
||||
// Whether PPTX document support is switched on; defaults to false.
private Boolean enabled = false;
|
||||
// Configured async thread count; defaults to 2.
private Integer asyncThreads = 2;
|
||||
|
||||
// Returns the enabled flag.
public Boolean getEnabled() {
|
||||
return enabled;
|
||||
}
|
||||
|
||||
// Sets the enabled flag.
public void setEnabled(Boolean enabled) {
|
||||
this.enabled = enabled;
|
||||
}
|
||||
|
||||
// Returns the configured async thread count.
public Integer getAsyncThreads() {
|
||||
return asyncThreads;
|
||||
}
|
||||
|
||||
// Sets the async thread count.
public void setAsyncThreads(Integer asyncThreads) {
|
||||
this.asyncThreads = asyncThreads;
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@ import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.xlsx.XlsxDocumentParseService;
|
||||
import com.easyagents.document.xlsx.mineru.MineruXlsxDocumentParseService;
|
||||
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
|
||||
import org.springframework.beans.factory.annotation.Qualifier;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
@@ -24,21 +25,24 @@ import java.util.concurrent.Executors;
|
||||
*/
|
||||
@Configuration(proxyBeanMethods = false)
|
||||
@ConditionalOnClass(MineruXlsxDocumentParseService.class)
|
||||
@ConditionalOnProperty(prefix = "easy-agents.document.xlsx", name = "enabled", havingValue = "true")
|
||||
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, XlsxDocumentProperties.class})
|
||||
@ConditionalOnProperty(prefix = "easy-agents.document.ocr", name = "provider", havingValue = "mineru")
|
||||
@EnableConfigurationProperties(CommonMineruDocumentProperties.class)
|
||||
public class MineruXlsxAutoConfiguration {
|
||||
|
||||
public static final String XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME = "xlsxDocumentAsyncTaskManager";
|
||||
private static final int DEFAULT_ASYNC_THREADS = 2;
|
||||
|
||||
@Bean
|
||||
@ConditionalOnMissingBean(name = "xlsxDocumentAsyncTaskManager")
|
||||
public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager(XlsxDocumentProperties properties) {
|
||||
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads();
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
|
||||
@ConditionalOnMissingBean(name = XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
|
||||
public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager() {
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(DEFAULT_ASYNC_THREADS);
|
||||
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
|
||||
}
|
||||
|
||||
// Registers the MinerU-backed XLSX parse service; skipped when an XlsxDocumentParseService bean already exists.
@Bean
|
||||
@ConditionalOnMissingBean(XlsxDocumentParseService.class)
|
||||
public XlsxDocumentParseService xlsxDocumentParseService(CommonMineruDocumentProperties commonProperties,
|
||||
// Selects the task manager bean by name (XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME).
@Qualifier(XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
|
||||
DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager) {
|
||||
// The common MinerU properties are converted to MineruProperties before being handed to the service.
return new MineruXlsxDocumentParseService(toMineruProperties(commonProperties), xlsxDocumentAsyncTaskManager);
|
||||
}
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
package com.easyagents.spring.boot.document.xlsx;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
/**
|
||||
* XLSX document configuration, bound from the {@code easy-agents.document.xlsx} prefix.
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
@ConfigurationProperties(prefix = "easy-agents.document.xlsx")
|
||||
public class XlsxDocumentProperties {
|
||||
|
||||
// Whether XLSX document support is switched on; defaults to false.
private Boolean enabled = false;
|
||||
// Configured async thread count; defaults to 2.
private Integer asyncThreads = 2;
|
||||
|
||||
// Returns the enabled flag.
public Boolean getEnabled() {
|
||||
return enabled;
|
||||
}
|
||||
|
||||
// Sets the enabled flag.
public void setEnabled(Boolean enabled) {
|
||||
this.enabled = enabled;
|
||||
}
|
||||
|
||||
// Returns the configured async thread count.
public Integer getAsyncThreads() {
|
||||
return asyncThreads;
|
||||
}
|
||||
|
||||
// Sets the async thread count.
public void setAsyncThreads(Integer asyncThreads) {
|
||||
this.asyncThreads = asyncThreads;
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,10 @@
|
||||
package com.easyagents.spring.boot.autoconfigure;
|
||||
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import com.easyagents.document.pdf.PdfDocumentParseService;
|
||||
import com.easyagents.document.pptx.PptxDocumentParseService;
|
||||
import com.easyagents.document.xlsx.XlsxDocumentParseService;
|
||||
@@ -13,6 +17,8 @@ import com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration;
|
||||
import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
import org.springframework.boot.test.context.runner.ApplicationContextRunner;
|
||||
|
||||
public class StarterConditionalAutoConfigurationTest {
|
||||
@@ -49,27 +55,105 @@ public class StarterConditionalAutoConfigurationTest {
|
||||
public void shouldCreateMineruDocumentBeansWhenConfigured() {
|
||||
contextRunner
|
||||
.withPropertyValues(
|
||||
"easy-agents.document.pdf.provider=mineru",
|
||||
"easy-agents.document.pdf.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
|
||||
"easy-agents.document.ocr.provider=mineru",
|
||||
"easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
|
||||
)
|
||||
.run(context -> {
|
||||
Assert.assertNotNull(context.getBean(PdfDocumentParseService.class));
|
||||
Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
|
||||
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
|
||||
Assert.assertNotNull(context.getBean(DocumentParseService.class));
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldCreatePptxAndXlsxBeansWhenEnabled() {
|
||||
public void shouldCreatePptxAndXlsxBeansWhenMineruOcrConfigured() {
|
||||
contextRunner
|
||||
.withPropertyValues(
|
||||
"easy-agents.document.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api",
|
||||
"easy-agents.document.pptx.enabled=true",
|
||||
"easy-agents.document.xlsx.enabled=true"
|
||||
"easy-agents.document.ocr.provider=mineru",
|
||||
"easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
|
||||
)
|
||||
.run(context -> {
|
||||
Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
|
||||
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
|
||||
Assert.assertFalse(context.containsBean("documentParseService"));
|
||||
Assert.assertNotNull(context.getBean(DocumentParseService.class));
|
||||
});
|
||||
}
|
||||
|
||||
// With the OCR provider set to mineru, the PDF, PPTX and XLSX services must all exist, and the PDF
// service must be the instance returned both for the name "documentParseService" and for the
// DocumentParseService type.
@Test
|
||||
public void shouldKeepPdfAsDefaultDocumentParseServiceWhenMineruOcrConfigured() {
|
||||
contextRunner
|
||||
.withPropertyValues(
|
||||
"easy-agents.document.ocr.provider=mineru",
|
||||
"easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
|
||||
)
|
||||
.run(context -> {
|
||||
PdfDocumentParseService pdfService = context.getBean(PdfDocumentParseService.class);
|
||||
Assert.assertNotNull(pdfService);
|
||||
Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
|
||||
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
|
||||
// Lookup by the default name must return the very same PDF service instance.
Assert.assertSame(pdfService, context.getBean("documentParseService"));
|
||||
// Lookup by the abstract type must also resolve to the PDF service instance.
Assert.assertSame(pdfService, context.getBean(DocumentParseService.class));
|
||||
});
|
||||
}
|
||||
|
||||
// A user-supplied PDF service registered under a custom bean name ("customPdfService") must still be
// reachable under the name "documentParseService" and by the DocumentParseService type.
@Test
|
||||
public void shouldAliasCustomNamedPdfServiceAsDefaultDocumentParseService() {
|
||||
// Uses a fresh runner (not the shared contextRunner field) so the custom configuration is part of the context.
new ApplicationContextRunner()
|
||||
.withUserConfiguration(CustomPdfParseServiceConfiguration.class)
|
||||
.withUserConfiguration(
|
||||
RagIngestionAutoConfiguration.class,
|
||||
OllamaAutoConfiguration.class,
|
||||
OpenSearchAutoConfiguration.class,
|
||||
MineruPdfAutoConfiguration.class,
|
||||
MineruPptxAutoConfiguration.class,
|
||||
MineruXlsxAutoConfiguration.class
|
||||
)
|
||||
.withPropertyValues(
|
||||
"easy-agents.document.ocr.provider=mineru",
|
||||
"easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
|
||||
)
|
||||
.run(context -> {
|
||||
// NOTE(review): this by-type lookup presumes exactly one PdfDocumentParseService bean, i.e. the
// auto-configured one backs off in favour of the custom bean - confirm in MineruPdfAutoConfiguration.
PdfDocumentParseService pdfService = context.getBean(PdfDocumentParseService.class);
|
||||
Assert.assertSame(pdfService, context.getBean("documentParseService"));
|
||||
Assert.assertSame(pdfService, context.getBean(DocumentParseService.class));
|
||||
});
|
||||
}
|
||||
|
||||
// Test-only configuration that contributes a PDF parse service under a non-default bean name.
@Configuration(proxyBeanMethods = false)
|
||||
static class CustomPdfParseServiceConfiguration {
|
||||
|
||||
// Registered as "customPdfService" and backed by the no-op stub.
@Bean("customPdfService")
|
||||
PdfDocumentParseService customPdfService() {
|
||||
return new NoopPdfDocumentParseService();
|
||||
}
|
||||
}
|
||||
|
||||
// Stub PdfDocumentParseService for tests: every method ignores its argument and returns a newly
// constructed, unpopulated result object.
static class NoopPdfDocumentParseService implements PdfDocumentParseService {
|
||||
|
||||
// Returns a new ParseResponse without reading the request.
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
return new ParseResponse();
|
||||
}
|
||||
|
||||
// Returns a new ParseTaskStatus without reading the request.
@Override
|
||||
public ParseTaskStatus submit(ParseRequest request) {
|
||||
return new ParseTaskStatus();
|
||||
}
|
||||
|
||||
// Returns a new ParseTaskStatus regardless of the task id.
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
return new ParseTaskStatus();
|
||||
}
|
||||
|
||||
// Returns a new ParseResponse regardless of the task id.
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
return new ParseResponse();
|
||||
}
|
||||
|
||||
// Returns a new ParseTaskInfo regardless of the task id.
@Override
|
||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||
return new ParseTaskInfo();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user