refactor: 收敛文档 OCR 配置并补齐 Office 解析兼容

- 统一 MinerU OCR 配置结构并移除分模块冗余属性类

- 补齐 JSON 字符串化结果拆包、XLSX 图片兼容与对应回归测试
This commit is contained in:
2026-04-18 13:01:17 +08:00
parent b66876d0fd
commit 56ee149e7c
15 changed files with 559 additions and 272 deletions

View File

@@ -81,6 +81,29 @@ public class MineruPptxDocumentParseServiceTest {
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
}
/**
 * Checks that a PPTX parse still yields a usable result when the stub client is
 * created with its {@code stringifyArtifacts} flag set to {@code true}.
 *
 * <p>The expectations are: exactly one parse result, markdown containing
 * {@code "slide-ocr-1"}, a non-empty block list and two images.
 *
 * @throws IOException declared by the method; presumably surfaced while building
 *         the in-memory PPTX fixture
 */
@Test
public void shouldSupportStringifiedMineruSlideArtifacts() throws IOException {
    // Each collaborator receives its own defaultProperties() instance, as before.
    RecordingClient stringifyingClient = new RecordingClient(defaultProperties(), true);
    MineruMapper slideMapper = new MineruMapper(defaultProperties());
    MineruPptxDocumentParseService parseService = new MineruPptxDocumentParseService(
            defaultProperties(),
            stringifyingClient,
            slideMapper,
            new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor()));

    // Build a request holding a single generated presentation.
    PptxParseRequest parseRequest = new PptxParseRequest();
    byte[] pptxBytes = buildPptxBytes();
    parseRequest.addFile(ParseFile.of("demo.pptx", pptxBytes));

    ParseResponse parsed = parseService.parse(parseRequest);

    Assert.assertEquals(1, parsed.getResults().size());
    ParseResult onlyResult = parsed.getResults().get(0);
    Assert.assertTrue(onlyResult.getMarkdown().contains("slide-ocr-1"));
    Assert.assertFalse(onlyResult.getBlocks().isEmpty());
    Assert.assertEquals(2, onlyResult.getImages().size());
}
private byte[] buildPptxBytes() throws IOException {
XMLSlideShow slideShow = new XMLSlideShow();
slideShow.setPageSize(new java.awt.Dimension(640, 360));
@@ -117,9 +140,15 @@ public class MineruPptxDocumentParseServiceTest {
private static class RecordingClient extends MineruClient {
private int parseCount;
private final boolean stringifyArtifacts;
/**
 * Creates a recording client with artifact stringification switched off.
 *
 * <p>Delegates to the two-argument constructor, passing {@code false} as the
 * {@code stringifyArtifacts} flag.
 *
 * @param properties MinerU properties forwarded to the two-argument constructor
 */
private RecordingClient(MineruProperties properties) {
this(properties, false);
}
/**
 * Creates a recording client and stores the stringification flag.
 *
 * @param properties MinerU properties handed to the superclass constructor and
 *        used to build the {@link MineruMapper} that is passed alongside them
 * @param stringifyArtifacts value stored in the {@code stringifyArtifacts} field
 */
private RecordingClient(MineruProperties properties, boolean stringifyArtifacts) {
// The superclass receives a fresh mapper built from the same properties.
super(properties, new MineruMapper(properties));
this.stringifyArtifacts = stringifyArtifacts;
}
@Override
@@ -134,10 +163,10 @@ public class MineruPptxDocumentParseServiceTest {
payload.put("version", "3.0.9");
JSONObject result = new JSONObject();
result.put("md_content", "slide-ocr-" + index);
result.put("middle_json", middleJson());
result.put("content_list", contentList(index));
result.put("middle_json", stringifyArtifacts ? middleJson().toJSONString() : middleJson());
result.put("content_list", stringifyArtifacts ? contentList(index).toJSONString() : contentList(index));
JSONObject results = new JSONObject();
results.put("slide-" + index, result);
results.put("slide-" + index, stringifyArtifacts ? result.toJSONString() : result);
payload.put("results", results);
return payload;
}