refactor: 收敛文档 OCR 配置并补齐 Office 解析兼容

- 统一 MinerU OCR 配置结构并移除分模块冗余属性类

- 补齐 JSON 字符串化结果拆包、XLSX 图片兼容与对应回归测试
This commit is contained in:
2026-04-18 13:01:17 +08:00
parent b66876d0fd
commit 56ee149e7c
15 changed files with 559 additions and 272 deletions

View File

@@ -253,10 +253,11 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
appendSheetHeader(extraction.markdown, sheet.getSheetName());
if (maxRow < 0 || maxCol <= 0) {
extraction.markdown.append("_empty sheet_");
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
if (!imageArtifacts.isEmpty()) {
appendImageOnlySheet(extraction.markdown, sheet.getSheetName(), request, imageArtifacts);
return extraction;
}
extraction.markdown.append("_empty sheet_");
return extraction;
}
@@ -488,6 +489,29 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
}
}
/**
 * Writes the markdown section for a sheet whose output consists of images only.
 *
 * <p>The section starts with a level-2 heading built from the sheet name, followed by one
 * {@code [IMG:<referenceKey>]} placeholder per artifact. What follows depends on the request:
 * when {@code getIncludeImageAppendix()} is {@code Boolean.TRUE} the work is handed to
 * {@code appendImageAppendix}; otherwise (including a {@code null} flag) each artifact is
 * written as an inline markdown image link {@code ![referenceKey](sourcePath)}.
 *
 * @param markdownBuilder buffer the section is appended to
 * @param sheetName       sheet name used in the heading and passed on to the appendix writer
 * @param request         parse request consulted for the image-appendix flag
 * @param imageArtifacts  image artifacts to render, in iteration order
 */
private void appendImageOnlySheet(StringBuilder markdownBuilder,
                                  String sheetName,
                                  XlsxParseRequest request,
                                  List<XlsxCellImageArtifact> imageArtifacts) {
    // Section heading: "## <sheetName> 图片内容" followed by a blank line.
    markdownBuilder.append("## ");
    markdownBuilder.append(sheetName);
    markdownBuilder.append(" 图片内容\n\n");

    // Placeholders are always emitted, regardless of the appendix flag.
    for (XlsxCellImageArtifact placeholderSource : imageArtifacts) {
        markdownBuilder.append("[IMG:");
        markdownBuilder.append(placeholderSource.getReferenceKey());
        markdownBuilder.append("]\n\n");
    }

    // The flag is read only after the placeholders have been written, as in the original flow.
    boolean appendixRequested = Boolean.TRUE.equals(request.getIncludeImageAppendix());
    if (appendixRequested) {
        appendImageAppendix(markdownBuilder, sheetName, imageArtifacts);
    } else {
        // No appendix: keep a plain markdown image reference for every artifact instead.
        for (XlsxCellImageArtifact linkedImage : imageArtifacts) {
            markdownBuilder.append("![");
            markdownBuilder.append(linkedImage.getReferenceKey());
            markdownBuilder.append("](");
            markdownBuilder.append(linkedImage.getSourcePath());
            markdownBuilder.append(")\n\n");
        }
    }
}
private List<String> extractMergedRanges(XSSFSheet sheet) {
List<String> mergedRanges = new ArrayList<String>();
for (int index = 0; index < sheet.getNumMergedRegions(); index++) {

View File

@@ -138,7 +138,9 @@ public class MineruXlsxDocumentParseServiceTest {
XlsxParseArtifact artifact = extractXlsxArtifact(result);
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
Assert.assertTrue(result.getMarkdown().contains("_empty sheet_"));
Assert.assertFalse(result.getMarkdown().contains("_empty sheet_"));
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片内容"));
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明"));
Assert.assertTrue(result.getMarkdown().contains("![sheet1-r2c2-001](images/sheet1/r2c2-001.png)"));
Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]"));
@@ -147,6 +149,30 @@ public class MineruXlsxDocumentParseServiceTest {
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
}
/**
 * Parses a workbook built by {@code buildWorkbookBytesWithImageOnlySheet()} with the image
 * appendix switched off and checks that the markdown still carries the image section,
 * the placeholder and an inline image link, but no appendix heading.
 */
@Test
public void shouldKeepMarkdownImageReferenceWhenImageAppendixDisabled() throws Exception {
    RecordingClient recordingClient = new RecordingClient(defaultProperties());
    MineruMapper mineruMapper = new MineruMapper(defaultProperties());
    MineruXlsxDocumentParseService parseService = new MineruXlsxDocumentParseService(
            defaultProperties(),
            recordingClient,
            mineruMapper,
            new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
    );

    // Explicitly disable the appendix before attaching the workbook.
    XlsxParseRequest parseRequest = new XlsxParseRequest();
    parseRequest.setIncludeImageAppendix(Boolean.FALSE);
    parseRequest.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet()));

    // Only the first result is inspected.
    ParseResult firstResult = parseService.parse(parseRequest).getResults().get(0);
    String markdown = firstResult.getMarkdown();

    // Image section heading and placeholder are present.
    Assert.assertTrue(markdown.contains("## Sheet1 图片内容"));
    Assert.assertTrue(markdown.contains("[IMG:sheet1-r2c2-001]"));
    // The inline markdown image reference is kept even without the appendix.
    Assert.assertTrue(markdown.contains("![sheet1-r2c2-001](images/sheet1/r2c2-001.png)"));
    // The appendix heading must not be rendered when the flag is FALSE.
    Assert.assertFalse(markdown.contains("## Sheet1 图片说明"));
}
@Test
public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception {
RecordingClient client = new RecordingClient(defaultProperties());