refactor: 收敛文档 OCR 配置并补齐 Office 解析兼容
- 统一 MinerU OCR 配置结构并移除分模块冗余属性类
- 补齐 JSON 字符串化结果拆包、XLSX 图片兼容与对应回归测试
This commit is contained in:
@@ -253,10 +253,11 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
|
||||
appendSheetHeader(extraction.markdown, sheet.getSheetName());
|
||||
|
||||
if (maxRow < 0 || maxCol <= 0) {
|
||||
extraction.markdown.append("_empty sheet_");
|
||||
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
|
||||
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
|
||||
if (!imageArtifacts.isEmpty()) {
|
||||
appendImageOnlySheet(extraction.markdown, sheet.getSheetName(), request, imageArtifacts);
|
||||
return extraction;
|
||||
}
|
||||
extraction.markdown.append("_empty sheet_");
|
||||
return extraction;
|
||||
}
|
||||
|
||||
@@ -488,6 +489,29 @@ public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseSe
|
||||
}
|
||||
}
|
||||
|
||||
private void appendImageOnlySheet(StringBuilder markdownBuilder,
|
||||
String sheetName,
|
||||
XlsxParseRequest request,
|
||||
List<XlsxCellImageArtifact> imageArtifacts) {
|
||||
markdownBuilder.append("## ").append(sheetName).append(" 图片内容\n\n");
|
||||
for (XlsxCellImageArtifact imageArtifact : imageArtifacts) {
|
||||
markdownBuilder.append("[IMG:")
|
||||
.append(imageArtifact.getReferenceKey())
|
||||
.append("]\n\n");
|
||||
}
|
||||
if (Boolean.TRUE.equals(request.getIncludeImageAppendix())) {
|
||||
appendImageAppendix(markdownBuilder, sheetName, imageArtifacts);
|
||||
return;
|
||||
}
|
||||
for (XlsxCellImageArtifact imageArtifact : imageArtifacts) {
|
||||
markdownBuilder.append("
|
||||
.append(imageArtifact.getSourcePath())
|
||||
.append(")\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> extractMergedRanges(XSSFSheet sheet) {
|
||||
List<String> mergedRanges = new ArrayList<String>();
|
||||
for (int index = 0; index < sheet.getNumMergedRegions(); index++) {
|
||||
|
||||
@@ -138,7 +138,9 @@ public class MineruXlsxDocumentParseServiceTest {
|
||||
XlsxParseArtifact artifact = extractXlsxArtifact(result);
|
||||
|
||||
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("_empty sheet_"));
|
||||
Assert.assertFalse(result.getMarkdown().contains("_empty sheet_"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片内容"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明"));
|
||||
Assert.assertTrue(result.getMarkdown().contains(""));
|
||||
Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]"));
|
||||
@@ -147,6 +149,30 @@ public class MineruXlsxDocumentParseServiceTest {
|
||||
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
|
||||
}
|
||||
|
||||
/**
 * Parses an image-only workbook with the image appendix disabled and checks the markdown:
 * the "## Sheet1 图片内容" heading and the [IMG:sheet1-r2c2-001] placeholder must be present,
 * and the "## Sheet1 图片说明" appendix heading must be absent.
 */
@Test
|
||||
public void shouldKeepMarkdownImageReferenceWhenImageAppendixDisabled() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
// Explicitly turn the appendix off; this is the condition under test.
request.setIncludeImageAppendix(Boolean.FALSE);
|
||||
request.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
ParseResult result = response.getResults().get(0);
|
||||
|
||||
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片内容"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||
// NOTE(review): the expected string is empty here, so this assertion always passes.
// Presumably the original literal was a markdown image reference that was lost from this
// listing — TODO restore it from the repository version.
Assert.assertTrue(result.getMarkdown().contains(""));
|
||||
Assert.assertFalse(result.getMarkdown().contains("## Sheet1 图片说明"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
|
||||
Reference in New Issue
Block a user