feat: 扩展 Office 文档解析能力

- 重构 document-core 与 MinerU 公共层,补齐 Office 异步任务基础设施

- 新增 PPTX/XLSX 解析模块与 starter 自动装配

- 补充 README 与相关测试覆盖
This commit is contained in:
2026-04-16 21:51:16 +08:00
parent 547d4f6ee0
commit b66876d0fd
66 changed files with 4015 additions and 296 deletions

View File

@@ -0,0 +1,13 @@
package com.easyagents.document.xlsx;
import com.easyagents.document.core.DocumentParseService;
import com.easyagents.document.core.entity.XlsxParseRequest;
/**
 * XLSX document parsing service.
 *
 * <p>Specializes {@link DocumentParseService} for {@link XlsxParseRequest}; it declares no
 * methods of its own.
 *
 * @author Codex
 * @since 2026-04-16
 */
public interface XlsxDocumentParseService extends DocumentParseService<XlsxParseRequest> {
}

View File

@@ -0,0 +1,17 @@
package com.easyagents.document.xlsx;
/**
 * Service provider interface (SPI) for XLSX parsing providers.
 *
 * <p>A provider is an {@link XlsxDocumentParseService} that additionally exposes an identifier.
 *
 * @author Codex
 * @since 2026-04-16
 */
public interface XlsxDocumentProvider extends XlsxDocumentParseService {
/**
 * Returns the identifier of this provider.
 *
 * @return the provider name
 */
String getProvider();
}

View File

@@ -0,0 +1,625 @@
package com.easyagents.document.xlsx.mineru;
import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
import com.easyagents.document.core.async.DocumentAsyncTaskRepository;
import com.easyagents.document.core.async.DocumentAsyncTaskUpdater;
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
import com.easyagents.document.core.mineru.MineruClient;
import com.easyagents.document.core.mineru.MineruMapper;
import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.core.entity.DocumentImage;
import com.easyagents.document.core.entity.ParseFile;
import com.easyagents.document.core.entity.ParseRequest;
import com.easyagents.document.core.entity.ParseResponse;
import com.easyagents.document.core.entity.ParseResult;
import com.easyagents.document.core.entity.XlsxParseRequest;
import com.easyagents.document.core.support.AbstractAsyncDocumentParseService;
import com.easyagents.document.xlsx.XlsxDocumentProvider;
import com.easyagents.document.xlsx.model.XlsxCellArtifact;
import com.easyagents.document.xlsx.model.XlsxCellImageArtifact;
import com.easyagents.document.xlsx.model.XlsxParseArtifact;
import com.easyagents.document.xlsx.model.XlsxRowArtifact;
import com.easyagents.document.xlsx.model.XlsxSheetArtifact;
import com.easyagents.document.xlsx.model.XlsxSheetImagesArtifact;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.FormulaEvaluator;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.xssf.usermodel.XSSFClientAnchor;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFPicture;
import org.apache.poi.xssf.usermodel.XSSFPictureData;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.ByteArrayInputStream;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* XLSX document parsing service; OCR of embedded images is delegated to MinerU.
*
* @author Codex
* @since 2026-04-16
*/
public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseService<XlsxParseRequest> implements XlsxDocumentProvider {
public static final String PROVIDER_NAME = "mineru";
private final MineruProperties properties;
private final MineruClient client;
private final MineruMapper mapper;
/**
 * Creates a service instance from configuration only.
 *
 * <p>Builds a new {@link MineruMapper} from the properties and delegates to the
 * {@code (properties, mapper)} constructor.
 *
 * @param properties MinerU configuration
 */
public MineruXlsxDocumentParseService(MineruProperties properties) {
this(properties, new MineruMapper(properties));
}
/**
 * Creates a service instance with a caller-supplied mapper.
 *
 * <p>Builds a new {@link MineruClient} and uses the task manager returned by
 * {@code defaultTaskManager()}.
 *
 * @param properties MinerU configuration
 * @param mapper MinerU mapper
 */
public MineruXlsxDocumentParseService(MineruProperties properties, MineruMapper mapper) {
this(properties, new MineruClient(properties, mapper), mapper, defaultTaskManager());
}
/**
 * Creates a service instance with a caller-supplied task manager.
 *
 * <p>Builds a new {@link MineruMapper} from the properties.
 *
 * @param properties MinerU configuration
 * @param taskManager asynchronous task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties, DocumentAsyncTaskManager taskManager) {
this(properties, new MineruMapper(properties), taskManager);
}
/**
 * Creates a service instance with a caller-supplied mapper and task manager.
 *
 * <p>Builds a new {@link MineruClient} from the properties and mapper.
 *
 * @param properties MinerU configuration
 * @param mapper MinerU mapper
 * @param taskManager asynchronous task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties,
MineruMapper mapper,
DocumentAsyncTaskManager taskManager) {
this(properties, new MineruClient(properties, mapper), mapper, taskManager);
}
/**
 * Creates a service instance from fully specified collaborators.
 *
 * <p>All other constructors delegate here. The task manager is handed to the superclass;
 * the remaining arguments are stored as-is (no null checks are performed).
 *
 * @param properties MinerU configuration
 * @param client MinerU client
 * @param mapper MinerU mapper
 * @param taskManager asynchronous task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties,
MineruClient client,
MineruMapper mapper,
DocumentAsyncTaskManager taskManager) {
super(taskManager);
this.properties = properties;
this.client = client;
this.mapper = mapper;
}
/**
 * Returns the provider identifier of this implementation.
 *
 * @return {@link #PROVIDER_NAME}
 */
@Override
public String getProvider() {
return PROVIDER_NAME;
}
/**
 * Converts the incoming request into an {@link XlsxParseRequest} and fills in a default for
 * every option the caller left unset.
 *
 * @param request the raw parse request
 * @return the normalized XLSX request
 * @throws IllegalArgumentException if the request carries no files
 */
@Override
protected XlsxParseRequest normalizeRequest(ParseRequest request) {
    XlsxParseRequest xlsxRequest = XlsxParseRequest.from(request);
    boolean hasFiles = xlsxRequest.getFiles() != null && !xlsxRequest.getFiles().isEmpty();
    if (!hasFiles) {
        throw new IllegalArgumentException("XlsxParseRequest files must not be empty");
    }

    // Backend and languages fall back to the configured MinerU defaults.
    String backend = xlsxRequest.getBackend();
    if (!StringUtil.hasText(backend)) {
        backend = properties.getDefaultBackend();
    }
    xlsxRequest.setBackend(backend);
    boolean hasLanguages = xlsxRequest.getLanguages() != null && !xlsxRequest.getLanguages().isEmpty();
    if (!hasLanguages) {
        xlsxRequest.setLanguages(new ArrayList<String>(properties.getDefaultLangList()));
    }

    // Output switches: markdown and images default to on, the raw MinerU payloads to off.
    xlsxRequest.setReturnMarkdown(flagOrDefault(xlsxRequest.getReturnMarkdown(), Boolean.TRUE));
    xlsxRequest.setReturnMiddleJson(flagOrDefault(xlsxRequest.getReturnMiddleJson(), Boolean.FALSE));
    xlsxRequest.setReturnContentList(flagOrDefault(xlsxRequest.getReturnContentList(), Boolean.FALSE));
    xlsxRequest.setReturnModelOutput(flagOrDefault(xlsxRequest.getReturnModelOutput(), Boolean.FALSE));
    xlsxRequest.setReturnImages(flagOrDefault(xlsxRequest.getReturnImages(), Boolean.TRUE));

    // XLSX-specific switches.
    xlsxRequest.setIncludeHiddenSheets(flagOrDefault(xlsxRequest.getIncludeHiddenSheets(), Boolean.FALSE));
    xlsxRequest.setOcrEmbeddedImages(flagOrDefault(xlsxRequest.getOcrEmbeddedImages(), Boolean.TRUE));
    xlsxRequest.setIncludeImageAppendix(flagOrDefault(xlsxRequest.getIncludeImageAppendix(), Boolean.TRUE));
    return xlsxRequest;
}

/**
 * Returns {@code value} unless it is {@code null}, in which case {@code fallback} is returned.
 */
private static Boolean flagOrDefault(Boolean value, Boolean fallback) {
    if (value == null) {
        return fallback;
    }
    return value;
}
/**
 * Parses every workbook in the request and assembles the combined response.
 *
 * <p>The response backend is the first non-null {@code ocrBackend} metadata value reported by
 * a workbook result, or the request backend when none has text. The temporary
 * {@code ocrBackend} metadata entry is removed from each result before it is returned.
 *
 * @param request the normalized XLSX request
 * @param updater progress sink; may be {@code null}
 * @return the response holding one result per input file
 */
@Override
protected ParseResponse doParse(XlsxParseRequest request, DocumentAsyncTaskUpdater updater) {
    final int fileCount = request.getFiles().size();
    List<ParseResult> workbookResults = new ArrayList<ParseResult>();
    String ocrBackend = null;
    int finished = 0;
    for (ParseFile workbookFile : request.getFiles()) {
        updateProgress(updater, "extracting", finished, fileCount, "正在读取工作簿结构");
        ParseResult workbookResult = parseSingleWorkbook(workbookFile, request, updater);
        finished += 1;
        if (ocrBackend == null) {
            ocrBackend = (String) workbookResult.getMetadata().get("ocrBackend");
        }
        // The metadata entry is only a transport between the two methods; do not leak it.
        workbookResult.getMetadata().remove("ocrBackend");
        workbookResults.add(workbookResult);
    }
    updateProgress(updater, "assembling", finished, fileCount, "正在汇总 XLSX 解析结果");

    ParseResponse response = new ParseResponse();
    if (StringUtil.hasText(ocrBackend)) {
        response.setBackend(ocrBackend);
    } else {
        response.setBackend(request.getBackend());
    }
    response.setResults(workbookResults);
    return response;
}
/**
 * Parses one workbook into a {@link ParseResult}.
 *
 * <p>Every selected sheet is extracted in order; its markdown is concatenated (separated by a
 * blank line) and its structured data is collected into an {@link XlsxParseArtifact} stored
 * under the {@code "xlsx"} extra-JSON artifact key. The plain text of the result is set to the
 * same string as the markdown.
 *
 * @param file the workbook to parse
 * @param request the normalized XLSX request
 * @param updater progress sink; may be {@code null}
 * @return the aggregated result for the workbook
 * @throws IllegalStateException if reading or extracting the workbook fails for any reason
 */
private ParseResult parseSingleWorkbook(ParseFile file, XlsxParseRequest request, DocumentAsyncTaskUpdater updater) {
    ParseResult workbookResult = new ParseResult();
    workbookResult.setFileName(file.getFileName());
    XlsxParseArtifact xlsxArtifact = new XlsxParseArtifact();
    xlsxArtifact.setWorkbookName(file.getFileName());

    StringBuilder markdown = new StringBuilder();
    // NOTE(review): SheetExtraction.ocrBackend is never assigned anywhere in this class, so this
    // value is always null at present and callers fall back to the request backend.
    String ocrBackend = null;
    try (XSSFWorkbook workbook = new XSSFWorkbook(new ByteArrayInputStream(file.getContent()))) {
        FormulaEvaluator evaluator = workbook.getCreationHelper().createFormulaEvaluator();
        DataFormatter formatter = new DataFormatter();
        List<Integer> selectedSheets = resolveSheetIndexes(workbook, request);
        final int sheetCount = selectedSheets.size();
        for (int position = 0; position < sheetCount; position++) {
            Integer sheetIndex = selectedSheets.get(position);
            XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
            updateProgress(updater, "extracting", position, sheetCount, "正在读取 Sheet " + sheet.getSheetName());
            SheetExtraction extracted = extractSheet(sheet, sheetIndex, formatter, evaluator, request, updater);

            xlsxArtifact.getSheets().add(extracted.sheetArtifact);
            xlsxArtifact.getCellImages().addAll(extracted.imageArtifacts);
            xlsxArtifact.getSheetImages().add(extracted.sheetImagesArtifact);
            xlsxArtifact.getMergedRanges().addAll(extracted.mergedRanges);
            workbookResult.getImages().addAll(extracted.documentImages);

            // Sheets are separated by one blank line in the combined markdown.
            if (markdown.length() > 0) {
                markdown.append("\n\n");
            }
            markdown.append(extracted.markdown);
            if (ocrBackend == null) {
                ocrBackend = extracted.ocrBackend;
            }
        }
    } catch (Exception exception) {
        throw new IllegalStateException("Failed to parse XLSX file: " + file.getFileName(), exception);
    }

    workbookResult.setMarkdown(markdown.toString().trim());
    workbookResult.setPlainText(workbookResult.getMarkdown());
    workbookResult.getArtifacts().getExtraJsonArtifacts().put("xlsx", xlsxArtifact);
    workbookResult.getMetadata().put("ocrBackend", ocrBackend);
    return workbookResult;
}
/**
 * Extracts one sheet: its embedded images, its cell grid and the markdown rendering.
 *
 * <p>The markdown always starts with the sheet header. A sheet without rows or columns is
 * rendered as {@code _empty sheet_}; otherwise the cell grid is rendered as a table and the
 * merged ranges are recorded. When the request asks for it and the sheet has images, an image
 * appendix is added at the end in both cases.
 *
 * @param sheet the sheet to read
 * @param sheetIndex index of the sheet inside its workbook
 * @param formatter formatter used to render cell values
 * @param evaluator evaluator used for formula cells
 * @param request the normalized XLSX request
 * @param updater progress sink; may be {@code null}
 * @return everything extracted from the sheet
 */
private SheetExtraction extractSheet(XSSFSheet sheet,
                                     int sheetIndex,
                                     DataFormatter formatter,
                                     FormulaEvaluator evaluator,
                                     XlsxParseRequest request,
                                     DocumentAsyncTaskUpdater updater) {
    SheetExtraction extraction = new SheetExtraction();
    extraction.sheetArtifact = new XlsxSheetArtifact();
    extraction.sheetArtifact.setSheetName(sheet.getSheetName());
    extraction.sheetArtifact.setSheetIndex(sheetIndex);
    extraction.sheetArtifact.setHidden(Boolean.valueOf(sheet.getWorkbook().isSheetHidden(sheetIndex)
            || sheet.getWorkbook().isSheetVeryHidden(sheetIndex)));
    extraction.sheetImagesArtifact = new XlsxSheetImagesArtifact();
    extraction.sheetImagesArtifact.setSheetName(sheet.getSheetName());
    extraction.sheetImagesArtifact.setSheetIndex(sheetIndex);

    // Images are extracted first because their anchors can widen the column range.
    Map<String, List<XlsxCellImageArtifact>> imagesByCell =
            registerSheetImages(extraction, extractImages(sheet, sheetIndex, request, updater));

    int maxRow = resolveMaxRow(sheet, request.getMaxRowsPerSheet());
    int maxCol = resolveMaxCol(sheet, maxRow, imagesByCell);
    extraction.sheetArtifact.setRowCount(maxRow + 1);
    extraction.sheetArtifact.setColumnCount(maxCol);

    appendSheetHeader(extraction.markdown, sheet.getSheetName());
    if (maxRow < 0 || maxCol <= 0) {
        extraction.markdown.append("_empty sheet_");
    } else {
        List<List<String>> markdownRows =
                buildSheetRows(sheet, maxRow, maxCol, formatter, evaluator, imagesByCell, extraction.sheetArtifact);
        appendMarkdownTable(extraction.markdown, markdownRows);
        extraction.mergedRanges.addAll(extractMergedRanges(sheet));
    }
    if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !extraction.imageArtifacts.isEmpty()) {
        appendImageAppendix(extraction.markdown, sheet.getSheetName(), extraction.imageArtifacts);
    }
    return extraction;
}

/**
 * Records the extracted images on the sheet extraction and groups them by anchor cell.
 *
 * @return image artifacts keyed by anchor cell reference, in encounter order
 */
private Map<String, List<XlsxCellImageArtifact>> registerSheetImages(SheetExtraction extraction,
                                                                     List<SheetImageExtraction> sheetImages) {
    Map<String, List<XlsxCellImageArtifact>> imagesByCell = new LinkedHashMap<String, List<XlsxCellImageArtifact>>();
    for (SheetImageExtraction sheetImage : sheetImages) {
        XlsxCellImageArtifact imageArtifact = sheetImage.imageArtifact;
        extraction.imageArtifacts.add(imageArtifact);
        extraction.sheetImagesArtifact.getReferenceKeys().add(imageArtifact.getReferenceKey());
        extraction.sheetImagesArtifact.getSourcePaths().add(imageArtifact.getSourcePath());
        extraction.documentImages.add(sheetImage.documentImage);

        String anchorCell = imageArtifact.getAnchorCell();
        List<XlsxCellImageArtifact> cellImages = imagesByCell.get(anchorCell);
        if (cellImages == null) {
            cellImages = new ArrayList<XlsxCellImageArtifact>();
            imagesByCell.put(anchorCell, cellImages);
        }
        cellImages.add(imageArtifact);
    }
    return imagesByCell;
}

/**
 * Reads rows {@code 0..maxRow} and columns {@code 0..maxCol-1}, adds one row artifact per row
 * to {@code sheetArtifact} and returns the markdown-escaped display values of every cell.
 */
private List<List<String>> buildSheetRows(XSSFSheet sheet,
                                          int maxRow,
                                          int maxCol,
                                          DataFormatter formatter,
                                          FormulaEvaluator evaluator,
                                          Map<String, List<XlsxCellImageArtifact>> imagesByCell,
                                          XlsxSheetArtifact sheetArtifact) {
    List<List<String>> markdownRows = new ArrayList<List<String>>();
    for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) {
        org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex);
        XlsxRowArtifact rowArtifact = new XlsxRowArtifact();
        rowArtifact.setRowIndex(rowIndex);
        List<String> rowValues = new ArrayList<String>();
        for (int colIndex = 0; colIndex < maxCol; colIndex++) {
            String cellRef = new CellReference(rowIndex, colIndex).formatAsString();
            String cellText = readCellText(row, colIndex, formatter, evaluator);
            List<XlsxCellImageArtifact> cellImages = imagesByCell.get(cellRef);
            // The markdown cell shows text plus image placeholders; the artifact keeps raw text.
            rowValues.add(escapeMarkdown(mergeDisplayValue(cellText, cellImages)));

            XlsxCellArtifact cellArtifact = new XlsxCellArtifact();
            cellArtifact.setRowIndex(rowIndex);
            cellArtifact.setColumnIndex(colIndex);
            cellArtifact.setCellRef(cellRef);
            cellArtifact.setText(cellText);
            if (cellImages != null) {
                List<String> imageKeys = new ArrayList<String>();
                for (XlsxCellImageArtifact cellImage : cellImages) {
                    imageKeys.add(cellImage.getReferenceKey());
                }
                cellArtifact.setImageKeys(imageKeys);
            }
            rowArtifact.getCells().add(cellArtifact);
        }
        sheetArtifact.getRows().add(rowArtifact);
        markdownRows.add(rowValues);
    }
    return markdownRows;
}
/**
 * Collects every picture placed on the sheet's drawing layer.
 *
 * <p>For each picture an image artifact (position, naming, optional OCR text) and a
 * {@link DocumentImage} (binary content) are produced. Shapes that are not pictures and
 * pictures without a client anchor are skipped. OCR is only run when the request enables
 * {@code ocrEmbeddedImages} and the picture has data.
 *
 * @param sheet the sheet to scan
 * @param sheetIndex index of the sheet inside its workbook
 * @param request the normalized XLSX request
 * @param updater progress sink; may be {@code null}
 * @return extracted images in drawing order; empty when the sheet has no drawing
 */
private List<SheetImageExtraction> extractImages(XSSFSheet sheet,
                                                 int sheetIndex,
                                                 XlsxParseRequest request,
                                                 DocumentAsyncTaskUpdater updater) {
    List<SheetImageExtraction> images = new ArrayList<SheetImageExtraction>();
    XSSFDrawing drawing = sheet.getDrawingPatriarch();
    if (drawing == null) {
        return images;
    }
    // Fetch the shape list once instead of re-reading it for every progress update.
    List<XSSFShape> shapes = drawing.getShapes();
    String sheetKey = buildSheetKey(sheet.getSheetName(), sheetIndex);
    int imageIndex = 0;
    for (XSSFShape shape : shapes) {
        if (!(shape instanceof XSSFPicture)) {
            continue;
        }
        imageIndex++;
        XSSFPicture picture = (XSSFPicture) shape;
        // Use the anchor as stored in the file. getPreferredSize() would recompute (and
        // overwrite) the bottom-right corner from the image's natural size, so toRow/toCol
        // would no longer describe where the picture actually sits.
        XSSFClientAnchor anchor = picture.getClientAnchor();
        if (anchor == null) {
            continue;
        }
        XSSFPictureData pictureData = picture.getPictureData();
        String extension = pictureData == null || !StringUtil.hasText(pictureData.suggestFileExtension())
                ? "png"
                : pictureData.suggestFileExtension();
        // Read the bytes once; they feed both the OCR call and the returned image.
        byte[] imageBytes = pictureData == null ? null : pictureData.getData();
        String imageName = buildImageName(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex);
        String sourcePath = buildImageSourcePath(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex, extension);

        XlsxCellImageArtifact imageArtifact = new XlsxCellImageArtifact();
        imageArtifact.setSheetName(sheet.getSheetName());
        imageArtifact.setAnchorCell(new CellReference(anchor.getRow1(), anchor.getCol1()).formatAsString());
        imageArtifact.setFromRow(anchor.getRow1());
        imageArtifact.setFromCol((int) anchor.getCol1());
        imageArtifact.setToRow(anchor.getRow2());
        imageArtifact.setToCol((int) anchor.getCol2());
        imageArtifact.setName(imageName);
        imageArtifact.setReferenceKey(imageName);
        imageArtifact.setSourcePath(sourcePath);
        if (Boolean.TRUE.equals(request.getOcrEmbeddedImages()) && pictureData != null) {
            updateProgress(updater, "ocr", imageIndex - 1, shapes.size(), "正在识别 Sheet " + sheet.getSheetName() + " 中的图片");
            imageArtifact.setOcrText(parseImageOcr(imageBytes, extension, request, imageName));
        }

        DocumentImage documentImage = new DocumentImage();
        documentImage.setName(imageName);
        documentImage.setSourcePath(sourcePath);
        documentImage.setMimeType(detectImageMimeType(sourcePath));
        documentImage.setContent(imageBytes);

        SheetImageExtraction sheetImage = new SheetImageExtraction();
        sheetImage.imageArtifact = imageArtifact;
        sheetImage.documentImage = documentImage;
        images.add(sheetImage);
    }
    return images;
}
/**
 * Runs OCR on a single embedded image through the MinerU client.
 *
 * <p>The image is sent as a markdown-only parse request that reuses the backend and
 * languages of the XLSX request.
 *
 * @param imageBytes raw image content
 * @param extension file extension of the image, without the dot
 * @param request the normalized XLSX request
 * @param imageName base name used for the uploaded file
 * @return the recognized markdown, or the plain text when no markdown is present;
 *         {@code null} when the response carries no result
 */
private String parseImageOcr(byte[] imageBytes, String extension, XlsxParseRequest request, String imageName) {
    String fileName = imageName + "." + extension;
    // "image/" + extension is wrong for common extensions (e.g. "jpg" -> "image/jpg"), so use
    // the same detection as the returned DocumentImage and only fall back to the naive form
    // when the type is unknown.
    String mimeType = detectImageMimeType(fileName);
    if (!mimeType.startsWith("image/")) {
        mimeType = "image/" + extension;
    }

    ParseRequest imageRequest = new ParseRequest();
    imageRequest.addFile(ParseFile.of(fileName, imageBytes, mimeType));
    imageRequest.setBackend(request.getBackend());
    imageRequest.setLanguages(request.getLanguages());
    imageRequest.setReturnMarkdown(true);
    imageRequest.setReturnMiddleJson(false);
    imageRequest.setReturnContentList(false);
    imageRequest.setReturnModelOutput(false);
    imageRequest.setReturnImages(false);

    ParseResponse response = mapper.toParseResponse(client.parse(imageRequest));
    if (response == null || response.getResults() == null || response.getResults().isEmpty()) {
        return null;
    }
    ParseResult result = response.getResults().get(0);
    return StringUtil.hasText(result.getMarkdown()) ? result.getMarkdown() : result.getPlainText();
}
/**
 * Determines which sheets of the workbook are to be parsed.
 *
 * <p>Hidden and very-hidden sheets are dropped unless the request includes hidden sheets.
 * When the request names specific sheets, only sheets with one of those names are kept.
 *
 * @param workbook the opened workbook
 * @param request the normalized XLSX request
 * @return workbook indexes of the selected sheets, in ascending order
 */
private List<Integer> resolveSheetIndexes(XSSFWorkbook workbook, XlsxParseRequest request) {
    List<Integer> selected = new ArrayList<Integer>();
    final int sheetCount = workbook.getNumberOfSheets();
    for (int position = 0; position < sheetCount; position++) {
        boolean hidden = workbook.isSheetHidden(position) || workbook.isSheetVeryHidden(position);
        if (hidden && !Boolean.TRUE.equals(request.getIncludeHiddenSheets())) {
            continue;
        }
        boolean filterByName = request.getSheetNames() != null && !request.getSheetNames().isEmpty();
        if (!filterByName || request.getSheetNames().contains(workbook.getSheetName(position))) {
            selected.add(position);
        }
    }
    return selected;
}
/**
 * Computes the index of the last row to read.
 *
 * @param sheet the sheet being read
 * @param maxRowsPerSheet optional row limit; {@code null} or a non-positive value means no limit
 * @return the last row index to read, or {@code -1} when the sheet reports no rows
 */
private int resolveMaxRow(XSSFSheet sheet, Integer maxRowsPerSheet) {
    final int lastRow = sheet.getLastRowNum();
    if (lastRow < 0) {
        return -1;
    }
    boolean limited = maxRowsPerSheet != null && maxRowsPerSheet > 0;
    if (limited) {
        // The limit is a row count, so the last permitted index is limit - 1.
        return Math.min(lastRow, maxRowsPerSheet - 1);
    }
    return lastRow;
}
/**
 * Computes how many columns must be read so that all cells of rows {@code 0..maxRow} and all
 * image anchor cells are covered.
 *
 * @param sheet the sheet being read
 * @param maxRow last row index to consider
 * @param imagesByCell images keyed by anchor cell reference
 * @return the column count (exclusive upper bound of column indexes); {@code 0} when nothing
 *         is found
 */
private int resolveMaxCol(XSSFSheet sheet, int maxRow, Map<String, List<XlsxCellImageArtifact>> imagesByCell) {
    int columnCount = 0;
    for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) {
        org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex);
        if (row == null) {
            continue;
        }
        columnCount = Math.max(columnCount, row.getLastCellNum());
    }
    // An image may be anchored to the right of the last populated cell.
    for (String anchorCell : imagesByCell.keySet()) {
        int anchorColumnCount = new CellReference(anchorCell).getCol() + 1;
        columnCount = Math.max(columnCount, anchorColumnCount);
    }
    return columnCount;
}
/**
 * Returns the formatted text of a cell.
 *
 * @param row the row holding the cell; may be {@code null}
 * @param colIndex column index of the cell
 * @param formatter formatter used to render the value
 * @param evaluator evaluator passed to the formatter for formula cells
 * @return the formatted value, or an empty string when the row or the cell does not exist
 */
private String readCellText(org.apache.poi.ss.usermodel.Row row, int colIndex, DataFormatter formatter, FormulaEvaluator evaluator) {
    org.apache.poi.ss.usermodel.Cell cell = (row == null) ? null : row.getCell(colIndex);
    if (cell != null) {
        return formatter.formatCellValue(cell, evaluator);
    }
    return "";
}
/**
 * Builds the value shown for a cell: its trimmed text followed by one
 * {@code [IMG:referenceKey]} placeholder per anchored image, each part on its own line.
 *
 * @param cellText raw cell text; may be blank
 * @param cellImages images anchored at the cell; may be {@code null}
 * @return the combined display value, possibly empty
 */
private String mergeDisplayValue(String cellText, List<XlsxCellImageArtifact> cellImages) {
    StringBuilder display = new StringBuilder();
    if (StringUtil.hasText(cellText)) {
        display.append(cellText.trim());
    }
    if (cellImages == null) {
        return display.toString();
    }
    for (XlsxCellImageArtifact image : cellImages) {
        // Separate parts with a newline, but never start the value with one.
        if (display.length() > 0) {
            display.append('\n');
        }
        display.append("[IMG:").append(image.getReferenceKey()).append(']');
    }
    return display.toString();
}
/**
 * Appends a level-one markdown heading with the sheet name, followed by a blank line.
 *
 * @param markdownBuilder target buffer
 * @param sheetName name of the sheet
 */
private void appendSheetHeader(StringBuilder markdownBuilder, String sheetName) {
    markdownBuilder.append("# ");
    markdownBuilder.append(sheetName);
    markdownBuilder.append("\n\n");
}
/**
 * Appends the rows as a markdown table, using the first row as the table header.
 *
 * @param markdownBuilder target buffer
 * @param rows already escaped cell values, one list per row; an empty list is rendered as
 *             {@code _empty sheet_}
 */
private void appendMarkdownTable(StringBuilder markdownBuilder, List<List<String>> rows) {
    if (rows.isEmpty()) {
        markdownBuilder.append("_empty sheet_");
        return;
    }
    boolean separatorWritten = false;
    for (List<String> row : rows) {
        markdownBuilder.append("| ").append(joinCells(row)).append(" |\n");
        if (separatorWritten) {
            continue;
        }
        // The delimiter row goes right after the first (header) row, one marker per column.
        markdownBuilder.append("|");
        int columnCount = row.size();
        for (int column = 0; column < columnCount; column++) {
            markdownBuilder.append(" --- |");
        }
        markdownBuilder.append("\n");
        separatorWritten = true;
    }
}
/**
 * Appends the image appendix of a sheet: a level-two heading followed, for every image, by
 * the markdown image link and a short list with its placeholder, anchor cell and OCR text.
 *
 * @param markdownBuilder target buffer
 * @param sheetName name of the sheet the images belong to
 * @param imageArtifacts images to describe
 */
private void appendImageAppendix(StringBuilder markdownBuilder,
                                 String sheetName,
                                 List<XlsxCellImageArtifact> imageArtifacts) {
    markdownBuilder.append("\n## ").append(sheetName).append(" 图片说明\n\n");
    for (XlsxCellImageArtifact imageArtifact : imageArtifacts) {
        markdownBuilder.append("![")
                .append(imageArtifact.getReferenceKey())
                .append("](")
                .append(imageArtifact.getSourcePath())
                .append(")\n\n");
        markdownBuilder.append("- 占位符:[IMG:")
                .append(imageArtifact.getReferenceKey())
                .append("]\n");
        markdownBuilder.append("- 锚点:")
                .append(imageArtifact.getAnchorCell())
                .append("\n");
        // Label and value are separated by a colon like the two entries above; previously the
        // OCR text was glued directly to the "OCR" label.
        markdownBuilder.append("- OCR:")
                .append(StringUtil.hasText(imageArtifact.getOcrText()) ? imageArtifact.getOcrText() : "")
                .append("\n\n");
    }
}
/**
 * Lists the merged regions of the sheet as formatted range strings.
 *
 * @param sheet the sheet to inspect
 * @return one formatted range per merged region, in region order
 */
private List<String> extractMergedRanges(XSSFSheet sheet) {
    List<String> formattedRanges = new ArrayList<String>();
    int regionIndex = 0;
    while (regionIndex < sheet.getNumMergedRegions()) {
        String formatted = sheet.getMergedRegion(regionIndex).formatAsString();
        formattedRanges.add(formatted);
        regionIndex++;
    }
    return formattedRanges;
}
/**
 * Joins cell values with the markdown column separator {@code " | "}.
 *
 * @param cells cell values of one row
 * @return the joined row content without leading or trailing pipes
 */
private String joinCells(List<String> cells) {
    StringBuilder joined = new StringBuilder();
    String separator = "";
    for (String cell : cells) {
        joined.append(separator).append(cell);
        // From the second cell on, every value is preceded by the separator.
        separator = " | ";
    }
    return joined.toString();
}
/**
 * Makes a value safe for use inside a markdown table cell: pipes are backslash-escaped,
 * carriage returns become spaces and line feeds become {@code <br/>}.
 *
 * @param text the raw value; may be {@code null} or blank
 * @return the escaped value, or an empty string when the input has no text
 */
private String escapeMarkdown(String text) {
    if (StringUtil.hasText(text)) {
        String escaped = text.replace("|", "\\|");
        escaped = escaped.replace("\r", " ");
        escaped = escaped.replace("\n", "<br/>");
        return escaped;
    }
    return "";
}
/**
 * Builds the image name {@code <sheetKey>-r<row>c<col>-<index>} with one-based row and column
 * numbers and a zero-padded image index.
 *
 * @param sheetKey normalized sheet key
 * @param rowIndex zero-based anchor row
 * @param colIndex zero-based anchor column
 * @param imageIndex running index of the image on its sheet
 * @return the image name
 */
private String buildImageName(String sheetKey, int rowIndex, int colIndex, int imageIndex) {
    StringBuilder name = new StringBuilder();
    name.append(sheetKey);
    name.append("-r").append(rowIndex + 1);
    name.append("c").append(colIndex + 1);
    name.append("-").append(formatIndex(imageIndex));
    return name.toString();
}
/**
 * Builds the image path {@code images/<sheetKey>/r<row>c<col>-<index>.<extension>} with
 * one-based row and column numbers and a zero-padded image index.
 *
 * @param sheetKey normalized sheet key
 * @param rowIndex zero-based anchor row
 * @param colIndex zero-based anchor column
 * @param imageIndex running index of the image on its sheet
 * @param extension file extension without the dot
 * @return the relative source path of the image
 */
private String buildImageSourcePath(String sheetKey, int rowIndex, int colIndex, int imageIndex, String extension) {
    StringBuilder path = new StringBuilder("images/");
    path.append(sheetKey);
    path.append("/r").append(rowIndex + 1);
    path.append("c").append(colIndex + 1);
    path.append("-").append(formatIndex(imageIndex));
    path.append(".").append(extension);
    return path.toString();
}
/**
 * Derives a key from the sheet name for use in image names and paths.
 *
 * <p>The name is lower-cased; ASCII letters and digits are kept, every other character is
 * replaced by {@code u<4-digit hex code>} delimited by dashes. Leading and trailing dashes
 * are removed. When no usable name remains, {@code sheet-<one-based index>} is returned.
 *
 * @param sheetName the sheet name; may be blank
 * @param sheetIndex zero-based index of the sheet
 * @return the sheet key
 */
private String buildSheetKey(String sheetName, int sheetIndex) {
    if (!StringUtil.hasText(sheetName)) {
        return "sheet-" + formatIndex(sheetIndex + 1);
    }
    StringBuilder key = new StringBuilder();
    for (char current : sheetName.toLowerCase(Locale.ROOT).toCharArray()) {
        boolean asciiLetter = current >= 'a' && current <= 'z';
        boolean asciiDigit = current >= '0' && current <= '9';
        if (asciiLetter || asciiDigit) {
            key.append(current);
        } else {
            // Open a dash-delimited escape group unless the previous group already ended in one.
            int length = key.length();
            if (length > 0 && key.charAt(length - 1) != '-') {
                key.append('-');
            }
            key.append('u');
            key.append(String.format(Locale.ROOT, "%04x", (int) current));
            key.append('-');
        }
    }

    // Strip dashes from both ends.
    int start = 0;
    int end = key.length();
    while (start < end && key.charAt(start) == '-') {
        start++;
    }
    while (end > start && key.charAt(end - 1) == '-') {
        end--;
    }
    String normalized = key.substring(start, end);
    if (StringUtil.hasText(normalized)) {
        return normalized;
    }
    return "sheet-" + formatIndex(sheetIndex + 1);
}
/**
 * Formats an index with at least three digits, padding with leading zeros.
 *
 * @param index the index to format; values below one are treated as one
 * @return the zero-padded index, e.g. {@code 001}, {@code 042}, {@code 1234}
 */
private String formatIndex(int index) {
    int displayIndex = Math.max(index, 1);
    return String.format(Locale.ROOT, "%03d", displayIndex);
}
/**
 * Determines the MIME type of an image from its file name.
 *
 * <p>The JDK's name-based guess is tried first; a small extension table covers common image
 * types the JDK does not recognize.
 *
 * @param path file name or path of the image; may be blank
 * @return the detected MIME type, or {@code application/octet-stream} when unknown
 */
private String detectImageMimeType(String path) {
    final String unknownType = "application/octet-stream";
    if (!StringUtil.hasText(path)) {
        return unknownType;
    }
    String guessed = URLConnection.guessContentTypeFromName(path);
    if (StringUtil.hasText(guessed)) {
        return guessed;
    }
    String[][] knownSuffixes = {
            {".jpg", "image/jpeg"},
            {".jpeg", "image/jpeg"},
            {".png", "image/png"},
            {".gif", "image/gif"},
            {".bmp", "image/bmp"},
            {".webp", "image/webp"},
    };
    String normalizedPath = path.toLowerCase(Locale.ROOT);
    for (String[] entry : knownSuffixes) {
        if (normalizedPath.endsWith(entry[0])) {
            return entry[1];
        }
    }
    return unknownType;
}
/**
 * Reports progress to the updater, if there is one.
 *
 * <p>The percentage is derived from {@code processedItems / totalItems} and capped at 99.
 *
 * @param updater progress sink; nothing happens when {@code null}
 * @param stage name of the current stage
 * @param processedItems number of items finished so far
 * @param totalItems total number of items; values below one are treated as one for the
 *                   percentage only
 * @param message progress message
 */
private void updateProgress(DocumentAsyncTaskUpdater updater,
                            String stage,
                            int processedItems,
                            int totalItems,
                            String message) {
    if (updater != null) {
        // Guard against division by zero when there is nothing to process.
        int divisor = totalItems > 0 ? totalItems : 1;
        long rounded = Math.round(processedItems * 100.0d / divisor);
        int percent = (int) Math.min(99, rounded);
        updater.update(stage, percent, processedItems, totalItems, message);
    }
}
/**
 * Creates the task manager used when the caller does not supply one: an in-memory task
 * repository and a fixed pool of two worker threads.
 *
 * <p>The workers are daemon threads. This pool is private to the service and is never shut
 * down, so non-daemon workers would keep the JVM alive after the application has finished.
 *
 * @return a new task manager
 */
private static DocumentAsyncTaskManager defaultTaskManager() {
    DocumentAsyncTaskRepository repository = new InMemoryDocumentAsyncTaskRepository();
    ExecutorService executorService = Executors.newFixedThreadPool(2, new java.util.concurrent.ThreadFactory() {
        private final java.util.concurrent.ThreadFactory delegate = Executors.defaultThreadFactory();

        @Override
        public Thread newThread(Runnable task) {
            Thread worker = delegate.newThread(task);
            worker.setDaemon(true);
            return worker;
        }
    });
    return new DocumentAsyncTaskManager(repository, executorService);
}
/**
 * Mutable holder for everything extracted from one sheet.
 */
private static class SheetExtraction {
// Markdown rendering of the sheet (header, table or empty marker, optional image appendix).
private final StringBuilder markdown = new StringBuilder();
// Image artifacts of the sheet, in extraction order.
private final List<XlsxCellImageArtifact> imageArtifacts = new ArrayList<XlsxCellImageArtifact>();
// Binary images of the sheet, parallel to imageArtifacts.
private final List<DocumentImage> documentImages = new ArrayList<DocumentImage>();
// Formatted merged ranges of the sheet.
private final List<String> mergedRanges = new ArrayList<String>();
// Structured sheet data (rows and cells).
private XlsxSheetArtifact sheetArtifact;
// Per-sheet index of image reference keys and source paths.
private XlsxSheetImagesArtifact sheetImagesArtifact;
// NOTE(review): read by parseSingleWorkbook but never assigned in this class, so it is
// always null at present.
private String ocrBackend;
}
/**
 * Pairs the structured artifact of an extracted image with its binary document image.
 */
private static class SheetImageExtraction {
// Position, naming and OCR text of the image.
private XlsxCellImageArtifact imageArtifact;
// Name, path, MIME type and content of the image.
private DocumentImage documentImage;
}
}

View File

@@ -0,0 +1,59 @@
package com.easyagents.document.xlsx.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Structured artifact describing a single worksheet cell.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxCellArtifact {
    /** Row index of the cell. */
    private Integer rowIndex;
    /** Column index of the cell. */
    private Integer columnIndex;
    /** Cell reference string of the cell. */
    private String cellRef;
    /** Text content of the cell. */
    private String text;
    /** Reference keys of the images attached to the cell; never {@code null}. */
    private List<String> imageKeys = new ArrayList<String>();

    /** @return the row index */
    public Integer getRowIndex() {
        return this.rowIndex;
    }

    /** @param rowIndex the row index */
    public void setRowIndex(final Integer rowIndex) {
        this.rowIndex = rowIndex;
    }

    /** @return the column index */
    public Integer getColumnIndex() {
        return this.columnIndex;
    }

    /** @param columnIndex the column index */
    public void setColumnIndex(final Integer columnIndex) {
        this.columnIndex = columnIndex;
    }

    /** @return the cell reference string */
    public String getCellRef() {
        return this.cellRef;
    }

    /** @param cellRef the cell reference string */
    public void setCellRef(final String cellRef) {
        this.cellRef = cellRef;
    }

    /** @return the cell text */
    public String getText() {
        return this.text;
    }

    /** @param text the cell text */
    public void setText(final String text) {
        this.text = text;
    }

    /** @return the image reference keys; never {@code null} */
    public List<String> getImageKeys() {
        return this.imageKeys;
    }

    /**
     * Replaces the image reference keys.
     *
     * @param imageKeys the new keys; {@code null} resets to a new empty list
     */
    public void setImageKeys(final List<String> imageKeys) {
        if (imageKeys == null) {
            this.imageKeys = new ArrayList<String>();
            return;
        }
        this.imageKeys = imageKeys;
    }
}

View File

@@ -0,0 +1,101 @@
package com.easyagents.document.xlsx.model;
/**
 * Structured artifact describing an image anchored to a worksheet cell.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxCellImageArtifact {
    /** Name of the sheet the image belongs to. */
    private String sheetName;
    /** Reference of the cell the image is anchored to. */
    private String anchorCell;
    /** First row covered by the image anchor. */
    private Integer fromRow;
    /** First column covered by the image anchor. */
    private Integer fromCol;
    /** Last row covered by the image anchor. */
    private Integer toRow;
    /** Last column covered by the image anchor. */
    private Integer toCol;
    /** Name of the image. */
    private String name;
    /** Key used to reference the image from cell content. */
    private String referenceKey;
    /** Source path of the image. */
    private String sourcePath;
    /** Text recognized in the image by OCR. */
    private String ocrText;

    /** @return the sheet name */
    public String getSheetName() {
        return this.sheetName;
    }

    /** @param sheetName the sheet name */
    public void setSheetName(final String sheetName) {
        this.sheetName = sheetName;
    }

    /** @return the anchor cell reference */
    public String getAnchorCell() {
        return this.anchorCell;
    }

    /** @param anchorCell the anchor cell reference */
    public void setAnchorCell(final String anchorCell) {
        this.anchorCell = anchorCell;
    }

    /** @return the first anchor row */
    public Integer getFromRow() {
        return this.fromRow;
    }

    /** @param fromRow the first anchor row */
    public void setFromRow(final Integer fromRow) {
        this.fromRow = fromRow;
    }

    /** @return the first anchor column */
    public Integer getFromCol() {
        return this.fromCol;
    }

    /** @param fromCol the first anchor column */
    public void setFromCol(final Integer fromCol) {
        this.fromCol = fromCol;
    }

    /** @return the last anchor row */
    public Integer getToRow() {
        return this.toRow;
    }

    /** @param toRow the last anchor row */
    public void setToRow(final Integer toRow) {
        this.toRow = toRow;
    }

    /** @return the last anchor column */
    public Integer getToCol() {
        return this.toCol;
    }

    /** @param toCol the last anchor column */
    public void setToCol(final Integer toCol) {
        this.toCol = toCol;
    }

    /** @return the image name */
    public String getName() {
        return this.name;
    }

    /** @param name the image name */
    public void setName(final String name) {
        this.name = name;
    }

    /** @return the reference key */
    public String getReferenceKey() {
        return this.referenceKey;
    }

    /** @param referenceKey the reference key */
    public void setReferenceKey(final String referenceKey) {
        this.referenceKey = referenceKey;
    }

    /** @return the source path */
    public String getSourcePath() {
        return this.sourcePath;
    }

    /** @param sourcePath the source path */
    public void setSourcePath(final String sourcePath) {
        this.sourcePath = sourcePath;
    }

    /** @return the OCR text */
    public String getOcrText() {
        return this.ocrText;
    }

    /** @param ocrText the OCR text */
    public void setOcrText(final String ocrText) {
        this.ocrText = ocrText;
    }
}

View File

@@ -0,0 +1,59 @@
package com.easyagents.document.xlsx.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Structured artifact for a parsed XLSX workbook.
 *
 * <p>All list properties are never {@code null}: passing {@code null} to a list setter
 * resets the property to a new empty list.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxParseArtifact {
    /** Name of the workbook. */
    private String workbookName;
    /** Sheet artifacts of the workbook. */
    private List<XlsxSheetArtifact> sheets = new ArrayList<XlsxSheetArtifact>();
    /** Per-sheet image indexes. */
    private List<XlsxSheetImagesArtifact> sheetImages = new ArrayList<XlsxSheetImagesArtifact>();
    /** Formatted merged ranges. */
    private List<String> mergedRanges = new ArrayList<String>();
    /** Image artifacts of the workbook. */
    private List<XlsxCellImageArtifact> cellImages = new ArrayList<XlsxCellImageArtifact>();

    /** Returns {@code values}, or a new empty list when {@code values} is {@code null}. */
    private static <T> List<T> orEmpty(List<T> values) {
        if (values != null) {
            return values;
        }
        return new ArrayList<T>();
    }

    /** @return the workbook name */
    public String getWorkbookName() {
        return this.workbookName;
    }

    /** @param workbookName the workbook name */
    public void setWorkbookName(String workbookName) {
        this.workbookName = workbookName;
    }

    /** @return the sheet artifacts; never {@code null} */
    public List<XlsxSheetArtifact> getSheets() {
        return this.sheets;
    }

    /** @param sheets the sheet artifacts; {@code null} resets to an empty list */
    public void setSheets(List<XlsxSheetArtifact> sheets) {
        this.sheets = orEmpty(sheets);
    }

    /** @return the per-sheet image indexes; never {@code null} */
    public List<XlsxSheetImagesArtifact> getSheetImages() {
        return this.sheetImages;
    }

    /** @param sheetImages the per-sheet image indexes; {@code null} resets to an empty list */
    public void setSheetImages(List<XlsxSheetImagesArtifact> sheetImages) {
        this.sheetImages = orEmpty(sheetImages);
    }

    /** @return the merged ranges; never {@code null} */
    public List<String> getMergedRanges() {
        return this.mergedRanges;
    }

    /** @param mergedRanges the merged ranges; {@code null} resets to an empty list */
    public void setMergedRanges(List<String> mergedRanges) {
        this.mergedRanges = orEmpty(mergedRanges);
    }

    /** @return the image artifacts; never {@code null} */
    public List<XlsxCellImageArtifact> getCellImages() {
        return this.cellImages;
    }

    /** @param cellImages the image artifacts; {@code null} resets to an empty list */
    public void setCellImages(List<XlsxCellImageArtifact> cellImages) {
        this.cellImages = orEmpty(cellImages);
    }
}

View File

@@ -0,0 +1,32 @@
package com.easyagents.document.xlsx.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Structured artifact describing a single worksheet row.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxRowArtifact {
    /** Index of the row. */
    private Integer rowIndex;
    /** Cells of the row; never {@code null}. */
    private List<XlsxCellArtifact> cells = new ArrayList<XlsxCellArtifact>();

    /** @return the row index */
    public Integer getRowIndex() {
        return this.rowIndex;
    }

    /** @param rowIndex the row index */
    public void setRowIndex(final Integer rowIndex) {
        this.rowIndex = rowIndex;
    }

    /** @return the cells of the row; never {@code null} */
    public List<XlsxCellArtifact> getCells() {
        return this.cells;
    }

    /**
     * Replaces the cells of the row.
     *
     * @param cells the new cells; {@code null} resets to a new empty list
     */
    public void setCells(final List<XlsxCellArtifact> cells) {
        if (cells == null) {
            this.cells = new ArrayList<XlsxCellArtifact>();
            return;
        }
        this.cells = cells;
    }
}

View File

@@ -0,0 +1,68 @@
package com.easyagents.document.xlsx.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Structured artifact describing a single worksheet.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxSheetArtifact {
    /** Name of the sheet. */
    private String sheetName;
    /** Index of the sheet inside its workbook. */
    private Integer sheetIndex;
    /** Whether the sheet is hidden. */
    private Boolean hidden;
    /** Number of rows of the sheet. */
    private Integer rowCount;
    /** Number of columns of the sheet. */
    private Integer columnCount;
    /** Rows of the sheet; never {@code null}. */
    private List<XlsxRowArtifact> rows = new ArrayList<XlsxRowArtifact>();

    /** @return the sheet name */
    public String getSheetName() {
        return this.sheetName;
    }

    /** @param sheetName the sheet name */
    public void setSheetName(final String sheetName) {
        this.sheetName = sheetName;
    }

    /** @return the sheet index */
    public Integer getSheetIndex() {
        return this.sheetIndex;
    }

    /** @param sheetIndex the sheet index */
    public void setSheetIndex(final Integer sheetIndex) {
        this.sheetIndex = sheetIndex;
    }

    /** @return whether the sheet is hidden */
    public Boolean getHidden() {
        return this.hidden;
    }

    /** @param hidden whether the sheet is hidden */
    public void setHidden(final Boolean hidden) {
        this.hidden = hidden;
    }

    /** @return the row count */
    public Integer getRowCount() {
        return this.rowCount;
    }

    /** @param rowCount the row count */
    public void setRowCount(final Integer rowCount) {
        this.rowCount = rowCount;
    }

    /** @return the column count */
    public Integer getColumnCount() {
        return this.columnCount;
    }

    /** @param columnCount the column count */
    public void setColumnCount(final Integer columnCount) {
        this.columnCount = columnCount;
    }

    /** @return the rows of the sheet; never {@code null} */
    public List<XlsxRowArtifact> getRows() {
        return this.rows;
    }

    /**
     * Replaces the rows of the sheet.
     *
     * @param rows the new rows; {@code null} resets to a new empty list
     */
    public void setRows(final List<XlsxRowArtifact> rows) {
        if (rows == null) {
            this.rows = new ArrayList<XlsxRowArtifact>();
            return;
        }
        this.rows = rows;
    }
}

View File

@@ -0,0 +1,50 @@
package com.easyagents.document.xlsx.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Per-sheet index of the images found on a worksheet.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxSheetImagesArtifact {
    /** Name of the sheet. */
    private String sheetName;
    /** Index of the sheet inside its workbook. */
    private Integer sheetIndex;
    /** Reference keys of the sheet's images; never {@code null}. */
    private List<String> referenceKeys = new ArrayList<String>();
    /** Source paths of the sheet's images; never {@code null}. */
    private List<String> sourcePaths = new ArrayList<String>();

    /** Returns {@code values}, or a new empty list when {@code values} is {@code null}. */
    private static List<String> orEmpty(List<String> values) {
        if (values != null) {
            return values;
        }
        return new ArrayList<String>();
    }

    /** @return the sheet name */
    public String getSheetName() {
        return this.sheetName;
    }

    /** @param sheetName the sheet name */
    public void setSheetName(final String sheetName) {
        this.sheetName = sheetName;
    }

    /** @return the sheet index */
    public Integer getSheetIndex() {
        return this.sheetIndex;
    }

    /** @param sheetIndex the sheet index */
    public void setSheetIndex(final Integer sheetIndex) {
        this.sheetIndex = sheetIndex;
    }

    /** @return the image reference keys; never {@code null} */
    public List<String> getReferenceKeys() {
        return this.referenceKeys;
    }

    /** @param referenceKeys the image reference keys; {@code null} resets to an empty list */
    public void setReferenceKeys(final List<String> referenceKeys) {
        this.referenceKeys = orEmpty(referenceKeys);
    }

    /** @return the image source paths; never {@code null} */
    public List<String> getSourcePaths() {
        return this.sourcePaths;
    }

    /** @param sourcePaths the image source paths; {@code null} resets to an empty list */
    public void setSourcePaths(final List<String> sourcePaths) {
        this.sourcePaths = orEmpty(sourcePaths);
    }
}

View File

@@ -0,0 +1,333 @@
package com.easyagents.document.xlsx.mineru;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
import com.easyagents.document.core.mineru.MineruClient;
import com.easyagents.document.core.mineru.MineruMapper;
import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.core.mineru.MineruResultPayload;
import com.easyagents.document.core.entity.ParseArtifacts;
import com.easyagents.document.core.entity.ParseFile;
import com.easyagents.document.core.entity.ParseResponse;
import com.easyagents.document.core.entity.ParseResult;
import com.easyagents.document.core.entity.ParseTaskInfo;
import com.easyagents.document.core.entity.ParseTaskStatus;
import com.easyagents.document.core.entity.XlsxParseRequest;
import com.easyagents.document.core.exception.DocumentParseException;
import com.easyagents.document.xlsx.model.XlsxParseArtifact;
import org.apache.poi.ss.usermodel.ClientAnchor;
import org.apache.poi.xssf.usermodel.XSSFDrawing;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.Assert;
import org.junit.Test;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.util.ArrayDeque;
import java.util.Queue;
import java.util.concurrent.Executor;
/**
* XLSX MinerU 服务测试。
*
* @author Codex
* @since 2026-04-16
*/
public class MineruXlsxDocumentParseServiceTest {
/**
 * Parses a workbook containing one picture and checks the generated Markdown (sheet heading,
 * image placeholder, image path, stubbed description), the extracted image, and the XLSX
 * artifact entries for that image.
 */
@Test
public void shouldBuildMarkdownAndImageArtifacts() throws Exception {
    RecordingClient stubClient = new RecordingClient(defaultProperties());
    MineruMapper payloadMapper = new MineruMapper(defaultProperties());
    DocumentAsyncTaskManager taskManager =
            new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor());
    MineruXlsxDocumentParseService parseService = new MineruXlsxDocumentParseService(
            defaultProperties(), stubClient, payloadMapper, taskManager);

    XlsxParseRequest parseRequest = new XlsxParseRequest();
    parseRequest.addFile(ParseFile.of("demo.xlsx", buildWorkbookBytes()));
    ParseResponse parsed = parseService.parse(parseRequest);

    Assert.assertEquals(1, parsed.getResults().size());
    ParseResult first = parsed.getResults().get(0);

    // Markdown must carry the sheet heading, the placeholder, the image path and the description.
    String markdown = first.getMarkdown();
    Assert.assertTrue(markdown.contains("# Sheet1"));
    Assert.assertTrue(markdown.contains("[IMG:sheet1-r2c2-001]"));
    Assert.assertTrue(markdown.contains("images/sheet1/r2c2-001.png"));
    Assert.assertTrue(markdown.contains("图片文字描述"));

    Assert.assertEquals(1, first.getImages().size());
    Assert.assertNotNull(first.getImages().get(0).getContent());

    XlsxParseArtifact xlsx = extractXlsxArtifact(first);
    Assert.assertEquals("demo.xlsx", xlsx.getWorkbookName());
    Assert.assertEquals(1, xlsx.getSheets().size());
    Assert.assertEquals(1, xlsx.getSheetImages().size());
    Assert.assertEquals(1, xlsx.getCellImages().size());
    Assert.assertEquals("sheet1-r2c2-001", xlsx.getCellImages().get(0).getReferenceKey());
    Assert.assertEquals("images/sheet1/r2c2-001.png", xlsx.getCellImages().get(0).getSourcePath());
    Assert.assertEquals("sheet1-r2c2-001", xlsx.getSheetImages().get(0).getReferenceKeys().get(0));
    Assert.assertEquals("images/sheet1/r2c2-001.png", xlsx.getSheetImages().get(0).getSourcePaths().get(0));
}
/**
 * Parses a workbook whose two sheets have non-ASCII names and one picture each, and checks
 * that the two images get distinct names and source paths and that both placeholders
 * appear in the Markdown.
 */
@Test
public void shouldKeepImageKeysUniqueForNonAsciiSheetNames() throws Exception {
    RecordingClient stubClient = new RecordingClient(defaultProperties());
    MineruMapper payloadMapper = new MineruMapper(defaultProperties());
    DocumentAsyncTaskManager taskManager =
            new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor());
    MineruXlsxDocumentParseService parseService = new MineruXlsxDocumentParseService(
            defaultProperties(), stubClient, payloadMapper, taskManager);

    XlsxParseRequest parseRequest = new XlsxParseRequest();
    parseRequest.addFile(ParseFile.of("unicode-sheets.xlsx", buildWorkbookBytesWithUnicodeSheetNames()));
    ParseResponse parsed = parseService.parse(parseRequest);
    ParseResult first = parsed.getResults().get(0);

    Assert.assertEquals(2, first.getImages().size());
    Assert.assertNotEquals(first.getImages().get(0).getName(), first.getImages().get(1).getName());
    Assert.assertNotEquals(first.getImages().get(0).getSourcePath(), first.getImages().get(1).getSourcePath());

    // Each image name must show up as a placeholder in the Markdown.
    String firstPlaceholder = "[IMG:" + first.getImages().get(0).getName() + "]";
    Assert.assertTrue(first.getMarkdown().contains(firstPlaceholder));
    String secondPlaceholder = "[IMG:" + first.getImages().get(1).getName() + "]";
    Assert.assertTrue(first.getMarkdown().contains(secondPlaceholder));
}
/**
 * Parses a workbook with one JPEG picture and checks that the extracted image reports
 * the {@code image/jpeg} MIME type.
 */
@Test
public void shouldDetectJpegMimeType() throws Exception {
    RecordingClient stubClient = new RecordingClient(defaultProperties());
    MineruMapper payloadMapper = new MineruMapper(defaultProperties());
    DocumentAsyncTaskManager taskManager =
            new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor());
    MineruXlsxDocumentParseService parseService = new MineruXlsxDocumentParseService(
            defaultProperties(), stubClient, payloadMapper, taskManager);

    XlsxParseRequest parseRequest = new XlsxParseRequest();
    parseRequest.addFile(ParseFile.of("jpeg.xlsx", buildWorkbookBytesWithJpegImage()));
    ParseResponse parsed = parseService.parse(parseRequest);
    ParseResult first = parsed.getResults().get(0);

    Assert.assertEquals(1, first.getImages().size());
    Assert.assertEquals("image/jpeg", first.getImages().get(0).getMimeType());
}
/**
 * Parses a workbook whose only sheet has a picture but no cell values, and checks that the
 * Markdown shows the empty-sheet marker plus an image section with the image link and
 * placeholder, and that the image is present in the result and in the XLSX artifact.
 */
@Test
public void shouldAppendImageReferenceForImageOnlySheet() throws Exception {
    RecordingClient stubClient = new RecordingClient(defaultProperties());
    MineruMapper payloadMapper = new MineruMapper(defaultProperties());
    DocumentAsyncTaskManager taskManager =
            new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor());
    MineruXlsxDocumentParseService parseService = new MineruXlsxDocumentParseService(
            defaultProperties(), stubClient, payloadMapper, taskManager);

    XlsxParseRequest parseRequest = new XlsxParseRequest();
    parseRequest.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet()));
    ParseResponse parsed = parseService.parse(parseRequest);
    ParseResult first = parsed.getResults().get(0);
    XlsxParseArtifact xlsx = extractXlsxArtifact(first);

    String markdown = first.getMarkdown();
    Assert.assertTrue(markdown.contains("# Sheet1"));
    Assert.assertTrue(markdown.contains("_empty sheet_"));
    Assert.assertTrue(markdown.contains("## Sheet1 图片说明"));
    Assert.assertTrue(markdown.contains("![sheet1-r2c2-001](images/sheet1/r2c2-001.png)"));
    Assert.assertTrue(markdown.contains("- 占位符:[IMG:sheet1-r2c2-001]"));

    Assert.assertEquals(1, first.getImages().size());
    Assert.assertEquals(1, xlsx.getSheetImages().size());
    Assert.assertEquals("sheet1-r2c2-001", xlsx.getSheetImages().get(0).getReferenceKeys().get(0));
}
/**
 * Submits a parse request with a manually driven executor and checks the task state before
 * and after the queued work runs: queued with no result first (querying the result throws),
 * then completed at 100% with the result exposed.
 */
@Test
public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception {
    RecordingClient stubClient = new RecordingClient(defaultProperties());
    MineruMapper payloadMapper = new MineruMapper(defaultProperties());
    ManualExecutor manualExecutor = new ManualExecutor();
    DocumentAsyncTaskManager taskManager =
            new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), manualExecutor);
    MineruXlsxDocumentParseService parseService = new MineruXlsxDocumentParseService(
            defaultProperties(), stubClient, payloadMapper, taskManager);

    XlsxParseRequest parseRequest = new XlsxParseRequest();
    parseRequest.addFile(ParseFile.of("async.xlsx", buildWorkbookBytes()));

    // Nothing has executed yet: the task only sits in the manual executor's queue.
    ParseTaskStatus queued = parseService.submit(parseRequest);
    Assert.assertEquals("queued", queued.getStatus());
    Assert.assertEquals("queued", queued.getCurrentStage());
    Assert.assertEquals(Integer.valueOf(0), queued.getProgressPercent());

    ParseTaskInfo infoBeforeRun = parseService.queryTaskInfo(queued.getTaskId());
    Assert.assertNull(infoBeforeRun.getResult());

    try {
        parseService.queryResult(queued.getTaskId());
        Assert.fail("任务未完成时应抛出异常");
    } catch (DocumentParseException notReady) {
        Assert.assertTrue(notReady.getMessage().contains(queued.getTaskId()));
    }

    // Run the queued task, then verify the completed state.
    manualExecutor.runNext();

    ParseTaskStatus finished = parseService.queryTask(queued.getTaskId());
    Assert.assertEquals("completed", finished.getStatus());
    Assert.assertEquals("completed", finished.getCurrentStage());
    Assert.assertEquals(Integer.valueOf(100), finished.getProgressPercent());
    Assert.assertEquals("任务执行完成", finished.getStatusMessage());

    ParseTaskInfo infoAfterRun = parseService.queryTaskInfo(queued.getTaskId());
    Assert.assertNotNull(infoAfterRun.getResult());
    Assert.assertTrue(infoAfterRun.getResult().getResults().get(0).getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
    Assert.assertEquals(infoAfterRun.getResult(), parseService.queryResult(queued.getTaskId()));
}
/**
 * Builds an in-memory workbook with a single sheet "Sheet1" holding a two-cell header row,
 * one data cell, and a PNG picture anchored at row index 1, column index 1.
 *
 * @return the serialized workbook bytes
 */
private byte[] buildWorkbookBytes() throws Exception {
    XSSFWorkbook book = new XSSFWorkbook();
    XSSFSheet firstSheet = book.createSheet("Sheet1");

    firstSheet.createRow(0).createCell(0).setCellValue("商品");
    firstSheet.getRow(0).createCell(1).setCellValue("图片");
    firstSheet.createRow(1).createCell(0).setCellValue("手机");

    byte[] pngBytes = createImageBytes("png");
    addPicture(book, firstSheet, 1, 1, pngBytes, XSSFWorkbook.PICTURE_TYPE_PNG);
    return writeWorkbook(book);
}
/**
 * Builds an in-memory workbook with two sheets that have non-ASCII names; each sheet has
 * one header cell and one PNG picture anchored at row index 1, column index 1.
 *
 * @return the serialized workbook bytes
 */
private byte[] buildWorkbookBytesWithUnicodeSheetNames() throws Exception {
    XSSFWorkbook book = new XSSFWorkbook();

    XSSFSheet details = book.createSheet("明细");
    details.createRow(0).createCell(0).setCellValue("图片");
    byte[] detailsPng = createImageBytes("png");
    addPicture(book, details, 1, 1, detailsPng, XSSFWorkbook.PICTURE_TYPE_PNG);

    XSSFSheet summary = book.createSheet("汇总");
    summary.createRow(0).createCell(0).setCellValue("图片");
    byte[] summaryPng = createImageBytes("png");
    addPicture(book, summary, 1, 1, summaryPng, XSSFWorkbook.PICTURE_TYPE_PNG);

    return writeWorkbook(book);
}
/**
 * Builds an in-memory workbook with a single sheet "Sheet1" holding one header cell and a
 * JPEG picture anchored at row index 1, column index 1.
 *
 * @return the serialized workbook bytes
 */
private byte[] buildWorkbookBytesWithJpegImage() throws Exception {
    XSSFWorkbook book = new XSSFWorkbook();
    XSSFSheet firstSheet = book.createSheet("Sheet1");
    firstSheet.createRow(0).createCell(0).setCellValue("图片");

    byte[] jpegBytes = createImageBytes("jpg");
    addPicture(book, firstSheet, 1, 1, jpegBytes, XSSFWorkbook.PICTURE_TYPE_JPEG);
    return writeWorkbook(book);
}
/**
 * Builds an in-memory workbook whose single sheet "Sheet1" has no rows or cells, only a
 * PNG picture anchored at row index 1, column index 1.
 *
 * @return the serialized workbook bytes
 */
private byte[] buildWorkbookBytesWithImageOnlySheet() throws Exception {
    XSSFWorkbook book = new XSSFWorkbook();
    XSSFSheet pictureOnlySheet = book.createSheet("Sheet1");

    byte[] pngBytes = createImageBytes("png");
    addPicture(book, pictureOnlySheet, 1, 1, pngBytes, XSSFWorkbook.PICTURE_TYPE_PNG);
    return writeWorkbook(book);
}
/**
 * Registers the image bytes with the workbook and places the picture on the sheet, anchored
 * from ({@code rowIndex}, {@code colIndex}) to ({@code rowIndex + 1}, {@code colIndex + 1}).
 *
 * @param workbook    workbook that receives the picture data
 * @param sheet       sheet on which the picture is drawn
 * @param rowIndex    first anchor row
 * @param colIndex    first anchor column
 * @param imageBytes  encoded image content
 * @param pictureType picture type constant passed to {@code workbook.addPicture}
 */
private void addPicture(XSSFWorkbook workbook,
                        XSSFSheet sheet,
                        int rowIndex,
                        int colIndex,
                        byte[] imageBytes,
                        int pictureType) {
    int registeredPictureId = workbook.addPicture(imageBytes, pictureType);
    XSSFDrawing patriarch = sheet.createDrawingPatriarch();

    ClientAnchor cellAnchor = workbook.getCreationHelper().createClientAnchor();
    // Top-left corner of the anchor.
    cellAnchor.setCol1(colIndex);
    cellAnchor.setRow1(rowIndex);
    // Bottom-right corner: one column and one row further.
    cellAnchor.setCol2(colIndex + 1);
    cellAnchor.setRow2(rowIndex + 1);

    patriarch.createPicture(cellAnchor, registeredPictureId);
}
/**
 * Serializes the workbook into a byte array and closes it.
 *
 * <p>The workbook is closed even when writing fails, so a failing write does not leave
 * it open.
 *
 * @param workbook the workbook to serialize; it is closed by this method
 * @return the serialized workbook bytes
 * @throws Exception if writing or closing the workbook fails
 */
private byte[] writeWorkbook(XSSFWorkbook workbook) throws Exception {
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    try {
        workbook.write(outputStream);
    } finally {
        // Always release the workbook, including on a failed write.
        workbook.close();
    }
    return outputStream.toByteArray();
}
/**
 * Encodes a blank 2x2 RGB image in the given ImageIO format.
 *
 * <p>{@code ImageIO.write} returns {@code false} instead of throwing when no writer exists
 * for the format, which would otherwise yield an empty byte array here. That case is
 * reported as an error.
 *
 * @param format informal ImageIO format name, e.g. {@code "png"} or {@code "jpg"}
 * @return the encoded image bytes
 * @throws IllegalStateException if no ImageIO writer is available for {@code format}
 * @throws Exception if encoding fails
 */
private byte[] createImageBytes(String format) throws Exception {
    BufferedImage image = new BufferedImage(2, 2, BufferedImage.TYPE_INT_RGB);
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    boolean written = ImageIO.write(image, format, outputStream);
    if (!written) {
        throw new IllegalStateException("No ImageIO writer available for format: " + format);
    }
    return outputStream.toByteArray();
}
/**
 * Creates a fresh {@link MineruProperties} whose base URL is a local address.
 *
 * @return a new properties instance on every call
 */
private MineruProperties defaultProperties() {
    MineruProperties localProperties = new MineruProperties();
    localProperties.setBaseUrl("http://127.0.0.1:8000");
    return localProperties;
}
/**
 * Creates an executor that runs each submitted task immediately on the calling thread.
 *
 * @return a new same-thread executor
 */
private Executor directExecutor() {
    Executor sameThreadExecutor = new Executor() {
        @Override
        public void execute(Runnable task) {
            task.run();
        }
    };
    return sameThreadExecutor;
}
/**
 * Fetches the entry stored under the {@code "xlsx"} key of the result's extra JSON artifacts,
 * asserting that the artifacts exist and that the entry is an {@link XlsxParseArtifact}.
 *
 * @param result the parse result to inspect
 * @return the XLSX artifact attached to the result
 */
private XlsxParseArtifact extractXlsxArtifact(ParseResult result) {
    ParseArtifacts parseArtifacts = result.getArtifacts();
    Assert.assertNotNull(parseArtifacts);

    Object xlsxEntry = parseArtifacts.getExtraJsonArtifacts().get("xlsx");
    Assert.assertTrue(xlsxEntry instanceof XlsxParseArtifact);
    return (XlsxParseArtifact) xlsxEntry;
}
/**
 * Test executor that only queues submitted tasks and runs them on demand, used to verify
 * asynchronous task state transitions.
 */
private static class ManualExecutor implements Executor {

    /** Submitted tasks in FIFO order, waiting for {@link #runNext()}. */
    private final Queue<Runnable> tasks = new ArrayDeque<Runnable>();

    /**
     * Queues the task without running it.
     */
    @Override
    public void execute(Runnable command) {
        tasks.add(command);
    }

    /**
     * Runs the oldest queued task on the calling thread; fails the test if none is queued.
     */
    private void runNext() {
        Runnable pending = tasks.poll();
        Assert.assertNotNull("应当存在待执行任务", pending);
        pending.run();
    }
}
/**
 * {@link MineruClient} stub whose {@code parse} ignores the request and returns a fixed
 * payload mapped through a fresh {@link MineruMapper}.
 */
private static class RecordingClient extends MineruClient {

    private RecordingClient(MineruProperties properties) {
        super(properties, new MineruMapper(properties));
    }

    /**
     * Returns the canned payload regardless of the request content.
     */
    @Override
    public MineruResultPayload parse(com.easyagents.document.core.entity.ParseRequest request) {
        MineruMapper payloadMapper = new MineruMapper(testProperties());
        JSONObject cannedPayload = syncPayload();
        return payloadMapper.toResultPayload(cannedPayload);
    }

    /**
     * Builds the fixed JSON payload: backend and version fields plus one result named
     * {@code image} that carries the Markdown content.
     */
    private JSONObject syncPayload() {
        JSONObject imageEntry = new JSONObject();
        imageEntry.put("md_content", "图片文字描述");

        JSONObject resultsByName = new JSONObject();
        resultsByName.put("image", imageEntry);

        JSONObject root = new JSONObject();
        root.put("backend", "vlm-http-client");
        root.put("version", "3.0.9");
        root.put("results", resultsByName);
        return root;
    }

    /**
     * Creates the properties used for mapping the canned payload.
     */
    private static MineruProperties testProperties() {
        MineruProperties localProperties = new MineruProperties();
        localProperties.setBaseUrl("http://127.0.0.1:8000");
        return localProperties;
    }
}
}