feat: 扩展 Office 文档解析能力
- 重构 document-core 与 MinerU 公共层,补齐 Office 异步任务基础设施
- 新增 PPTX/XLSX 解析模块与 starter 自动装配
- 补充 README 与相关测试覆盖
This commit is contained in:
44
easy-agents-document/easy-agents-document-xlsx/pom.xml
Normal file
44
easy-agents-document/easy-agents-document-xlsx/pom.xml
Normal file
@@ -0,0 +1,44 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document</artifactId>
|
||||
<version>${revision}</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>easy-agents-document-xlsx</artifactId>
|
||||
<name>easy-agents-document-xlsx</name>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.alibaba.fastjson2</groupId>
|
||||
<artifactId>fastjson2</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.easyagents.document.xlsx;
|
||||
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||
|
||||
/**
|
||||
* XLSX 文档解析服务。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public interface XlsxDocumentParseService extends DocumentParseService<XlsxParseRequest> {
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package com.easyagents.document.xlsx;
|
||||
|
||||
/**
|
||||
* XLSX provider SPI。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public interface XlsxDocumentProvider extends XlsxDocumentParseService {
|
||||
|
||||
/**
|
||||
* 获取 provider 标识。
|
||||
*
|
||||
* @return provider 名称
|
||||
*/
|
||||
String getProvider();
|
||||
}
|
||||
@@ -0,0 +1,625 @@
|
||||
package com.easyagents.document.xlsx.mineru;
|
||||
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskUpdater;
|
||||
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.mineru.MineruClient;
|
||||
import com.easyagents.document.core.mineru.MineruMapper;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.core.entity.DocumentImage;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||
import com.easyagents.document.core.support.AbstractAsyncDocumentParseService;
|
||||
import com.easyagents.document.xlsx.XlsxDocumentProvider;
|
||||
import com.easyagents.document.xlsx.model.XlsxCellArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxCellImageArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxParseArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxRowArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxSheetArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxSheetImagesArtifact;
|
||||
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||
import org.apache.poi.ss.usermodel.FormulaEvaluator;
|
||||
import org.apache.poi.ss.util.CellReference;
|
||||
import org.apache.poi.xssf.usermodel.XSSFClientAnchor;
|
||||
import org.apache.poi.xssf.usermodel.XSSFDrawing;
|
||||
import org.apache.poi.xssf.usermodel.XSSFPicture;
|
||||
import org.apache.poi.xssf.usermodel.XSSFPictureData;
|
||||
import org.apache.poi.xssf.usermodel.XSSFShape;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.net.URLConnection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
/**
|
||||
* XLSX 文档解析服务,OCR 由 mineru 提供支持
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseService<XlsxParseRequest> implements XlsxDocumentProvider {
|
||||
|
||||
public static final String PROVIDER_NAME = "mineru";
|
||||
|
||||
private final MineruProperties properties;
|
||||
private final MineruClient client;
|
||||
private final MineruMapper mapper;
|
||||
|
||||
/**
 * Creates a service from MinerU configuration alone.
 *
 * <p>Delegates to the {@code (properties, mapper)} constructor with a
 * {@link MineruMapper} built from the same properties.
 *
 * @param properties MinerU configuration
 */
public MineruXlsxDocumentParseService(MineruProperties properties) {
    this(properties, new MineruMapper(properties));
}
|
||||
|
||||
/**
 * Creates a service from MinerU configuration and an explicit mapper.
 *
 * <p>A new {@link MineruClient} is built from the arguments, and the task
 * manager is the one returned by {@code defaultTaskManager()}.
 *
 * @param properties MinerU configuration
 * @param mapper     MinerU mapper
 */
public MineruXlsxDocumentParseService(MineruProperties properties, MineruMapper mapper) {
    this(properties, new MineruClient(properties, mapper), mapper, defaultTaskManager());
}
|
||||
|
||||
/**
 * Creates a service from MinerU configuration and a caller-supplied task manager.
 *
 * <p>The {@link MineruMapper} is built from the given properties.
 *
 * @param properties  MinerU configuration
 * @param taskManager asynchronous task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties, DocumentAsyncTaskManager taskManager) {
    this(properties, new MineruMapper(properties), taskManager);
}
|
||||
|
||||
/**
 * Creates a service from MinerU configuration, a mapper and a task manager.
 *
 * <p>A new {@link MineruClient} is built from the properties and the mapper.
 *
 * @param properties  MinerU configuration
 * @param mapper      MinerU mapper
 * @param taskManager asynchronous task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties,
                                      MineruMapper mapper,
                                      DocumentAsyncTaskManager taskManager) {
    this(properties, new MineruClient(properties, mapper), mapper, taskManager);
}
|
||||
|
||||
/**
 * Canonical constructor: every collaborator is supplied by the caller.
 *
 * <p>The task manager is handed to the superclass; the remaining arguments
 * are stored as-is.
 *
 * @param properties  MinerU configuration
 * @param client      MinerU client
 * @param mapper      MinerU mapper
 * @param taskManager asynchronous task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties,
                                      MineruClient client,
                                      MineruMapper mapper,
                                      DocumentAsyncTaskManager taskManager) {
    super(taskManager);
    this.mapper = mapper;
    this.client = client;
    this.properties = properties;
}
|
||||
|
||||
@Override
|
||||
public String getProvider() {
|
||||
return PROVIDER_NAME;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected XlsxParseRequest normalizeRequest(ParseRequest request) {
|
||||
XlsxParseRequest normalized = XlsxParseRequest.from(request);
|
||||
if (normalized.getFiles() == null || normalized.getFiles().isEmpty()) {
|
||||
throw new IllegalArgumentException("XlsxParseRequest files must not be empty");
|
||||
}
|
||||
normalized.setBackend(StringUtil.hasText(normalized.getBackend()) ? normalized.getBackend() : properties.getDefaultBackend());
|
||||
if (normalized.getLanguages() == null || normalized.getLanguages().isEmpty()) {
|
||||
normalized.setLanguages(new ArrayList<String>(properties.getDefaultLangList()));
|
||||
}
|
||||
normalized.setReturnMarkdown(normalized.getReturnMarkdown() == null ? Boolean.TRUE : normalized.getReturnMarkdown());
|
||||
normalized.setReturnMiddleJson(normalized.getReturnMiddleJson() == null ? Boolean.FALSE : normalized.getReturnMiddleJson());
|
||||
normalized.setReturnContentList(normalized.getReturnContentList() == null ? Boolean.FALSE : normalized.getReturnContentList());
|
||||
normalized.setReturnModelOutput(normalized.getReturnModelOutput() == null ? Boolean.FALSE : normalized.getReturnModelOutput());
|
||||
normalized.setReturnImages(normalized.getReturnImages() == null ? Boolean.TRUE : normalized.getReturnImages());
|
||||
normalized.setIncludeHiddenSheets(normalized.getIncludeHiddenSheets() == null ? Boolean.FALSE : normalized.getIncludeHiddenSheets());
|
||||
normalized.setOcrEmbeddedImages(normalized.getOcrEmbeddedImages() == null ? Boolean.TRUE : normalized.getOcrEmbeddedImages());
|
||||
normalized.setIncludeImageAppendix(normalized.getIncludeImageAppendix() == null ? Boolean.TRUE : normalized.getIncludeImageAppendix());
|
||||
return normalized;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ParseResponse doParse(XlsxParseRequest request, DocumentAsyncTaskUpdater updater) {
|
||||
ParseResponse response = new ParseResponse();
|
||||
List<ParseResult> results = new ArrayList<ParseResult>();
|
||||
String backend = null;
|
||||
int processedFiles = 0;
|
||||
int totalFiles = request.getFiles().size();
|
||||
|
||||
for (ParseFile file : request.getFiles()) {
|
||||
updateProgress(updater, "extracting", processedFiles, totalFiles, "正在读取工作簿结构");
|
||||
ParseResult result = parseSingleWorkbook(file, request, updater);
|
||||
processedFiles++;
|
||||
if (backend == null) {
|
||||
backend = (String) result.getMetadata().get("ocrBackend");
|
||||
}
|
||||
result.getMetadata().remove("ocrBackend");
|
||||
results.add(result);
|
||||
}
|
||||
|
||||
updateProgress(updater, "assembling", processedFiles, totalFiles, "正在汇总 XLSX 解析结果");
|
||||
response.setBackend(StringUtil.hasText(backend) ? backend : request.getBackend());
|
||||
response.setResults(results);
|
||||
return response;
|
||||
}
|
||||
|
||||
private ParseResult parseSingleWorkbook(ParseFile file, XlsxParseRequest request, DocumentAsyncTaskUpdater updater) {
|
||||
ParseResult aggregate = new ParseResult();
|
||||
aggregate.setFileName(file.getFileName());
|
||||
XlsxParseArtifact artifact = new XlsxParseArtifact();
|
||||
artifact.setWorkbookName(file.getFileName());
|
||||
StringBuilder markdownBuilder = new StringBuilder();
|
||||
String backend = null;
|
||||
|
||||
try (XSSFWorkbook workbook = new XSSFWorkbook(new ByteArrayInputStream(file.getContent()))) {
|
||||
FormulaEvaluator evaluator = workbook.getCreationHelper().createFormulaEvaluator();
|
||||
DataFormatter formatter = new DataFormatter();
|
||||
List<Integer> sheetIndexes = resolveSheetIndexes(workbook, request);
|
||||
int processedSheets = 0;
|
||||
|
||||
for (Integer sheetIndex : sheetIndexes) {
|
||||
XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
|
||||
updateProgress(updater, "extracting", processedSheets, sheetIndexes.size(), "正在读取 Sheet " + sheet.getSheetName());
|
||||
SheetExtraction sheetExtraction = extractSheet(sheet, sheetIndex, formatter, evaluator, request, updater);
|
||||
artifact.getSheets().add(sheetExtraction.sheetArtifact);
|
||||
artifact.getCellImages().addAll(sheetExtraction.imageArtifacts);
|
||||
artifact.getSheetImages().add(sheetExtraction.sheetImagesArtifact);
|
||||
artifact.getMergedRanges().addAll(sheetExtraction.mergedRanges);
|
||||
aggregate.getImages().addAll(sheetExtraction.documentImages);
|
||||
if (markdownBuilder.length() > 0) {
|
||||
markdownBuilder.append("\n\n");
|
||||
}
|
||||
markdownBuilder.append(sheetExtraction.markdown);
|
||||
if (backend == null) {
|
||||
backend = sheetExtraction.ocrBackend;
|
||||
}
|
||||
processedSheets++;
|
||||
}
|
||||
} catch (Exception exception) {
|
||||
throw new IllegalStateException("Failed to parse XLSX file: " + file.getFileName(), exception);
|
||||
}
|
||||
|
||||
aggregate.setMarkdown(markdownBuilder.toString().trim());
|
||||
aggregate.setPlainText(aggregate.getMarkdown());
|
||||
aggregate.getArtifacts().getExtraJsonArtifacts().put("xlsx", artifact);
|
||||
aggregate.getMetadata().put("ocrBackend", backend);
|
||||
return aggregate;
|
||||
}
|
||||
|
||||
private SheetExtraction extractSheet(XSSFSheet sheet,
|
||||
int sheetIndex,
|
||||
DataFormatter formatter,
|
||||
FormulaEvaluator evaluator,
|
||||
XlsxParseRequest request,
|
||||
DocumentAsyncTaskUpdater updater) {
|
||||
SheetExtraction extraction = new SheetExtraction();
|
||||
extraction.sheetArtifact = new XlsxSheetArtifact();
|
||||
extraction.sheetArtifact.setSheetName(sheet.getSheetName());
|
||||
extraction.sheetArtifact.setSheetIndex(sheetIndex);
|
||||
extraction.sheetArtifact.setHidden(Boolean.valueOf(sheet.getWorkbook().isSheetHidden(sheetIndex)
|
||||
|| sheet.getWorkbook().isSheetVeryHidden(sheetIndex)));
|
||||
extraction.sheetImagesArtifact = new XlsxSheetImagesArtifact();
|
||||
extraction.sheetImagesArtifact.setSheetName(sheet.getSheetName());
|
||||
extraction.sheetImagesArtifact.setSheetIndex(sheetIndex);
|
||||
|
||||
Map<String, List<XlsxCellImageArtifact>> imagesByCell = new LinkedHashMap<String, List<XlsxCellImageArtifact>>();
|
||||
List<SheetImageExtraction> sheetImages = extractImages(sheet, sheetIndex, request, updater);
|
||||
List<XlsxCellImageArtifact> imageArtifacts = new ArrayList<XlsxCellImageArtifact>();
|
||||
for (SheetImageExtraction sheetImage : sheetImages) {
|
||||
XlsxCellImageArtifact imageArtifact = sheetImage.imageArtifact;
|
||||
imageArtifacts.add(imageArtifact);
|
||||
extraction.imageArtifacts.add(imageArtifact);
|
||||
extraction.sheetImagesArtifact.getReferenceKeys().add(imageArtifact.getReferenceKey());
|
||||
extraction.sheetImagesArtifact.getSourcePaths().add(imageArtifact.getSourcePath());
|
||||
String anchorCell = imageArtifact.getAnchorCell();
|
||||
List<XlsxCellImageArtifact> cellImages = imagesByCell.get(anchorCell);
|
||||
if (cellImages == null) {
|
||||
cellImages = new ArrayList<XlsxCellImageArtifact>();
|
||||
imagesByCell.put(anchorCell, cellImages);
|
||||
}
|
||||
cellImages.add(imageArtifact);
|
||||
extraction.documentImages.add(sheetImage.documentImage);
|
||||
}
|
||||
|
||||
int maxRow = resolveMaxRow(sheet, request.getMaxRowsPerSheet());
|
||||
int maxCol = resolveMaxCol(sheet, maxRow, imagesByCell);
|
||||
extraction.sheetArtifact.setRowCount(maxRow + 1);
|
||||
extraction.sheetArtifact.setColumnCount(maxCol);
|
||||
appendSheetHeader(extraction.markdown, sheet.getSheetName());
|
||||
|
||||
if (maxRow < 0 || maxCol <= 0) {
|
||||
extraction.markdown.append("_empty sheet_");
|
||||
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
|
||||
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
|
||||
}
|
||||
return extraction;
|
||||
}
|
||||
|
||||
List<List<String>> markdownRows = new ArrayList<List<String>>();
|
||||
for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) {
|
||||
org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex);
|
||||
XlsxRowArtifact rowArtifact = new XlsxRowArtifact();
|
||||
rowArtifact.setRowIndex(rowIndex);
|
||||
List<String> rowValues = new ArrayList<String>();
|
||||
for (int colIndex = 0; colIndex < maxCol; colIndex++) {
|
||||
String cellRef = new CellReference(rowIndex, colIndex).formatAsString();
|
||||
String cellText = readCellText(row, colIndex, formatter, evaluator);
|
||||
List<XlsxCellImageArtifact> cellImages = imagesByCell.get(cellRef);
|
||||
String displayValue = mergeDisplayValue(cellText, cellImages);
|
||||
rowValues.add(escapeMarkdown(displayValue));
|
||||
|
||||
XlsxCellArtifact cellArtifact = new XlsxCellArtifact();
|
||||
cellArtifact.setRowIndex(rowIndex);
|
||||
cellArtifact.setColumnIndex(colIndex);
|
||||
cellArtifact.setCellRef(cellRef);
|
||||
cellArtifact.setText(cellText);
|
||||
if (cellImages != null) {
|
||||
List<String> imageKeys = new ArrayList<String>();
|
||||
for (XlsxCellImageArtifact cellImage : cellImages) {
|
||||
imageKeys.add(cellImage.getReferenceKey());
|
||||
}
|
||||
cellArtifact.setImageKeys(imageKeys);
|
||||
}
|
||||
rowArtifact.getCells().add(cellArtifact);
|
||||
}
|
||||
extraction.sheetArtifact.getRows().add(rowArtifact);
|
||||
markdownRows.add(rowValues);
|
||||
}
|
||||
|
||||
appendMarkdownTable(extraction.markdown, markdownRows);
|
||||
extraction.mergedRanges.addAll(extractMergedRanges(sheet));
|
||||
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
|
||||
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
|
||||
}
|
||||
return extraction;
|
||||
}
|
||||
|
||||
private List<SheetImageExtraction> extractImages(XSSFSheet sheet,
|
||||
int sheetIndex,
|
||||
XlsxParseRequest request,
|
||||
DocumentAsyncTaskUpdater updater) {
|
||||
List<SheetImageExtraction> images = new ArrayList<SheetImageExtraction>();
|
||||
XSSFDrawing drawing = sheet.getDrawingPatriarch();
|
||||
if (drawing == null) {
|
||||
return images;
|
||||
}
|
||||
String sheetKey = buildSheetKey(sheet.getSheetName(), sheetIndex);
|
||||
int imageIndex = 0;
|
||||
for (XSSFShape shape : drawing.getShapes()) {
|
||||
if (!(shape instanceof XSSFPicture)) {
|
||||
continue;
|
||||
}
|
||||
imageIndex++;
|
||||
XSSFPicture picture = (XSSFPicture) shape;
|
||||
XSSFClientAnchor anchor = picture.getPreferredSize();
|
||||
if (anchor == null) {
|
||||
continue;
|
||||
}
|
||||
XSSFPictureData pictureData = picture.getPictureData();
|
||||
String extension = pictureData == null || !StringUtil.hasText(pictureData.suggestFileExtension())
|
||||
? "png"
|
||||
: pictureData.suggestFileExtension();
|
||||
String imageName = buildImageName(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex);
|
||||
String sourcePath = buildImageSourcePath(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex, extension);
|
||||
|
||||
XlsxCellImageArtifact imageArtifact = new XlsxCellImageArtifact();
|
||||
imageArtifact.setSheetName(sheet.getSheetName());
|
||||
imageArtifact.setAnchorCell(new CellReference(anchor.getRow1(), anchor.getCol1()).formatAsString());
|
||||
imageArtifact.setFromRow(anchor.getRow1());
|
||||
imageArtifact.setFromCol((int) anchor.getCol1());
|
||||
imageArtifact.setToRow(anchor.getRow2());
|
||||
imageArtifact.setToCol((int) anchor.getCol2());
|
||||
imageArtifact.setName(imageName);
|
||||
imageArtifact.setReferenceKey(imageName);
|
||||
imageArtifact.setSourcePath(sourcePath);
|
||||
if (Boolean.TRUE.equals(request.getOcrEmbeddedImages()) && pictureData != null) {
|
||||
updateProgress(updater, "ocr", imageIndex - 1, drawing.getShapes().size(), "正在识别 Sheet " + sheet.getSheetName() + " 中的图片");
|
||||
imageArtifact.setOcrText(parseImageOcr(pictureData.getData(), extension, request, imageName));
|
||||
}
|
||||
DocumentImage documentImage = new DocumentImage();
|
||||
documentImage.setName(imageName);
|
||||
documentImage.setSourcePath(sourcePath);
|
||||
documentImage.setMimeType(detectImageMimeType(sourcePath));
|
||||
documentImage.setContent(pictureData == null ? null : pictureData.getData());
|
||||
|
||||
SheetImageExtraction sheetImage = new SheetImageExtraction();
|
||||
sheetImage.imageArtifact = imageArtifact;
|
||||
sheetImage.documentImage = documentImage;
|
||||
images.add(sheetImage);
|
||||
}
|
||||
return images;
|
||||
}
|
||||
|
||||
private String parseImageOcr(byte[] imageBytes, String extension, XlsxParseRequest request, String imageName) {
|
||||
ParseRequest imageRequest = new ParseRequest();
|
||||
imageRequest.addFile(ParseFile.of(imageName + "." + extension, imageBytes, "image/" + extension));
|
||||
imageRequest.setBackend(request.getBackend());
|
||||
imageRequest.setLanguages(request.getLanguages());
|
||||
imageRequest.setReturnMarkdown(true);
|
||||
imageRequest.setReturnMiddleJson(false);
|
||||
imageRequest.setReturnContentList(false);
|
||||
imageRequest.setReturnModelOutput(false);
|
||||
imageRequest.setReturnImages(false);
|
||||
ParseResponse response = mapper.toParseResponse(client.parse(imageRequest));
|
||||
if (response.getResults().isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
ParseResult result = response.getResults().get(0);
|
||||
return StringUtil.hasText(result.getMarkdown()) ? result.getMarkdown() : result.getPlainText();
|
||||
}
|
||||
|
||||
private List<Integer> resolveSheetIndexes(XSSFWorkbook workbook, XlsxParseRequest request) {
|
||||
List<Integer> indexes = new ArrayList<Integer>();
|
||||
for (int index = 0; index < workbook.getNumberOfSheets(); index++) {
|
||||
String sheetName = workbook.getSheetName(index);
|
||||
if (!Boolean.TRUE.equals(request.getIncludeHiddenSheets())
|
||||
&& (workbook.isSheetHidden(index) || workbook.isSheetVeryHidden(index))) {
|
||||
continue;
|
||||
}
|
||||
if (request.getSheetNames() != null && !request.getSheetNames().isEmpty()
|
||||
&& !request.getSheetNames().contains(sheetName)) {
|
||||
continue;
|
||||
}
|
||||
indexes.add(index);
|
||||
}
|
||||
return indexes;
|
||||
}
|
||||
|
||||
private int resolveMaxRow(XSSFSheet sheet, Integer maxRowsPerSheet) {
|
||||
int lastRow = sheet.getLastRowNum();
|
||||
if (lastRow < 0) {
|
||||
return -1;
|
||||
}
|
||||
if (maxRowsPerSheet == null || maxRowsPerSheet <= 0) {
|
||||
return lastRow;
|
||||
}
|
||||
return Math.min(lastRow, maxRowsPerSheet - 1);
|
||||
}
|
||||
|
||||
private int resolveMaxCol(XSSFSheet sheet, int maxRow, Map<String, List<XlsxCellImageArtifact>> imagesByCell) {
|
||||
int maxCol = 0;
|
||||
for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) {
|
||||
org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex);
|
||||
if (row != null && row.getLastCellNum() > maxCol) {
|
||||
maxCol = row.getLastCellNum();
|
||||
}
|
||||
}
|
||||
for (String cellRef : imagesByCell.keySet()) {
|
||||
CellReference reference = new CellReference(cellRef);
|
||||
if (reference.getCol() + 1 > maxCol) {
|
||||
maxCol = reference.getCol() + 1;
|
||||
}
|
||||
}
|
||||
return maxCol;
|
||||
}
|
||||
|
||||
private String readCellText(org.apache.poi.ss.usermodel.Row row, int colIndex, DataFormatter formatter, FormulaEvaluator evaluator) {
|
||||
if (row == null) {
|
||||
return "";
|
||||
}
|
||||
org.apache.poi.ss.usermodel.Cell cell = row.getCell(colIndex);
|
||||
if (cell == null) {
|
||||
return "";
|
||||
}
|
||||
return formatter.formatCellValue(cell, evaluator);
|
||||
}
|
||||
|
||||
private String mergeDisplayValue(String cellText, List<XlsxCellImageArtifact> cellImages) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
if (StringUtil.hasText(cellText)) {
|
||||
builder.append(cellText.trim());
|
||||
}
|
||||
if (cellImages != null && !cellImages.isEmpty()) {
|
||||
for (XlsxCellImageArtifact cellImage : cellImages) {
|
||||
if (builder.length() > 0) {
|
||||
builder.append('\n');
|
||||
}
|
||||
builder.append("[IMG:").append(cellImage.getReferenceKey()).append(']');
|
||||
}
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private void appendSheetHeader(StringBuilder markdownBuilder, String sheetName) {
|
||||
markdownBuilder.append("# ").append(sheetName).append("\n\n");
|
||||
}
|
||||
|
||||
private void appendMarkdownTable(StringBuilder markdownBuilder, List<List<String>> rows) {
|
||||
if (rows.isEmpty()) {
|
||||
markdownBuilder.append("_empty sheet_");
|
||||
return;
|
||||
}
|
||||
List<String> header = rows.get(0);
|
||||
markdownBuilder.append("| ").append(joinCells(header)).append(" |\n");
|
||||
markdownBuilder.append("|");
|
||||
for (int index = 0; index < header.size(); index++) {
|
||||
markdownBuilder.append(" --- |");
|
||||
}
|
||||
markdownBuilder.append("\n");
|
||||
for (int rowIndex = 1; rowIndex < rows.size(); rowIndex++) {
|
||||
markdownBuilder.append("| ").append(joinCells(rows.get(rowIndex))).append(" |\n");
|
||||
}
|
||||
}
|
||||
|
||||
private void appendImageAppendix(StringBuilder markdownBuilder,
|
||||
String sheetName,
|
||||
List<XlsxCellImageArtifact> imageArtifacts) {
|
||||
markdownBuilder.append("\n## ").append(sheetName).append(" 图片说明\n\n");
|
||||
for (XlsxCellImageArtifact imageArtifact : imageArtifacts) {
|
||||
markdownBuilder.append("
|
||||
.append(imageArtifact.getSourcePath())
|
||||
.append(")\n\n");
|
||||
markdownBuilder.append("- 占位符:[IMG:")
|
||||
.append(imageArtifact.getReferenceKey())
|
||||
.append("]\n");
|
||||
markdownBuilder.append("- 锚点:")
|
||||
.append(imageArtifact.getAnchorCell())
|
||||
.append("\n");
|
||||
markdownBuilder.append("- OCR:")
|
||||
.append(StringUtil.hasText(imageArtifact.getOcrText()) ? imageArtifact.getOcrText() : "")
|
||||
.append("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> extractMergedRanges(XSSFSheet sheet) {
|
||||
List<String> mergedRanges = new ArrayList<String>();
|
||||
for (int index = 0; index < sheet.getNumMergedRegions(); index++) {
|
||||
mergedRanges.add(sheet.getMergedRegion(index).formatAsString());
|
||||
}
|
||||
return mergedRanges;
|
||||
}
|
||||
|
||||
private String joinCells(List<String> cells) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int index = 0; index < cells.size(); index++) {
|
||||
if (index > 0) {
|
||||
builder.append(" | ");
|
||||
}
|
||||
builder.append(cells.get(index));
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private String escapeMarkdown(String text) {
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return "";
|
||||
}
|
||||
return text.replace("|", "\\|").replace("\r", " ").replace("\n", "<br/>");
|
||||
}
|
||||
|
||||
private String buildImageName(String sheetKey, int rowIndex, int colIndex, int imageIndex) {
|
||||
return sheetKey + "-r" + (rowIndex + 1) + "c" + (colIndex + 1) + "-" + formatIndex(imageIndex);
|
||||
}
|
||||
|
||||
private String buildImageSourcePath(String sheetKey, int rowIndex, int colIndex, int imageIndex, String extension) {
|
||||
return "images/" + sheetKey + "/r" + (rowIndex + 1) + "c" + (colIndex + 1) + "-" + formatIndex(imageIndex) + "." + extension;
|
||||
}
|
||||
|
||||
private String buildSheetKey(String sheetName, int sheetIndex) {
|
||||
if (!StringUtil.hasText(sheetName)) {
|
||||
return "sheet-" + formatIndex(sheetIndex + 1);
|
||||
}
|
||||
String lowerCaseName = sheetName.toLowerCase(Locale.ROOT);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int index = 0; index < lowerCaseName.length(); index++) {
|
||||
char character = lowerCaseName.charAt(index);
|
||||
if ((character >= 'a' && character <= 'z') || (character >= '0' && character <= '9')) {
|
||||
builder.append(character);
|
||||
continue;
|
||||
}
|
||||
if (builder.length() > 0 && builder.charAt(builder.length() - 1) != '-') {
|
||||
builder.append('-');
|
||||
}
|
||||
builder.append('u').append(String.format(Locale.ROOT, "%04x", (int) character)).append('-');
|
||||
}
|
||||
String normalized = builder.toString();
|
||||
while (normalized.startsWith("-")) {
|
||||
normalized = normalized.substring(1);
|
||||
}
|
||||
while (normalized.endsWith("-")) {
|
||||
normalized = normalized.substring(0, normalized.length() - 1);
|
||||
}
|
||||
return StringUtil.hasText(normalized) ? normalized : "sheet-" + formatIndex(sheetIndex + 1);
|
||||
}
|
||||
|
||||
private String formatIndex(int index) {
|
||||
int displayIndex = index <= 0 ? 1 : index;
|
||||
if (displayIndex < 10) {
|
||||
return "00" + displayIndex;
|
||||
}
|
||||
if (displayIndex < 100) {
|
||||
return "0" + displayIndex;
|
||||
}
|
||||
return String.valueOf(displayIndex);
|
||||
}
|
||||
|
||||
private String detectImageMimeType(String path) {
|
||||
if (!StringUtil.hasText(path)) {
|
||||
return "application/octet-stream";
|
||||
}
|
||||
String mimeType = URLConnection.guessContentTypeFromName(path);
|
||||
if (StringUtil.hasText(mimeType)) {
|
||||
return mimeType;
|
||||
}
|
||||
String lowerCasePath = path.toLowerCase(Locale.ROOT);
|
||||
if (lowerCasePath.endsWith(".jpg") || lowerCasePath.endsWith(".jpeg")) {
|
||||
return "image/jpeg";
|
||||
}
|
||||
if (lowerCasePath.endsWith(".png")) {
|
||||
return "image/png";
|
||||
}
|
||||
if (lowerCasePath.endsWith(".gif")) {
|
||||
return "image/gif";
|
||||
}
|
||||
if (lowerCasePath.endsWith(".bmp")) {
|
||||
return "image/bmp";
|
||||
}
|
||||
if (lowerCasePath.endsWith(".webp")) {
|
||||
return "image/webp";
|
||||
}
|
||||
return "application/octet-stream";
|
||||
}
|
||||
|
||||
private void updateProgress(DocumentAsyncTaskUpdater updater,
|
||||
String stage,
|
||||
int processedItems,
|
||||
int totalItems,
|
||||
String message) {
|
||||
if (updater == null) {
|
||||
return;
|
||||
}
|
||||
int safeTotal = totalItems <= 0 ? 1 : totalItems;
|
||||
int percent = (int) Math.min(99, Math.round(processedItems * 100.0d / safeTotal));
|
||||
updater.update(stage, percent, processedItems, totalItems, message);
|
||||
}
|
||||
|
||||
private static DocumentAsyncTaskManager defaultTaskManager() {
|
||||
DocumentAsyncTaskRepository repository = new InMemoryDocumentAsyncTaskRepository();
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(2);
|
||||
return new DocumentAsyncTaskManager(repository, executorService);
|
||||
}
|
||||
|
||||
private static class SheetExtraction {
|
||||
|
||||
private final StringBuilder markdown = new StringBuilder();
|
||||
private final List<XlsxCellImageArtifact> imageArtifacts = new ArrayList<XlsxCellImageArtifact>();
|
||||
private final List<DocumentImage> documentImages = new ArrayList<DocumentImage>();
|
||||
private final List<String> mergedRanges = new ArrayList<String>();
|
||||
private XlsxSheetArtifact sheetArtifact;
|
||||
private XlsxSheetImagesArtifact sheetImagesArtifact;
|
||||
private String ocrBackend;
|
||||
}
|
||||
|
||||
private static class SheetImageExtraction {
|
||||
|
||||
private XlsxCellImageArtifact imageArtifact;
|
||||
private DocumentImage documentImage;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Artifact describing a single worksheet cell: its position, its reference,
 * its text and the keys of any images attached to it.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxCellArtifact {

    /** Row index of the cell. */
    private Integer rowIndex;

    /** Column index of the cell. */
    private Integer columnIndex;

    /** Cell reference string. */
    private String cellRef;

    /** Text content of the cell. */
    private String text;

    /** Keys of the images attached to the cell; never {@code null}. */
    private List<String> imageKeys = new ArrayList<String>();

    /** @return the row index */
    public Integer getRowIndex() {
        return this.rowIndex;
    }

    /** @param rowIndex the row index */
    public void setRowIndex(Integer rowIndex) {
        this.rowIndex = rowIndex;
    }

    /** @return the column index */
    public Integer getColumnIndex() {
        return this.columnIndex;
    }

    /** @param columnIndex the column index */
    public void setColumnIndex(Integer columnIndex) {
        this.columnIndex = columnIndex;
    }

    /** @return the cell reference */
    public String getCellRef() {
        return this.cellRef;
    }

    /** @param cellRef the cell reference */
    public void setCellRef(String cellRef) {
        this.cellRef = cellRef;
    }

    /** @return the cell text */
    public String getText() {
        return this.text;
    }

    /** @param text the cell text */
    public void setText(String text) {
        this.text = text;
    }

    /** @return the image keys; never {@code null} */
    public List<String> getImageKeys() {
        return this.imageKeys;
    }

    /**
     * Replaces the image keys.
     *
     * @param imageKeys the new keys; {@code null} is stored as a new empty list
     */
    public void setImageKeys(List<String> imageKeys) {
        if (imageKeys == null) {
            this.imageKeys = new ArrayList<String>();
        } else {
            this.imageKeys = imageKeys;
        }
    }
}
|
||||
@@ -0,0 +1,101 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
/**
 * Artifact describing one image placed on a worksheet: the sheet it belongs to,
 * its anchor cell and row/column span, its name and reference key, its source
 * path and any OCR text.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxCellImageArtifact {

    /** Name of the sheet holding the image. */
    private String sheetName;

    /** Reference of the cell the image is anchored to. */
    private String anchorCell;

    /** First row of the image span. */
    private Integer fromRow;

    /** First column of the image span. */
    private Integer fromCol;

    /** Last row of the image span. */
    private Integer toRow;

    /** Last column of the image span. */
    private Integer toCol;

    /** Image name. */
    private String name;

    /** Key used to reference the image. */
    private String referenceKey;

    /** Path under which the image is exposed. */
    private String sourcePath;

    /** Text recognised in the image, if any. */
    private String ocrText;

    /** @return the sheet name */
    public String getSheetName() {
        return this.sheetName;
    }

    /** @param sheetName the sheet name */
    public void setSheetName(String sheetName) {
        this.sheetName = sheetName;
    }

    /** @return the anchor cell reference */
    public String getAnchorCell() {
        return this.anchorCell;
    }

    /** @param anchorCell the anchor cell reference */
    public void setAnchorCell(String anchorCell) {
        this.anchorCell = anchorCell;
    }

    /** @return the first row of the span */
    public Integer getFromRow() {
        return this.fromRow;
    }

    /** @param fromRow the first row of the span */
    public void setFromRow(Integer fromRow) {
        this.fromRow = fromRow;
    }

    /** @return the first column of the span */
    public Integer getFromCol() {
        return this.fromCol;
    }

    /** @param fromCol the first column of the span */
    public void setFromCol(Integer fromCol) {
        this.fromCol = fromCol;
    }

    /** @return the last row of the span */
    public Integer getToRow() {
        return this.toRow;
    }

    /** @param toRow the last row of the span */
    public void setToRow(Integer toRow) {
        this.toRow = toRow;
    }

    /** @return the last column of the span */
    public Integer getToCol() {
        return this.toCol;
    }

    /** @param toCol the last column of the span */
    public void setToCol(Integer toCol) {
        this.toCol = toCol;
    }

    /** @return the image name */
    public String getName() {
        return this.name;
    }

    /** @param name the image name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the reference key */
    public String getReferenceKey() {
        return this.referenceKey;
    }

    /** @param referenceKey the reference key */
    public void setReferenceKey(String referenceKey) {
        this.referenceKey = referenceKey;
    }

    /** @return the source path */
    public String getSourcePath() {
        return this.sourcePath;
    }

    /** @param sourcePath the source path */
    public void setSourcePath(String sourcePath) {
        this.sourcePath = sourcePath;
    }

    /** @return the OCR text, or {@code null} when none was set */
    public String getOcrText() {
        return this.ocrText;
    }

    /** @param ocrText the OCR text */
    public void setOcrText(String ocrText) {
        this.ocrText = ocrText;
    }
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* XLSX 结构化工件。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class XlsxParseArtifact {
|
||||
|
||||
private String workbookName;
|
||||
private List<XlsxSheetArtifact> sheets = new ArrayList<XlsxSheetArtifact>();
|
||||
private List<XlsxSheetImagesArtifact> sheetImages = new ArrayList<XlsxSheetImagesArtifact>();
|
||||
private List<String> mergedRanges = new ArrayList<String>();
|
||||
private List<XlsxCellImageArtifact> cellImages = new ArrayList<XlsxCellImageArtifact>();
|
||||
|
||||
public String getWorkbookName() {
|
||||
return workbookName;
|
||||
}
|
||||
|
||||
public void setWorkbookName(String workbookName) {
|
||||
this.workbookName = workbookName;
|
||||
}
|
||||
|
||||
public List<XlsxSheetArtifact> getSheets() {
|
||||
return sheets;
|
||||
}
|
||||
|
||||
public void setSheets(List<XlsxSheetArtifact> sheets) {
|
||||
this.sheets = sheets == null ? new ArrayList<XlsxSheetArtifact>() : sheets;
|
||||
}
|
||||
|
||||
public List<XlsxSheetImagesArtifact> getSheetImages() {
|
||||
return sheetImages;
|
||||
}
|
||||
|
||||
public void setSheetImages(List<XlsxSheetImagesArtifact> sheetImages) {
|
||||
this.sheetImages = sheetImages == null ? new ArrayList<XlsxSheetImagesArtifact>() : sheetImages;
|
||||
}
|
||||
|
||||
public List<String> getMergedRanges() {
|
||||
return mergedRanges;
|
||||
}
|
||||
|
||||
public void setMergedRanges(List<String> mergedRanges) {
|
||||
this.mergedRanges = mergedRanges == null ? new ArrayList<String>() : mergedRanges;
|
||||
}
|
||||
|
||||
public List<XlsxCellImageArtifact> getCellImages() {
|
||||
return cellImages;
|
||||
}
|
||||
|
||||
public void setCellImages(List<XlsxCellImageArtifact> cellImages) {
|
||||
this.cellImages = cellImages == null ? new ArrayList<XlsxCellImageArtifact>() : cellImages;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 行工件。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class XlsxRowArtifact {
|
||||
|
||||
private Integer rowIndex;
|
||||
private List<XlsxCellArtifact> cells = new ArrayList<XlsxCellArtifact>();
|
||||
|
||||
public Integer getRowIndex() {
|
||||
return rowIndex;
|
||||
}
|
||||
|
||||
public void setRowIndex(Integer rowIndex) {
|
||||
this.rowIndex = rowIndex;
|
||||
}
|
||||
|
||||
public List<XlsxCellArtifact> getCells() {
|
||||
return cells;
|
||||
}
|
||||
|
||||
public void setCells(List<XlsxCellArtifact> cells) {
|
||||
this.cells = cells == null ? new ArrayList<XlsxCellArtifact>() : cells;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Sheet 工件。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class XlsxSheetArtifact {
|
||||
|
||||
private String sheetName;
|
||||
private Integer sheetIndex;
|
||||
private Boolean hidden;
|
||||
private Integer rowCount;
|
||||
private Integer columnCount;
|
||||
private List<XlsxRowArtifact> rows = new ArrayList<XlsxRowArtifact>();
|
||||
|
||||
public String getSheetName() {
|
||||
return sheetName;
|
||||
}
|
||||
|
||||
public void setSheetName(String sheetName) {
|
||||
this.sheetName = sheetName;
|
||||
}
|
||||
|
||||
public Integer getSheetIndex() {
|
||||
return sheetIndex;
|
||||
}
|
||||
|
||||
public void setSheetIndex(Integer sheetIndex) {
|
||||
this.sheetIndex = sheetIndex;
|
||||
}
|
||||
|
||||
public Boolean getHidden() {
|
||||
return hidden;
|
||||
}
|
||||
|
||||
public void setHidden(Boolean hidden) {
|
||||
this.hidden = hidden;
|
||||
}
|
||||
|
||||
public Integer getRowCount() {
|
||||
return rowCount;
|
||||
}
|
||||
|
||||
public void setRowCount(Integer rowCount) {
|
||||
this.rowCount = rowCount;
|
||||
}
|
||||
|
||||
public Integer getColumnCount() {
|
||||
return columnCount;
|
||||
}
|
||||
|
||||
public void setColumnCount(Integer columnCount) {
|
||||
this.columnCount = columnCount;
|
||||
}
|
||||
|
||||
public List<XlsxRowArtifact> getRows() {
|
||||
return rows;
|
||||
}
|
||||
|
||||
public void setRows(List<XlsxRowArtifact> rows) {
|
||||
this.rows = rows == null ? new ArrayList<XlsxRowArtifact>() : rows;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Sheet-level image index artifact: the reference keys and source paths
 * of the images recorded for one sheet.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxSheetImagesArtifact {

    /** Name of the sheet. */
    private String sheetName;

    /** Index of the sheet. */
    private Integer sheetIndex;

    /** Image reference keys; never {@code null}. */
    private List<String> referenceKeys = new ArrayList<>();

    /** Image source paths; never {@code null}. */
    private List<String> sourcePaths = new ArrayList<>();

    /** @return the sheet name, or {@code null} when not set */
    public String getSheetName() {
        return this.sheetName;
    }

    /** @param sheetName the sheet name to store */
    public void setSheetName(String sheetName) {
        this.sheetName = sheetName;
    }

    /** @return the sheet index, or {@code null} when not set */
    public Integer getSheetIndex() {
        return this.sheetIndex;
    }

    /** @param sheetIndex the sheet index to store */
    public void setSheetIndex(Integer sheetIndex) {
        this.sheetIndex = sheetIndex;
    }

    /** @return the live list of reference keys (not a copy) */
    public List<String> getReferenceKeys() {
        return this.referenceKeys;
    }

    /**
     * Stores the given list instance as-is; {@code null} is replaced by a new empty list.
     *
     * @param referenceKeys the reference keys, may be {@code null}
     */
    public void setReferenceKeys(List<String> referenceKeys) {
        if (referenceKeys == null) {
            this.referenceKeys = new ArrayList<>();
            return;
        }
        this.referenceKeys = referenceKeys;
    }

    /** @return the live list of source paths (not a copy) */
    public List<String> getSourcePaths() {
        return this.sourcePaths;
    }

    /**
     * Stores the given list instance as-is; {@code null} is replaced by a new empty list.
     *
     * @param sourcePaths the source paths, may be {@code null}
     */
    public void setSourcePaths(List<String> sourcePaths) {
        if (sourcePaths == null) {
            this.sourcePaths = new ArrayList<>();
            return;
        }
        this.sourcePaths = sourcePaths;
    }
}
|
||||
@@ -0,0 +1,333 @@
|
||||
package com.easyagents.document.xlsx.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.mineru.MineruClient;
|
||||
import com.easyagents.document.core.mineru.MineruMapper;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.core.mineru.MineruResultPayload;
|
||||
import com.easyagents.document.core.entity.ParseArtifacts;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.xlsx.model.XlsxParseArtifact;
|
||||
import org.apache.poi.ss.usermodel.ClientAnchor;
|
||||
import org.apache.poi.xssf.usermodel.XSSFDrawing;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.Executor;
|
||||
|
||||
/**
|
||||
* XLSX MinerU 服务测试。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruXlsxDocumentParseServiceTest {
|
||||
|
||||
@Test
|
||||
public void shouldBuildMarkdownAndImageArtifacts() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("demo.xlsx", buildWorkbookBytes()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
|
||||
Assert.assertEquals(1, response.getResults().size());
|
||||
ParseResult result = response.getResults().get(0);
|
||||
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("images/sheet1/r2c2-001.png"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("图片文字描述"));
|
||||
Assert.assertEquals(1, result.getImages().size());
|
||||
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||
|
||||
XlsxParseArtifact artifact = extractXlsxArtifact(result);
|
||||
Assert.assertEquals("demo.xlsx", artifact.getWorkbookName());
|
||||
Assert.assertEquals(1, artifact.getSheets().size());
|
||||
Assert.assertEquals(1, artifact.getSheetImages().size());
|
||||
Assert.assertEquals(1, artifact.getCellImages().size());
|
||||
Assert.assertEquals("sheet1-r2c2-001", artifact.getCellImages().get(0).getReferenceKey());
|
||||
Assert.assertEquals("images/sheet1/r2c2-001.png", artifact.getCellImages().get(0).getSourcePath());
|
||||
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
|
||||
Assert.assertEquals("images/sheet1/r2c2-001.png", artifact.getSheetImages().get(0).getSourcePaths().get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldKeepImageKeysUniqueForNonAsciiSheetNames() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("unicode-sheets.xlsx", buildWorkbookBytesWithUnicodeSheetNames()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
ParseResult result = response.getResults().get(0);
|
||||
|
||||
Assert.assertEquals(2, result.getImages().size());
|
||||
Assert.assertNotEquals(result.getImages().get(0).getName(), result.getImages().get(1).getName());
|
||||
Assert.assertNotEquals(result.getImages().get(0).getSourcePath(), result.getImages().get(1).getSourcePath());
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:" + result.getImages().get(0).getName() + "]"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:" + result.getImages().get(1).getName() + "]"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldDetectJpegMimeType() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("jpeg.xlsx", buildWorkbookBytesWithJpegImage()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
ParseResult result = response.getResults().get(0);
|
||||
|
||||
Assert.assertEquals(1, result.getImages().size());
|
||||
Assert.assertEquals("image/jpeg", result.getImages().get(0).getMimeType());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldAppendImageReferenceForImageOnlySheet() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
ParseResult result = response.getResults().get(0);
|
||||
XlsxParseArtifact artifact = extractXlsxArtifact(result);
|
||||
|
||||
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("_empty sheet_"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明"));
|
||||
Assert.assertTrue(result.getMarkdown().contains(""));
|
||||
Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]"));
|
||||
Assert.assertEquals(1, result.getImages().size());
|
||||
Assert.assertEquals(1, artifact.getSheetImages().size());
|
||||
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
ManualExecutor executor = new ManualExecutor();
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executor)
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("async.xlsx", buildWorkbookBytes()));
|
||||
|
||||
ParseTaskStatus submitted = service.submit(request);
|
||||
Assert.assertEquals("queued", submitted.getStatus());
|
||||
Assert.assertEquals("queued", submitted.getCurrentStage());
|
||||
Assert.assertEquals(Integer.valueOf(0), submitted.getProgressPercent());
|
||||
|
||||
ParseTaskInfo queuedInfo = service.queryTaskInfo(submitted.getTaskId());
|
||||
Assert.assertNull(queuedInfo.getResult());
|
||||
try {
|
||||
service.queryResult(submitted.getTaskId());
|
||||
Assert.fail("任务未完成时应抛出异常");
|
||||
} catch (DocumentParseException expected) {
|
||||
Assert.assertTrue(expected.getMessage().contains(submitted.getTaskId()));
|
||||
}
|
||||
|
||||
executor.runNext();
|
||||
|
||||
ParseTaskStatus completed = service.queryTask(submitted.getTaskId());
|
||||
Assert.assertEquals("completed", completed.getStatus());
|
||||
Assert.assertEquals("completed", completed.getCurrentStage());
|
||||
Assert.assertEquals(Integer.valueOf(100), completed.getProgressPercent());
|
||||
Assert.assertEquals("任务执行完成", completed.getStatusMessage());
|
||||
|
||||
ParseTaskInfo completedInfo = service.queryTaskInfo(submitted.getTaskId());
|
||||
Assert.assertNotNull(completedInfo.getResult());
|
||||
Assert.assertTrue(completedInfo.getResult().getResults().get(0).getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||
Assert.assertEquals(completedInfo.getResult(), service.queryResult(submitted.getTaskId()));
|
||||
}
|
||||
|
||||
private byte[] buildWorkbookBytes() throws Exception {
|
||||
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||
XSSFSheet sheet = workbook.createSheet("Sheet1");
|
||||
sheet.createRow(0).createCell(0).setCellValue("商品");
|
||||
sheet.getRow(0).createCell(1).setCellValue("图片");
|
||||
sheet.createRow(1).createCell(0).setCellValue("手机");
|
||||
addPicture(workbook, sheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||
return writeWorkbook(workbook);
|
||||
}
|
||||
|
||||
private byte[] buildWorkbookBytesWithUnicodeSheetNames() throws Exception {
|
||||
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||
|
||||
XSSFSheet detailSheet = workbook.createSheet("明细");
|
||||
detailSheet.createRow(0).createCell(0).setCellValue("图片");
|
||||
addPicture(workbook, detailSheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||
|
||||
XSSFSheet summarySheet = workbook.createSheet("汇总");
|
||||
summarySheet.createRow(0).createCell(0).setCellValue("图片");
|
||||
addPicture(workbook, summarySheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||
|
||||
return writeWorkbook(workbook);
|
||||
}
|
||||
|
||||
private byte[] buildWorkbookBytesWithJpegImage() throws Exception {
|
||||
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||
XSSFSheet sheet = workbook.createSheet("Sheet1");
|
||||
sheet.createRow(0).createCell(0).setCellValue("图片");
|
||||
addPicture(workbook, sheet, 1, 1, createImageBytes("jpg"), XSSFWorkbook.PICTURE_TYPE_JPEG);
|
||||
return writeWorkbook(workbook);
|
||||
}
|
||||
|
||||
private byte[] buildWorkbookBytesWithImageOnlySheet() throws Exception {
|
||||
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||
XSSFSheet sheet = workbook.createSheet("Sheet1");
|
||||
addPicture(workbook, sheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||
return writeWorkbook(workbook);
|
||||
}
|
||||
|
||||
private void addPicture(XSSFWorkbook workbook,
|
||||
XSSFSheet sheet,
|
||||
int rowIndex,
|
||||
int colIndex,
|
||||
byte[] imageBytes,
|
||||
int pictureType) {
|
||||
int pictureIndex = workbook.addPicture(imageBytes, pictureType);
|
||||
XSSFDrawing drawing = sheet.createDrawingPatriarch();
|
||||
ClientAnchor anchor = workbook.getCreationHelper().createClientAnchor();
|
||||
anchor.setRow1(rowIndex);
|
||||
anchor.setCol1(colIndex);
|
||||
anchor.setRow2(rowIndex + 1);
|
||||
anchor.setCol2(colIndex + 1);
|
||||
drawing.createPicture(anchor, pictureIndex);
|
||||
}
|
||||
|
||||
private byte[] writeWorkbook(XSSFWorkbook workbook) throws Exception {
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
workbook.write(outputStream);
|
||||
workbook.close();
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private byte[] createImageBytes(String format) throws Exception {
|
||||
BufferedImage image = new BufferedImage(2, 2, BufferedImage.TYPE_INT_RGB);
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
ImageIO.write(image, format, outputStream);
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private MineruProperties defaultProperties() {
|
||||
MineruProperties properties = new MineruProperties();
|
||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||
return properties;
|
||||
}
|
||||
|
||||
private Executor directExecutor() {
|
||||
return new Executor() {
|
||||
@Override
|
||||
public void execute(Runnable command) {
|
||||
command.run();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private XlsxParseArtifact extractXlsxArtifact(ParseResult result) {
|
||||
ParseArtifacts artifacts = result.getArtifacts();
|
||||
Assert.assertNotNull(artifacts);
|
||||
Object artifact = artifacts.getExtraJsonArtifacts().get("xlsx");
|
||||
Assert.assertTrue(artifact instanceof XlsxParseArtifact);
|
||||
return (XlsxParseArtifact) artifact;
|
||||
}
|
||||
|
||||
/**
|
||||
* 手动执行的测试执行器,用于验证异步任务状态流转。
|
||||
*/
|
||||
private static class ManualExecutor implements Executor {
|
||||
|
||||
private final Queue<Runnable> tasks = new ArrayDeque<Runnable>();
|
||||
|
||||
@Override
|
||||
public void execute(Runnable command) {
|
||||
tasks.offer(command);
|
||||
}
|
||||
|
||||
private void runNext() {
|
||||
Runnable task = tasks.poll();
|
||||
Assert.assertNotNull("应当存在待执行任务", task);
|
||||
task.run();
|
||||
}
|
||||
}
|
||||
|
||||
private static class RecordingClient extends MineruClient {
|
||||
|
||||
private RecordingClient(MineruProperties properties) {
|
||||
super(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
@Override
|
||||
public MineruResultPayload parse(com.easyagents.document.core.entity.ParseRequest request) {
|
||||
return new MineruMapper(testProperties()).toResultPayload(syncPayload());
|
||||
}
|
||||
|
||||
private JSONObject syncPayload() {
|
||||
JSONObject payload = new JSONObject();
|
||||
payload.put("backend", "vlm-http-client");
|
||||
payload.put("version", "3.0.9");
|
||||
JSONObject result = new JSONObject();
|
||||
result.put("md_content", "图片文字描述");
|
||||
JSONObject results = new JSONObject();
|
||||
results.put("image", result);
|
||||
payload.put("results", results);
|
||||
return payload;
|
||||
}
|
||||
|
||||
private static MineruProperties testProperties() {
|
||||
MineruProperties properties = new MineruProperties();
|
||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||
return properties;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user