feat: 扩展 Office 文档解析能力
- 重构 document-core 与 MinerU 公共层,补齐 Office 异步任务基础设施
- 新增 PPTX/XLSX 解析模块与 starter 自动装配
- 补充 README 与相关测试覆盖
This commit is contained in:
@@ -15,7 +15,7 @@ Easy-Agents 是一个轻量、可扩展的 Java AI 应用开发框架,覆盖
|
||||
|
||||
- `easy-agents-bom`:依赖版本管理(BOM)。
|
||||
- `easy-agents-core`:核心抽象与基础能力。
|
||||
- `easy-agents-document`:统一文档解析能力域,当前提供 PDF 解析抽象与 MinerU provider。
|
||||
- `easy-agents-document`:统一文档解析能力域,当前提供 PDF、PPTX、XLSX 解析抽象与 MinerU 复用能力。
|
||||
- `easy-agents-chat`:对话模型接入实现集合。
|
||||
- `easy-agents-embedding`:向量化模型实现集合。
|
||||
- `easy-agents-rerank`:重排模型实现集合。
|
||||
|
||||
@@ -66,6 +66,16 @@
|
||||
<artifactId>easy-agents-document-pdf</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document-pptx</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document-xlsx</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-rag-core</artifactId>
|
||||
|
||||
@@ -24,5 +24,26 @@
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.alibaba.fastjson2</groupId>
|
||||
<artifactId>fastjson2</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.squareup.okhttp3</groupId>
|
||||
<artifactId>okhttp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
package com.easyagents.document.core;
|
||||
|
||||
import com.easyagents.document.core.model.ParseRequest;
|
||||
import com.easyagents.document.core.model.ParseResponse;
|
||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
/**
|
||||
* 统一文档解析服务抽象。
|
||||
@@ -11,7 +11,7 @@ import com.easyagents.document.core.model.ParseTaskStatus;
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
*/
|
||||
public interface DocumentParseService {
|
||||
public interface DocumentParseService<R extends ParseRequest> {
|
||||
|
||||
/**
|
||||
* 同步解析文档并直接返回结果。
|
||||
|
||||
@@ -0,0 +1,173 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.Executor;
|
||||
|
||||
/**
|
||||
* 文档异步任务管理器。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class DocumentAsyncTaskManager {
|
||||
|
||||
private final DocumentAsyncTaskRepository repository;
|
||||
private final Executor executor;
|
||||
|
||||
/**
|
||||
* 创建任务管理器。
|
||||
*
|
||||
* @param repository 任务仓库
|
||||
* @param executor 执行器
|
||||
*/
|
||||
public DocumentAsyncTaskManager(DocumentAsyncTaskRepository repository, Executor executor) {
|
||||
if (repository == null) {
|
||||
throw new IllegalArgumentException("DocumentAsyncTaskRepository must not be null");
|
||||
}
|
||||
if (executor == null) {
|
||||
throw new IllegalArgumentException("Executor must not be null");
|
||||
}
|
||||
this.repository = repository;
|
||||
this.executor = executor;
|
||||
}
|
||||
|
||||
/**
|
||||
* 提交异步任务。
|
||||
*
|
||||
* @param backend 后端标识
|
||||
* @param fileNames 文件名列表
|
||||
* @param runner 任务执行器
|
||||
* @return 初始任务状态
|
||||
*/
|
||||
public ParseTaskStatus submit(String backend, List<String> fileNames, final DocumentAsyncTaskRunner runner) {
|
||||
final String taskId = UUID.randomUUID().toString();
|
||||
final ParseTaskStatus status = new ParseTaskStatus();
|
||||
status.setTaskId(taskId);
|
||||
status.setStatus("queued");
|
||||
status.setBackend(backend);
|
||||
status.setFileNames(fileNames == null ? new ArrayList<String>() : new ArrayList<String>(fileNames));
|
||||
status.setCreatedAt(Instant.now().toString());
|
||||
status.setCurrentStage("queued");
|
||||
status.setProgressPercent(0);
|
||||
status.setProcessedItems(0);
|
||||
status.setTotalItems(fileNames == null ? 0 : fileNames.size());
|
||||
status.setStatusMessage("任务已进入队列");
|
||||
|
||||
final DocumentAsyncTaskRecord record = new DocumentAsyncTaskRecord(status);
|
||||
repository.save(record);
|
||||
|
||||
executor.execute(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
markRunning(record);
|
||||
try {
|
||||
ParseResponse response = runner.run(new RepositoryBackedTaskUpdater(record));
|
||||
ParseTaskStatus completed = record.getStatusSnapshot();
|
||||
completed.setStatus("completed");
|
||||
completed.setCompletedAt(Instant.now().toString());
|
||||
completed.setProgressPercent(100);
|
||||
completed.setCurrentStage("completed");
|
||||
completed.setStatusMessage("任务执行完成");
|
||||
record.setResult(response);
|
||||
record.updateStatus(completed);
|
||||
} catch (Exception exception) {
|
||||
ParseTaskStatus failed = record.getStatusSnapshot();
|
||||
failed.setStatus("failed");
|
||||
failed.setCompletedAt(Instant.now().toString());
|
||||
failed.setCurrentStage("failed");
|
||||
failed.setStatusMessage(exception.getMessage());
|
||||
failed.setError(exception.getMessage());
|
||||
record.updateStatus(failed);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return record.getStatusSnapshot();
|
||||
}
|
||||
|
||||
/**
|
||||
* 查询任务状态。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 任务状态
|
||||
*/
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
return requireRecord(taskId).getStatusSnapshot();
|
||||
}
|
||||
|
||||
/**
|
||||
* 查询任务聚合信息。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 聚合信息
|
||||
*/
|
||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||
return requireRecord(taskId).getTaskInfoSnapshot();
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取任务结果。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 任务结果
|
||||
*/
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
DocumentAsyncTaskRecord record = requireRecord(taskId);
|
||||
ParseTaskStatus status = record.getStatusSnapshot();
|
||||
if (!"completed".equalsIgnoreCase(status.getStatus())) {
|
||||
throw new DocumentParseException("Document async task is not completed: " + taskId);
|
||||
}
|
||||
return record.getResult();
|
||||
}
|
||||
|
||||
private DocumentAsyncTaskRecord requireRecord(String taskId) {
|
||||
if (!StringUtil.hasText(taskId)) {
|
||||
throw new IllegalArgumentException("taskId must not be empty");
|
||||
}
|
||||
DocumentAsyncTaskRecord record = repository.find(taskId);
|
||||
if (record == null) {
|
||||
throw new DocumentParseException("Document async task not found: " + taskId);
|
||||
}
|
||||
return record;
|
||||
}
|
||||
|
||||
private void markRunning(DocumentAsyncTaskRecord record) {
|
||||
ParseTaskStatus status = record.getStatusSnapshot();
|
||||
status.setStatus("preparing");
|
||||
status.setStartedAt(Instant.now().toString());
|
||||
status.setCurrentStage("preparing");
|
||||
status.setProgressPercent(0);
|
||||
status.setStatusMessage("任务开始执行");
|
||||
record.updateStatus(status);
|
||||
}
|
||||
|
||||
private static class RepositoryBackedTaskUpdater implements DocumentAsyncTaskUpdater {
|
||||
|
||||
private final DocumentAsyncTaskRecord record;
|
||||
|
||||
private RepositoryBackedTaskUpdater(DocumentAsyncTaskRecord record) {
|
||||
this.record = record;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void update(String stage, Integer progressPercent, Integer processedItems, Integer totalItems, String statusMessage) {
|
||||
ParseTaskStatus status = record.getStatusSnapshot();
|
||||
status.setStatus("completed".equalsIgnoreCase(stage) ? "completed" : "running");
|
||||
status.setCurrentStage(stage);
|
||||
status.setProgressPercent(progressPercent);
|
||||
status.setProcessedItems(processedItems);
|
||||
status.setTotalItems(totalItems);
|
||||
status.setStatusMessage(statusMessage);
|
||||
record.updateStatus(status);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,101 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
/**
|
||||
* 文档异步任务记录。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class DocumentAsyncTaskRecord {
|
||||
|
||||
private final ParseTaskStatus status;
|
||||
private ParseResponse result;
|
||||
|
||||
/**
|
||||
* 创建任务记录。
|
||||
*
|
||||
* @param status 初始状态
|
||||
*/
|
||||
public DocumentAsyncTaskRecord(ParseTaskStatus status) {
|
||||
this.status = status;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取状态快照。
|
||||
*
|
||||
* @return 状态快照
|
||||
*/
|
||||
public synchronized ParseTaskStatus getStatusSnapshot() {
|
||||
return copyStatus(status);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取聚合信息快照。
|
||||
*
|
||||
* @return 聚合信息
|
||||
*/
|
||||
public synchronized ParseTaskInfo getTaskInfoSnapshot() {
|
||||
ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(status);
|
||||
taskInfo.setResult(result);
|
||||
return taskInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取结果。
|
||||
*
|
||||
* @return 最终结果
|
||||
*/
|
||||
public synchronized ParseResponse getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新任务状态。
|
||||
*
|
||||
* @param newStatus 新状态
|
||||
*/
|
||||
public synchronized void updateStatus(ParseTaskStatus newStatus) {
|
||||
if (newStatus == null) {
|
||||
return;
|
||||
}
|
||||
copyInto(newStatus, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新任务结果。
|
||||
*
|
||||
* @param result 最终结果
|
||||
*/
|
||||
public synchronized void setResult(ParseResponse result) {
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
private ParseTaskStatus copyStatus(ParseTaskStatus source) {
|
||||
ParseTaskStatus copy = new ParseTaskStatus();
|
||||
copyInto(source, copy);
|
||||
return copy;
|
||||
}
|
||||
|
||||
private void copyInto(ParseTaskStatus source, ParseTaskStatus target) {
|
||||
target.setTaskId(source.getTaskId());
|
||||
target.setStatus(source.getStatus());
|
||||
target.setBackend(source.getBackend());
|
||||
target.setFileNames(source.getFileNames());
|
||||
target.setCreatedAt(source.getCreatedAt());
|
||||
target.setStartedAt(source.getStartedAt());
|
||||
target.setCompletedAt(source.getCompletedAt());
|
||||
target.setError(source.getError());
|
||||
target.setStatusUrl(source.getStatusUrl());
|
||||
target.setResultUrl(source.getResultUrl());
|
||||
target.setQueuedAhead(source.getQueuedAhead());
|
||||
target.setProgressPercent(source.getProgressPercent());
|
||||
target.setCurrentStage(source.getCurrentStage());
|
||||
target.setProcessedItems(source.getProcessedItems());
|
||||
target.setTotalItems(source.getTotalItems());
|
||||
target.setStatusMessage(source.getStatusMessage());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
/**
|
||||
* 文档异步任务仓库。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public interface DocumentAsyncTaskRepository {
|
||||
|
||||
/**
|
||||
* 保存任务记录。
|
||||
*
|
||||
* @param record 任务记录
|
||||
*/
|
||||
void save(DocumentAsyncTaskRecord record);
|
||||
|
||||
/**
|
||||
* 获取任务记录。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 任务记录,不存在时返回 {@code null}
|
||||
*/
|
||||
DocumentAsyncTaskRecord find(String taskId);
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
|
||||
/**
|
||||
* 文档异步任务执行器。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public interface DocumentAsyncTaskRunner {
|
||||
|
||||
/**
|
||||
* 执行任务。
|
||||
*
|
||||
* @param updater 状态更新器
|
||||
* @return 解析结果
|
||||
* @throws Exception 执行异常
|
||||
*/
|
||||
ParseResponse run(DocumentAsyncTaskUpdater updater) throws Exception;
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
/**
 * Callback through which a running asynchronous document task reports its progress.
 *
 * @author Codex
 * @since 2026-04-16
 */
public interface DocumentAsyncTaskUpdater {

    /**
     * Reports the current state of the task.
     *
     * @param stage           name of the current stage
     * @param progressPercent progress as a percentage
     * @param processedItems  number of items processed so far
     * @param totalItems      total number of items
     * @param statusMessage   human-readable status description
     */
    void update(String stage, Integer progressPercent, Integer processedItems, Integer totalItems, String statusMessage);
}
|
||||
@@ -0,0 +1,28 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* 基于内存的异步任务仓库。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class InMemoryDocumentAsyncTaskRepository implements DocumentAsyncTaskRepository {
|
||||
|
||||
private final Map<String, DocumentAsyncTaskRecord> records = new ConcurrentHashMap<String, DocumentAsyncTaskRecord>();
|
||||
|
||||
@Override
|
||||
public void save(DocumentAsyncTaskRecord record) {
|
||||
if (record == null || record.getStatusSnapshot() == null || record.getStatusSnapshot().getTaskId() == null) {
|
||||
return;
|
||||
}
|
||||
records.put(record.getStatusSnapshot().getTaskId(), record);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocumentAsyncTaskRecord find(String taskId) {
|
||||
return records.get(taskId);
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -16,6 +16,7 @@ public class DocumentImage {
|
||||
private String mimeType;
|
||||
private String sourcePath;
|
||||
private String dataUrl;
|
||||
private byte[] content;
|
||||
private List<Double> boundingBox = new ArrayList<Double>();
|
||||
private List<String> captions = new ArrayList<String>();
|
||||
private List<String> footnotes = new ArrayList<String>();
|
||||
@@ -60,6 +61,14 @@ public class DocumentImage {
|
||||
this.dataUrl = dataUrl;
|
||||
}
|
||||
|
||||
public byte[] getContent() {
|
||||
return content;
|
||||
}
|
||||
|
||||
public void setContent(byte[] content) {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
public List<Double> getBoundingBox() {
|
||||
return boundingBox;
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -13,12 +13,7 @@ public class ParseRequest {
|
||||
|
||||
private List<ParseFile> files = new ArrayList<ParseFile>();
|
||||
private String backend;
|
||||
private String parseMethod = "auto";
|
||||
private List<String> languages = new ArrayList<String>();
|
||||
private Boolean formulaEnabled = true;
|
||||
private Boolean tableEnabled = true;
|
||||
private Integer startPageIndex = 0;
|
||||
private Integer endPageIndex = 99999;
|
||||
private Boolean returnMarkdown = true;
|
||||
private Boolean returnMiddleJson = true;
|
||||
private Boolean returnContentList = true;
|
||||
@@ -38,6 +33,25 @@ public class ParseRequest {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 复制通用字段到目标请求。
|
||||
*
|
||||
* @param target 目标请求
|
||||
*/
|
||||
public void copyCommonFieldsTo(ParseRequest target) {
|
||||
if (target == null) {
|
||||
return;
|
||||
}
|
||||
target.setFiles(new ArrayList<ParseFile>(getFiles()));
|
||||
target.setBackend(getBackend());
|
||||
target.setLanguages(new ArrayList<String>(getLanguages()));
|
||||
target.setReturnMarkdown(getReturnMarkdown());
|
||||
target.setReturnMiddleJson(getReturnMiddleJson());
|
||||
target.setReturnContentList(getReturnContentList());
|
||||
target.setReturnModelOutput(getReturnModelOutput());
|
||||
target.setReturnImages(getReturnImages());
|
||||
}
|
||||
|
||||
public List<ParseFile> getFiles() {
|
||||
return files;
|
||||
}
|
||||
@@ -54,14 +68,6 @@ public class ParseRequest {
|
||||
this.backend = backend;
|
||||
}
|
||||
|
||||
public String getParseMethod() {
|
||||
return parseMethod;
|
||||
}
|
||||
|
||||
public void setParseMethod(String parseMethod) {
|
||||
this.parseMethod = parseMethod;
|
||||
}
|
||||
|
||||
public List<String> getLanguages() {
|
||||
return languages;
|
||||
}
|
||||
@@ -70,38 +76,6 @@ public class ParseRequest {
|
||||
this.languages = languages == null ? new ArrayList<String>() : languages;
|
||||
}
|
||||
|
||||
public Boolean getFormulaEnabled() {
|
||||
return formulaEnabled;
|
||||
}
|
||||
|
||||
public void setFormulaEnabled(Boolean formulaEnabled) {
|
||||
this.formulaEnabled = formulaEnabled;
|
||||
}
|
||||
|
||||
public Boolean getTableEnabled() {
|
||||
return tableEnabled;
|
||||
}
|
||||
|
||||
public void setTableEnabled(Boolean tableEnabled) {
|
||||
this.tableEnabled = tableEnabled;
|
||||
}
|
||||
|
||||
public Integer getStartPageIndex() {
|
||||
return startPageIndex;
|
||||
}
|
||||
|
||||
public void setStartPageIndex(Integer startPageIndex) {
|
||||
this.startPageIndex = startPageIndex;
|
||||
}
|
||||
|
||||
public Integer getEndPageIndex() {
|
||||
return endPageIndex;
|
||||
}
|
||||
|
||||
public void setEndPageIndex(Integer endPageIndex) {
|
||||
this.endPageIndex = endPageIndex;
|
||||
}
|
||||
|
||||
public Boolean getReturnMarkdown() {
|
||||
return returnMarkdown;
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
/**
|
||||
* 异步任务聚合查询结果。
|
||||
@@ -35,6 +35,11 @@ public class ParseTaskInfo extends ParseTaskStatus {
|
||||
taskInfo.setStatusUrl(status.getStatusUrl());
|
||||
taskInfo.setResultUrl(status.getResultUrl());
|
||||
taskInfo.setQueuedAhead(status.getQueuedAhead());
|
||||
taskInfo.setProgressPercent(status.getProgressPercent());
|
||||
taskInfo.setCurrentStage(status.getCurrentStage());
|
||||
taskInfo.setProcessedItems(status.getProcessedItems());
|
||||
taskInfo.setTotalItems(status.getTotalItems());
|
||||
taskInfo.setStatusMessage(status.getStatusMessage());
|
||||
return taskInfo;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -22,6 +22,11 @@ public class ParseTaskStatus {
|
||||
private String statusUrl;
|
||||
private String resultUrl;
|
||||
private Integer queuedAhead;
|
||||
private Integer progressPercent;
|
||||
private String currentStage;
|
||||
private Integer processedItems;
|
||||
private Integer totalItems;
|
||||
private String statusMessage;
|
||||
|
||||
public String getTaskId() {
|
||||
return taskId;
|
||||
@@ -110,4 +115,44 @@ public class ParseTaskStatus {
|
||||
public void setQueuedAhead(Integer queuedAhead) {
|
||||
this.queuedAhead = queuedAhead;
|
||||
}
|
||||
|
||||
public Integer getProgressPercent() {
|
||||
return progressPercent;
|
||||
}
|
||||
|
||||
public void setProgressPercent(Integer progressPercent) {
|
||||
this.progressPercent = progressPercent;
|
||||
}
|
||||
|
||||
public String getCurrentStage() {
|
||||
return currentStage;
|
||||
}
|
||||
|
||||
public void setCurrentStage(String currentStage) {
|
||||
this.currentStage = currentStage;
|
||||
}
|
||||
|
||||
public Integer getProcessedItems() {
|
||||
return processedItems;
|
||||
}
|
||||
|
||||
public void setProcessedItems(Integer processedItems) {
|
||||
this.processedItems = processedItems;
|
||||
}
|
||||
|
||||
public Integer getTotalItems() {
|
||||
return totalItems;
|
||||
}
|
||||
|
||||
public void setTotalItems(Integer totalItems) {
|
||||
this.totalItems = totalItems;
|
||||
}
|
||||
|
||||
public String getStatusMessage() {
|
||||
return statusMessage;
|
||||
}
|
||||
|
||||
public void setStatusMessage(String statusMessage) {
|
||||
this.statusMessage = statusMessage;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
/**
|
||||
* PDF 解析请求。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class PdfParseRequest extends ParseRequest {
|
||||
|
||||
private String parseMethod = "auto";
|
||||
private Boolean formulaEnabled = true;
|
||||
private Boolean tableEnabled = true;
|
||||
private Integer startPageIndex = 0;
|
||||
private Integer endPageIndex = 99999;
|
||||
|
||||
/**
|
||||
* 将通用请求转换为 PDF 请求。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return PDF 请求
|
||||
*/
|
||||
public static PdfParseRequest from(ParseRequest request) {
|
||||
PdfParseRequest pdfParseRequest = new PdfParseRequest();
|
||||
if (request == null) {
|
||||
return pdfParseRequest;
|
||||
}
|
||||
request.copyCommonFieldsTo(pdfParseRequest);
|
||||
if (request instanceof PdfParseRequest) {
|
||||
PdfParseRequest source = (PdfParseRequest) request;
|
||||
pdfParseRequest.setParseMethod(source.getParseMethod());
|
||||
pdfParseRequest.setFormulaEnabled(source.getFormulaEnabled());
|
||||
pdfParseRequest.setTableEnabled(source.getTableEnabled());
|
||||
pdfParseRequest.setStartPageIndex(source.getStartPageIndex());
|
||||
pdfParseRequest.setEndPageIndex(source.getEndPageIndex());
|
||||
}
|
||||
return pdfParseRequest;
|
||||
}
|
||||
|
||||
public String getParseMethod() {
|
||||
return parseMethod;
|
||||
}
|
||||
|
||||
public void setParseMethod(String parseMethod) {
|
||||
this.parseMethod = parseMethod;
|
||||
}
|
||||
|
||||
public Boolean getFormulaEnabled() {
|
||||
return formulaEnabled;
|
||||
}
|
||||
|
||||
public void setFormulaEnabled(Boolean formulaEnabled) {
|
||||
this.formulaEnabled = formulaEnabled;
|
||||
}
|
||||
|
||||
public Boolean getTableEnabled() {
|
||||
return tableEnabled;
|
||||
}
|
||||
|
||||
public void setTableEnabled(Boolean tableEnabled) {
|
||||
this.tableEnabled = tableEnabled;
|
||||
}
|
||||
|
||||
public Integer getStartPageIndex() {
|
||||
return startPageIndex;
|
||||
}
|
||||
|
||||
public void setStartPageIndex(Integer startPageIndex) {
|
||||
this.startPageIndex = startPageIndex;
|
||||
}
|
||||
|
||||
public Integer getEndPageIndex() {
|
||||
return endPageIndex;
|
||||
}
|
||||
|
||||
public void setEndPageIndex(Integer endPageIndex) {
|
||||
this.endPageIndex = endPageIndex;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
/**
|
||||
* PPTX 解析请求。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class PptxParseRequest extends ParseRequest {
|
||||
|
||||
private Integer startSlideIndex = 0;
|
||||
private Integer endSlideIndex;
|
||||
private Double renderScale = 2.0d;
|
||||
private String imageFormat = "png";
|
||||
private Boolean includeSlideImageReference = true;
|
||||
|
||||
/**
|
||||
* 将通用请求转换为 PPTX 请求。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return PPTX 请求
|
||||
*/
|
||||
public static PptxParseRequest from(ParseRequest request) {
|
||||
PptxParseRequest pptxParseRequest = new PptxParseRequest();
|
||||
if (request == null) {
|
||||
return pptxParseRequest;
|
||||
}
|
||||
request.copyCommonFieldsTo(pptxParseRequest);
|
||||
if (request instanceof PptxParseRequest) {
|
||||
PptxParseRequest source = (PptxParseRequest) request;
|
||||
pptxParseRequest.setStartSlideIndex(source.getStartSlideIndex());
|
||||
pptxParseRequest.setEndSlideIndex(source.getEndSlideIndex());
|
||||
pptxParseRequest.setRenderScale(source.getRenderScale());
|
||||
pptxParseRequest.setImageFormat(source.getImageFormat());
|
||||
pptxParseRequest.setIncludeSlideImageReference(source.getIncludeSlideImageReference());
|
||||
}
|
||||
return pptxParseRequest;
|
||||
}
|
||||
|
||||
public Integer getStartSlideIndex() {
|
||||
return startSlideIndex;
|
||||
}
|
||||
|
||||
public void setStartSlideIndex(Integer startSlideIndex) {
|
||||
this.startSlideIndex = startSlideIndex;
|
||||
}
|
||||
|
||||
public Integer getEndSlideIndex() {
|
||||
return endSlideIndex;
|
||||
}
|
||||
|
||||
public void setEndSlideIndex(Integer endSlideIndex) {
|
||||
this.endSlideIndex = endSlideIndex;
|
||||
}
|
||||
|
||||
public Double getRenderScale() {
|
||||
return renderScale;
|
||||
}
|
||||
|
||||
public void setRenderScale(Double renderScale) {
|
||||
this.renderScale = renderScale;
|
||||
}
|
||||
|
||||
public String getImageFormat() {
|
||||
return imageFormat;
|
||||
}
|
||||
|
||||
public void setImageFormat(String imageFormat) {
|
||||
this.imageFormat = imageFormat;
|
||||
}
|
||||
|
||||
public Boolean getIncludeSlideImageReference() {
|
||||
return includeSlideImageReference;
|
||||
}
|
||||
|
||||
public void setIncludeSlideImageReference(Boolean includeSlideImageReference) {
|
||||
this.includeSlideImageReference = includeSlideImageReference;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* XLSX 解析请求。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class XlsxParseRequest extends ParseRequest {
|
||||
|
||||
private List<String> sheetNames = new ArrayList<String>();
|
||||
private Boolean includeHiddenSheets = false;
|
||||
private Boolean ocrEmbeddedImages = true;
|
||||
private Integer maxRowsPerSheet;
|
||||
private Boolean includeImageAppendix = true;
|
||||
|
||||
/**
|
||||
* 将通用请求转换为 XLSX 请求。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return XLSX 请求
|
||||
*/
|
||||
public static XlsxParseRequest from(ParseRequest request) {
|
||||
XlsxParseRequest xlsxParseRequest = new XlsxParseRequest();
|
||||
if (request == null) {
|
||||
return xlsxParseRequest;
|
||||
}
|
||||
request.copyCommonFieldsTo(xlsxParseRequest);
|
||||
if (request instanceof XlsxParseRequest) {
|
||||
XlsxParseRequest source = (XlsxParseRequest) request;
|
||||
xlsxParseRequest.setSheetNames(new ArrayList<String>(source.getSheetNames()));
|
||||
xlsxParseRequest.setIncludeHiddenSheets(source.getIncludeHiddenSheets());
|
||||
xlsxParseRequest.setOcrEmbeddedImages(source.getOcrEmbeddedImages());
|
||||
xlsxParseRequest.setMaxRowsPerSheet(source.getMaxRowsPerSheet());
|
||||
xlsxParseRequest.setIncludeImageAppendix(source.getIncludeImageAppendix());
|
||||
}
|
||||
return xlsxParseRequest;
|
||||
}
|
||||
|
||||
public List<String> getSheetNames() {
|
||||
return sheetNames;
|
||||
}
|
||||
|
||||
public void setSheetNames(List<String> sheetNames) {
|
||||
this.sheetNames = sheetNames == null ? new ArrayList<String>() : sheetNames;
|
||||
}
|
||||
|
||||
public Boolean getIncludeHiddenSheets() {
|
||||
return includeHiddenSheets;
|
||||
}
|
||||
|
||||
public void setIncludeHiddenSheets(Boolean includeHiddenSheets) {
|
||||
this.includeHiddenSheets = includeHiddenSheets;
|
||||
}
|
||||
|
||||
public Boolean getOcrEmbeddedImages() {
|
||||
return ocrEmbeddedImages;
|
||||
}
|
||||
|
||||
public void setOcrEmbeddedImages(Boolean ocrEmbeddedImages) {
|
||||
this.ocrEmbeddedImages = ocrEmbeddedImages;
|
||||
}
|
||||
|
||||
public Integer getMaxRowsPerSheet() {
|
||||
return maxRowsPerSheet;
|
||||
}
|
||||
|
||||
public void setMaxRowsPerSheet(Integer maxRowsPerSheet) {
|
||||
this.maxRowsPerSheet = maxRowsPerSheet;
|
||||
}
|
||||
|
||||
public Boolean getIncludeImageAppendix() {
|
||||
return includeImageAppendix;
|
||||
}
|
||||
|
||||
public void setIncludeImageAppendix(Boolean includeImageAppendix) {
|
||||
this.includeImageAppendix = includeImageAppendix;
|
||||
}
|
||||
}
|
||||
@@ -1,11 +1,11 @@
|
||||
package com.easyagents.document.pdf.mineru;
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.model.ParseFile;
|
||||
import com.easyagents.document.core.model.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import okhttp3.MediaType;
|
||||
import okhttp3.MultipartBody;
|
||||
import okhttp3.OkHttpClient;
|
||||
@@ -15,6 +15,7 @@ import okhttp3.Response;
|
||||
import okhttp3.ResponseBody;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URLConnection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
@@ -23,11 +24,11 @@ import java.util.concurrent.TimeUnit;
|
||||
* MinerU HTTP 客户端。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruPdfClient {
|
||||
public class MineruClient {
|
||||
|
||||
private static final MediaType DEFAULT_PDF_MEDIA_TYPE = MediaType.parse("application/pdf");
|
||||
private static final MediaType DEFAULT_MEDIA_TYPE = MediaType.parse("application/octet-stream");
|
||||
|
||||
private final String baseUrl;
|
||||
private final OkHttpClient okHttpClient;
|
||||
@@ -39,7 +40,7 @@ public class MineruPdfClient {
|
||||
* @param properties MinerU 配置
|
||||
* @param mineruMapper DTO 映射器
|
||||
*/
|
||||
public MineruPdfClient(MineruProperties properties, MineruMapper mineruMapper) {
|
||||
public MineruClient(MineruProperties properties, MineruMapper mineruMapper) {
|
||||
this(
|
||||
properties,
|
||||
new OkHttpClient.Builder()
|
||||
@@ -58,7 +59,7 @@ public class MineruPdfClient {
|
||||
* @param okHttpClient HTTP 客户端
|
||||
* @param mineruMapper DTO 映射器
|
||||
*/
|
||||
public MineruPdfClient(MineruProperties properties, OkHttpClient okHttpClient, MineruMapper mineruMapper) {
|
||||
public MineruClient(MineruProperties properties, OkHttpClient okHttpClient, MineruMapper mineruMapper) {
|
||||
if (properties == null || !StringUtil.hasText(properties.getBaseUrl())) {
|
||||
throw new IllegalArgumentException("MinerU baseUrl must not be empty");
|
||||
}
|
||||
@@ -165,7 +166,7 @@ public class MineruPdfClient {
|
||||
}
|
||||
MediaType mediaType = StringUtil.hasText(file.getContentType())
|
||||
? MediaType.parse(file.getContentType())
|
||||
: DEFAULT_PDF_MEDIA_TYPE;
|
||||
: detectMediaType(file.getFileName());
|
||||
formBuilder.addFormDataPart(
|
||||
"files",
|
||||
file.getFileName(),
|
||||
@@ -208,4 +209,9 @@ public class MineruPdfClient {
|
||||
}
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
private MediaType detectMediaType(String fileName) {
|
||||
String mimeType = URLConnection.guessContentTypeFromName(fileName);
|
||||
return StringUtil.hasText(mimeType) ? MediaType.parse(mimeType) : DEFAULT_MEDIA_TYPE;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,218 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* 基于 MinerU API 的文档解析服务,支持 docx 文档和 pdf 文档。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruDocumentParseService<R extends ParseRequest> implements DocumentParseService<R> {
|
||||
|
||||
public static final String PROVIDER_NAME = "mineru";
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(MineruDocumentParseService.class);
|
||||
|
||||
private final MineruProperties properties;
|
||||
private final MineruClient client;
|
||||
private final MineruMapper mapper;
|
||||
|
||||
/**
|
||||
* 创建默认服务实例。
|
||||
*
|
||||
* @param properties MinerU 配置
|
||||
*/
|
||||
public MineruDocumentParseService(MineruProperties properties) {
|
||||
this(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建默认服务实例。
|
||||
*
|
||||
* @param properties MinerU 配置
|
||||
* @param mapper 结果映射器
|
||||
*/
|
||||
public MineruDocumentParseService(MineruProperties properties, MineruMapper mapper) {
|
||||
this(properties, new MineruClient(properties, mapper), mapper);
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建服务实例。
|
||||
*
|
||||
* @param properties MinerU 配置
|
||||
* @param client HTTP 客户端
|
||||
* @param mapper 结果映射器
|
||||
*/
|
||||
public MineruDocumentParseService(MineruProperties properties, MineruClient client, MineruMapper mapper) {
|
||||
this.properties = properties;
|
||||
this.client = client;
|
||||
this.mapper = mapper;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||
LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend());
|
||||
ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
|
||||
LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus submit(ParseRequest request) {
|
||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||
normalizedRequest.setReturnMarkdown(true);
|
||||
normalizedRequest.setReturnMiddleJson(true);
|
||||
normalizedRequest.setReturnContentList(true);
|
||||
normalizedRequest.setReturnModelOutput(true);
|
||||
normalizedRequest.setReturnImages(true);
|
||||
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend());
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
||||
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskStatus == null ? null : taskStatus.getTaskId(),
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
|
||||
LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);
|
||||
MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
|
||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
||||
ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(mapper.toParseTaskStatus(taskStatus));
|
||||
if ("completed".equalsIgnoreCase(taskStatus.getStatus())) {
|
||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
taskInfo.setResult(response);
|
||||
}
|
||||
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskInfo.getStatus(),
|
||||
taskInfo.getResult() != null);
|
||||
return taskInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取 MinerU 配置。
|
||||
*
|
||||
* @return MinerU 配置
|
||||
*/
|
||||
protected MineruProperties getProperties() {
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* 归一化解析请求,补齐默认参数。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return 归一化后的请求
|
||||
*/
|
||||
protected ParseRequest normalizeRequest(ParseRequest request) {
|
||||
if (request == null) {
|
||||
throw new IllegalArgumentException("ParseRequest must not be null");
|
||||
}
|
||||
if (request.getFiles() == null || request.getFiles().isEmpty()) {
|
||||
throw new IllegalArgumentException("ParseRequest files must not be empty");
|
||||
}
|
||||
ParseRequest normalizedRequest = new ParseRequest();
|
||||
normalizedRequest.setFiles(new ArrayList<ParseFile>(request.getFiles()));
|
||||
normalizedRequest.setBackend(StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
|
||||
normalizedRequest.setLanguages(
|
||||
request.getLanguages() == null || request.getLanguages().isEmpty()
|
||||
? new ArrayList<String>(properties.getDefaultLangList())
|
||||
: new ArrayList<String>(request.getLanguages())
|
||||
);
|
||||
normalizedRequest.setReturnMarkdown(request.getReturnMarkdown() == null ? Boolean.TRUE : request.getReturnMarkdown());
|
||||
normalizedRequest.setReturnMiddleJson(request.getReturnMiddleJson() == null ? Boolean.TRUE : request.getReturnMiddleJson());
|
||||
normalizedRequest.setReturnContentList(request.getReturnContentList() == null ? Boolean.TRUE : request.getReturnContentList());
|
||||
normalizedRequest.setReturnModelOutput(request.getReturnModelOutput() == null ? Boolean.FALSE : request.getReturnModelOutput());
|
||||
normalizedRequest.setReturnImages(request.getReturnImages() == null ? Boolean.TRUE : request.getReturnImages());
|
||||
return normalizedRequest;
|
||||
}
|
||||
|
||||
/**
|
||||
* 校验任务 ID。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
*/
|
||||
protected void validateTaskId(String taskId) {
|
||||
if (!StringUtil.hasText(taskId)) {
|
||||
throw new IllegalArgumentException("taskId must not be empty");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 轮询任务状态直到完成或失败。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 已完成的任务状态
|
||||
*/
|
||||
protected MineruTaskStatus waitForTaskCompleted(String taskId) {
|
||||
long deadline = System.currentTimeMillis() + properties.getResultTimeoutMs();
|
||||
while (true) {
|
||||
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
||||
if ("completed".equals(taskStatus.getStatus())) {
|
||||
return taskStatus;
|
||||
}
|
||||
if ("failed".equals(taskStatus.getStatus())) {
|
||||
throw new DocumentParseException("MinerU task failed: " + taskStatus.getError());
|
||||
}
|
||||
if (System.currentTimeMillis() >= deadline) {
|
||||
throw new DocumentParseException("MinerU task result timeout: " + taskId);
|
||||
}
|
||||
try {
|
||||
Thread.sleep(properties.getPollIntervalMs());
|
||||
} catch (InterruptedException exception) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new DocumentParseException("Interrupted while waiting for MinerU task: " + taskId, exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,19 +1,20 @@
|
||||
package com.easyagents.document.pdf.mineru;
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONArray;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.model.DocumentBlock;
|
||||
import com.easyagents.document.core.model.DocumentImage;
|
||||
import com.easyagents.document.core.model.DocumentPage;
|
||||
import com.easyagents.document.core.model.DocumentTable;
|
||||
import com.easyagents.document.core.model.ParseArtifacts;
|
||||
import com.easyagents.document.core.model.ParseRequest;
|
||||
import com.easyagents.document.core.model.ParseResponse;
|
||||
import com.easyagents.document.core.model.ParseResult;
|
||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
||||
import com.easyagents.document.core.entity.DocumentBlock;
|
||||
import com.easyagents.document.core.entity.DocumentImage;
|
||||
import com.easyagents.document.core.entity.DocumentPage;
|
||||
import com.easyagents.document.core.entity.DocumentTable;
|
||||
import com.easyagents.document.core.entity.PdfParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseArtifacts;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
@@ -31,7 +32,7 @@ import java.util.zip.ZipInputStream;
|
||||
* MinerU 原始协议与统一模型之间的映射器。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruMapper {
|
||||
|
||||
@@ -71,7 +72,6 @@ public class MineruMapper {
|
||||
*/
|
||||
public Map<String, List<String>> buildAsyncFormFields(ParseRequest request) {
|
||||
Map<String, List<String>> fields = buildBaseFormFields(request);
|
||||
// 异步结果固定按全量 ZIP 返回,避免超大结果通过 JSON 传输。
|
||||
putSingleValue(fields, "return_md", "true");
|
||||
putSingleValue(fields, "return_middle_json", "true");
|
||||
putSingleValue(fields, "return_content_list", "true");
|
||||
@@ -205,19 +205,24 @@ public class MineruMapper {
|
||||
private Map<String, List<String>> buildBaseFormFields(ParseRequest request) {
|
||||
Map<String, List<String>> fields = new LinkedHashMap<String, List<String>>();
|
||||
putSingleValue(fields, "backend", StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
|
||||
putSingleValue(fields, "parse_method", StringUtil.hasText(request.getParseMethod()) ? request.getParseMethod() : properties.getDefaultParseMethod());
|
||||
putSingleValue(fields, "formula_enable", String.valueOf(boolOrDefault(request.getFormulaEnabled(), properties.getDefaultFormulaEnable())));
|
||||
putSingleValue(fields, "table_enable", String.valueOf(boolOrDefault(request.getTableEnabled(), properties.getDefaultTableEnable())));
|
||||
putSingleValue(fields, "start_page_id", String.valueOf(intOrDefault(request.getStartPageIndex(), 0)));
|
||||
putSingleValue(fields, "end_page_id", String.valueOf(intOrDefault(request.getEndPageIndex(), 99999)));
|
||||
List<String> languages = request.getLanguages();
|
||||
if (languages == null || languages.isEmpty()) {
|
||||
languages = properties.getDefaultLangList();
|
||||
}
|
||||
if (languages != null && !languages.isEmpty()) {
|
||||
// MinerU 通过重复的 lang_list 表单字段接收多语言参数。
|
||||
fields.put("lang_list", new ArrayList<String>(languages));
|
||||
}
|
||||
if (request instanceof PdfParseRequest) {
|
||||
PdfParseRequest pdfParseRequest = (PdfParseRequest) request;
|
||||
putSingleValue(fields, "parse_method",
|
||||
StringUtil.hasText(pdfParseRequest.getParseMethod()) ? pdfParseRequest.getParseMethod() : properties.getDefaultParseMethod());
|
||||
putSingleValue(fields, "formula_enable",
|
||||
String.valueOf(boolOrDefault(pdfParseRequest.getFormulaEnabled(), properties.getDefaultFormulaEnable())));
|
||||
putSingleValue(fields, "table_enable",
|
||||
String.valueOf(boolOrDefault(pdfParseRequest.getTableEnabled(), properties.getDefaultTableEnable())));
|
||||
putSingleValue(fields, "start_page_id", String.valueOf(intOrDefault(pdfParseRequest.getStartPageIndex(), 0)));
|
||||
putSingleValue(fields, "end_page_id", String.valueOf(intOrDefault(pdfParseRequest.getEndPageIndex(), 99999)));
|
||||
}
|
||||
return fields;
|
||||
}
|
||||
|
||||
@@ -240,7 +245,8 @@ public class MineruMapper {
|
||||
result.setArtifacts(artifacts);
|
||||
|
||||
Map<String, String> imageDataUrls = toStringMap(fileResult.getJSONObject("images"));
|
||||
applyStructuredArtifacts(result, imageDataUrls);
|
||||
Map<String, byte[]> imageContents = toBinaryMap(imageDataUrls);
|
||||
applyStructuredArtifacts(result, imageDataUrls, imageContents);
|
||||
if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) {
|
||||
result.getWarnings().add("MinerU did not return markdown, middle_json or content_list");
|
||||
}
|
||||
@@ -264,7 +270,6 @@ public class MineruMapper {
|
||||
JSONArray contentList = asArray(contentListArtifact);
|
||||
Object modelOutput = modelOutputArtifact;
|
||||
|
||||
// MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里,并且直接返回数组。
|
||||
if (contentList == null && middleArtifact instanceof JSONArray) {
|
||||
contentList = (JSONArray) middleArtifact;
|
||||
middleJson = null;
|
||||
@@ -289,10 +294,12 @@ public class MineruMapper {
|
||||
result.setArtifacts(artifacts);
|
||||
|
||||
Map<String, String> imageDataUrls = new LinkedHashMap<String, String>();
|
||||
Map<String, byte[]> imageContents = new LinkedHashMap<String, byte[]>();
|
||||
for (Map.Entry<String, byte[]> imageEntry : bundle.images.entrySet()) {
|
||||
imageDataUrls.put(imageEntry.getKey(), toDataUrl(imageEntry.getKey(), imageEntry.getValue()));
|
||||
imageContents.put(imageEntry.getKey(), imageEntry.getValue());
|
||||
}
|
||||
applyStructuredArtifacts(result, imageDataUrls);
|
||||
applyStructuredArtifacts(result, imageDataUrls, imageContents);
|
||||
|
||||
if (markdown == null && middleJson == null && contentList == null) {
|
||||
throw new DocumentParseException("MinerU ZIP result missing critical artifacts for file: " + fileName);
|
||||
@@ -300,7 +307,7 @@ public class MineruMapper {
|
||||
return result;
|
||||
}
|
||||
|
||||
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls) {
|
||||
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls, Map<String, byte[]> imageContents) {
|
||||
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson());
|
||||
JSONArray contentList = asArray(result.getArtifacts().getContentList());
|
||||
|
||||
@@ -311,9 +318,9 @@ public class MineruMapper {
|
||||
}
|
||||
|
||||
if (contentList != null) {
|
||||
fillFromContentList(result, contentList, imageDataUrls);
|
||||
fillFromContentList(result, contentList, imageDataUrls, imageContents);
|
||||
} else if (middleJson != null) {
|
||||
fillFromMiddleJson(result, middleJson, imageDataUrls);
|
||||
fillFromMiddleJson(result, middleJson, imageDataUrls, imageContents);
|
||||
}
|
||||
|
||||
if ((result.getImages() == null || result.getImages().isEmpty()) && imageDataUrls != null && !imageDataUrls.isEmpty()) {
|
||||
@@ -322,6 +329,7 @@ public class MineruMapper {
|
||||
image.setName(baseName(entry.getKey()));
|
||||
image.setSourcePath(entry.getKey());
|
||||
image.setDataUrl(entry.getValue());
|
||||
image.setContent(matchBinaryContent(entry.getKey(), imageContents));
|
||||
image.setMimeType(detectMimeType(entry.getKey()));
|
||||
result.getImages().add(image);
|
||||
}
|
||||
@@ -349,7 +357,10 @@ public class MineruMapper {
|
||||
result.setPages(pages);
|
||||
}
|
||||
|
||||
private void fillFromContentList(ParseResult result, JSONArray contentList, Map<String, String> imageDataUrls) {
|
||||
private void fillFromContentList(ParseResult result,
|
||||
JSONArray contentList,
|
||||
Map<String, String> imageDataUrls,
|
||||
Map<String, byte[]> imageContents) {
|
||||
for (int index = 0; index < contentList.size(); index++) {
|
||||
JSONObject item = contentList.getJSONObject(index);
|
||||
if (item == null) {
|
||||
@@ -391,12 +402,16 @@ public class MineruMapper {
|
||||
image.setCaptions(extractCaptions(item));
|
||||
image.setFootnotes(extractFootnotes(item));
|
||||
image.setDataUrl(matchDataUrl(item.getString("img_path"), imageDataUrls));
|
||||
image.setContent(matchBinaryContent(item.getString("img_path"), imageContents));
|
||||
result.getImages().add(image);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void fillFromMiddleJson(ParseResult result, JSONObject middleJson, Map<String, String> imageDataUrls) {
|
||||
private void fillFromMiddleJson(ParseResult result,
|
||||
JSONObject middleJson,
|
||||
Map<String, String> imageDataUrls,
|
||||
Map<String, byte[]> imageContents) {
|
||||
JSONArray pages = middleJson.getJSONArray("pdf_info");
|
||||
if (pages == null) {
|
||||
return;
|
||||
@@ -404,8 +419,8 @@ public class MineruMapper {
|
||||
for (int pageIndex = 0; pageIndex < pages.size(); pageIndex++) {
|
||||
JSONObject page = pages.getJSONObject(pageIndex);
|
||||
fillBlocksFromMiddlePage(result, page.getJSONArray("para_blocks"), page.getInteger("page_idx"));
|
||||
fillVisualsFromMiddlePage(result, page.getJSONArray("tables"), page.getInteger("page_idx"), true, imageDataUrls);
|
||||
fillVisualsFromMiddlePage(result, page.getJSONArray("images"), page.getInteger("page_idx"), false, imageDataUrls);
|
||||
fillVisualsFromMiddlePage(result, page.getJSONArray("tables"), page.getInteger("page_idx"), true, imageDataUrls, imageContents);
|
||||
fillVisualsFromMiddlePage(result, page.getJSONArray("images"), page.getInteger("page_idx"), false, imageDataUrls, imageContents);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -429,7 +444,12 @@ public class MineruMapper {
|
||||
}
|
||||
}
|
||||
|
||||
private void fillVisualsFromMiddlePage(ParseResult result, JSONArray blocks, Integer pageIndex, boolean table, Map<String, String> imageDataUrls) {
|
||||
private void fillVisualsFromMiddlePage(ParseResult result,
|
||||
JSONArray blocks,
|
||||
Integer pageIndex,
|
||||
boolean table,
|
||||
Map<String, String> imageDataUrls,
|
||||
Map<String, byte[]> imageContents) {
|
||||
if (blocks == null) {
|
||||
return;
|
||||
}
|
||||
@@ -456,6 +476,7 @@ public class MineruMapper {
|
||||
documentImage.setName(baseName(documentImage.getSourcePath()));
|
||||
documentImage.setMimeType(detectMimeType(documentImage.getSourcePath()));
|
||||
documentImage.setDataUrl(matchDataUrl(documentImage.getSourcePath(), imageDataUrls));
|
||||
documentImage.setContent(matchBinaryContent(documentImage.getSourcePath(), imageContents));
|
||||
result.getImages().add(documentImage);
|
||||
}
|
||||
}
|
||||
@@ -607,6 +628,20 @@ public class MineruMapper {
|
||||
return values;
|
||||
}
|
||||
|
||||
private Map<String, byte[]> toBinaryMap(Map<String, String> dataUrls) {
|
||||
Map<String, byte[]> values = new LinkedHashMap<String, byte[]>();
|
||||
if (dataUrls == null || dataUrls.isEmpty()) {
|
||||
return values;
|
||||
}
|
||||
for (Map.Entry<String, String> entry : dataUrls.entrySet()) {
|
||||
byte[] content = decodeDataUrl(entry.getValue());
|
||||
if (content != null) {
|
||||
values.put(entry.getKey(), content);
|
||||
}
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
private List<Double> toDoubleList(JSONArray jsonArray) {
|
||||
if (jsonArray == null || jsonArray.isEmpty()) {
|
||||
return new ArrayList<Double>();
|
||||
@@ -800,6 +835,25 @@ public class MineruMapper {
|
||||
return null;
|
||||
}
|
||||
|
||||
private byte[] matchBinaryContent(String imagePath, Map<String, byte[]> imageContents) {
|
||||
if (imageContents == null || imageContents.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
if (StringUtil.hasText(imagePath) && imageContents.containsKey(imagePath)) {
|
||||
return imageContents.get(imagePath);
|
||||
}
|
||||
String currentBaseName = baseName(imagePath);
|
||||
if (!StringUtil.hasText(currentBaseName)) {
|
||||
return null;
|
||||
}
|
||||
for (Map.Entry<String, byte[]> entry : imageContents.entrySet()) {
|
||||
if (currentBaseName.equals(baseName(entry.getKey()))) {
|
||||
return entry.getValue();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String baseName(String path) {
|
||||
if (!StringUtil.hasText(path)) {
|
||||
return null;
|
||||
@@ -820,6 +874,21 @@ public class MineruMapper {
|
||||
return "data:" + detectMimeType(path) + ";base64," + Base64.getEncoder().encodeToString(content);
|
||||
}
|
||||
|
||||
private byte[] decodeDataUrl(String dataUrl) {
|
||||
if (!StringUtil.hasText(dataUrl)) {
|
||||
return null;
|
||||
}
|
||||
int commaIndex = dataUrl.indexOf(',');
|
||||
if (commaIndex < 0 || commaIndex == dataUrl.length() - 1) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return Base64.getDecoder().decode(dataUrl.substring(commaIndex + 1));
|
||||
} catch (IllegalArgumentException exception) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private String joinList(List<String> values) {
|
||||
if (values == null || values.isEmpty()) {
|
||||
return null;
|
||||
@@ -1,14 +1,14 @@
|
||||
package com.easyagents.document.pdf.mineru;
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* MinerU PDF 解析配置。
|
||||
* MinerU 文档解析配置。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruProperties {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.pdf.mineru;
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
|
||||
@@ -9,7 +9,7 @@ import java.util.Map;
|
||||
* MinerU 结果载荷。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruResultPayload {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.pdf.mineru;
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -7,7 +7,7 @@ import java.util.List;
|
||||
* MinerU 原始任务状态。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruTaskStatus {
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
package com.easyagents.document.core.support;
|
||||
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskUpdater;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 带统一异步任务能力的文档解析服务基类。
|
||||
* 支持 ppt 和 excel,pdf 和 word 文档使用 mineru 自带异步能力
|
||||
*
|
||||
* @param <R> 请求类型
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public abstract class AbstractAsyncDocumentParseService<R extends ParseRequest> implements DocumentParseService<R> {
|
||||
|
||||
private final DocumentAsyncTaskManager taskManager;
|
||||
|
||||
/**
|
||||
* 创建服务基类。
|
||||
*
|
||||
* @param taskManager 异步任务管理器
|
||||
*/
|
||||
protected AbstractAsyncDocumentParseService(DocumentAsyncTaskManager taskManager) {
|
||||
if (taskManager == null) {
|
||||
throw new IllegalArgumentException("DocumentAsyncTaskManager must not be null");
|
||||
}
|
||||
this.taskManager = taskManager;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
return doParse(normalizeRequest(request), null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus submit(ParseRequest request) {
|
||||
final R normalizedRequest = normalizeRequest(request);
|
||||
return taskManager.submit(
|
||||
normalizedRequest.getBackend(),
|
||||
collectFileNames(normalizedRequest),
|
||||
updater -> doParse(normalizedRequest, updater)
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
return taskManager.queryTask(taskId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
return taskManager.queryResult(taskId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||
return taskManager.queryTaskInfo(taskId);
|
||||
}
|
||||
|
||||
/**
|
||||
* 归一化请求。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return 归一化结果
|
||||
*/
|
||||
protected abstract R normalizeRequest(ParseRequest request);
|
||||
|
||||
/**
|
||||
* 执行解析。
|
||||
*
|
||||
* @param request 归一化请求
|
||||
* @param updater 进度更新器;同步解析时可能为 {@code null}
|
||||
* @return 解析结果
|
||||
*/
|
||||
protected abstract ParseResponse doParse(R request, DocumentAsyncTaskUpdater updater);
|
||||
|
||||
private List<String> collectFileNames(ParseRequest request) {
|
||||
List<String> fileNames = new ArrayList<String>();
|
||||
if (request == null || request.getFiles() == null) {
|
||||
return fileNames;
|
||||
}
|
||||
for (ParseFile file : request.getFiles()) {
|
||||
if (file != null && file.getFileName() != null) {
|
||||
fileNames.add(file.getFileName());
|
||||
}
|
||||
}
|
||||
return fileNames;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.concurrent.Executor;
|
||||
|
||||
/**
|
||||
* 异步任务管理器测试。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class DocumentAsyncTaskManagerTest {
|
||||
|
||||
@Test
|
||||
public void shouldTrackTaskLifecycleAndResult() {
|
||||
Executor directExecutor = new Executor() {
|
||||
@Override
|
||||
public void execute(Runnable command) {
|
||||
command.run();
|
||||
}
|
||||
};
|
||||
DocumentAsyncTaskManager manager = new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor);
|
||||
|
||||
ParseTaskStatus status = manager.submit("mineru", Collections.singletonList("demo.pptx"), updater -> {
|
||||
updater.update("ocr", 50, 1, 2, "处理中");
|
||||
ParseResponse response = new ParseResponse();
|
||||
ParseResult result = new ParseResult();
|
||||
result.setFileName("demo.pptx");
|
||||
result.setMarkdown("# Slide 1");
|
||||
response.setResults(Collections.singletonList(result));
|
||||
return response;
|
||||
});
|
||||
|
||||
ParseTaskInfo taskInfo = manager.queryTaskInfo(status.getTaskId());
|
||||
|
||||
Assert.assertEquals("completed", taskInfo.getStatus());
|
||||
Assert.assertEquals(Integer.valueOf(100), taskInfo.getProgressPercent());
|
||||
Assert.assertEquals("completed", taskInfo.getCurrentStage());
|
||||
Assert.assertNotNull(taskInfo.getResult());
|
||||
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,210 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import okhttp3.Request;
|
||||
import okio.Buffer;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
/**
|
||||
* MinerU 通用文档解析服务测试。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruDocumentParseServiceTest {
|
||||
|
||||
@Test
|
||||
public void shouldForceAsyncResultArtifacts() {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper);
|
||||
|
||||
ParseRequest request = buildRequest();
|
||||
request.setReturnMarkdown(false);
|
||||
request.setReturnMiddleJson(false);
|
||||
request.setReturnContentList(false);
|
||||
request.setReturnModelOutput(false);
|
||||
request.setReturnImages(false);
|
||||
|
||||
ParseTaskStatus status = service.submit(request);
|
||||
|
||||
Assert.assertEquals("task-1", status.getTaskId());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnMarkdown());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnMiddleJson());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnContentList());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnModelOutput());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnImages());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldUseTaskMetadataWhenQueryingAsyncZipResult() {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper);
|
||||
|
||||
ParseResponse response = service.queryResult("task-1");
|
||||
|
||||
Assert.assertEquals("vlm-http-client", response.getBackend());
|
||||
Assert.assertEquals("3.0.9", response.getVersion());
|
||||
Assert.assertEquals(1, response.getResults().size());
|
||||
Assert.assertEquals("demo", response.getResults().get(0).getFileName());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldReturnCompletedResultInTaskInfo() {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper);
|
||||
|
||||
ParseTaskInfo taskInfo = service.queryTaskInfo("task-1");
|
||||
|
||||
Assert.assertEquals("completed", taskInfo.getStatus());
|
||||
Assert.assertNotNull(taskInfo.getResult());
|
||||
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
|
||||
Assert.assertEquals(1, client.queryResultZipCount);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldSendRepeatedLangListFields() {
|
||||
InspectingMultipartClient client = new InspectingMultipartClient(defaultProperties());
|
||||
ParseRequest request = buildRequest();
|
||||
request.setLanguages(java.util.Arrays.asList("zh", "en"));
|
||||
|
||||
client.parse(request);
|
||||
|
||||
Assert.assertEquals(2, countOccurrences(client.lastMultipartBody, "name=\"lang_list\""));
|
||||
Assert.assertTrue(client.lastMultipartBody.contains("\r\nzh\r\n"));
|
||||
Assert.assertTrue(client.lastMultipartBody.contains("\r\nen\r\n"));
|
||||
}
|
||||
|
||||
private ParseRequest buildRequest() {
|
||||
ParseRequest request = new ParseRequest();
|
||||
request.addFile(ParseFile.of("demo.pptx", "ppt".getBytes(StandardCharsets.UTF_8)));
|
||||
return request;
|
||||
}
|
||||
|
||||
private MineruProperties defaultProperties() {
|
||||
MineruProperties properties = new MineruProperties();
|
||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||
properties.setResultTimeoutMs(50);
|
||||
properties.setPollIntervalMs(1);
|
||||
return properties;
|
||||
}
|
||||
|
||||
private int countOccurrences(String source, String token) {
|
||||
int count = 0;
|
||||
int index = 0;
|
||||
while (source != null && token != null && !token.isEmpty() && (index = source.indexOf(token, index)) >= 0) {
|
||||
count++;
|
||||
index += token.length();
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private static class RecordingClient extends MineruClient {
|
||||
|
||||
private ParseRequest lastSubmitRequest;
|
||||
private int queryResultZipCount;
|
||||
|
||||
private RecordingClient(MineruProperties properties) {
|
||||
super(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
@Override
|
||||
public MineruTaskStatus submit(ParseRequest request) {
|
||||
this.lastSubmitRequest = request;
|
||||
MineruTaskStatus taskStatus = new MineruTaskStatus();
|
||||
taskStatus.setTaskId("task-1");
|
||||
taskStatus.setStatus("pending");
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MineruTaskStatus queryTask(String taskId) {
|
||||
MineruTaskStatus taskStatus = new MineruTaskStatus();
|
||||
taskStatus.setTaskId(taskId);
|
||||
taskStatus.setStatus("completed");
|
||||
taskStatus.setBackend("vlm-http-client");
|
||||
taskStatus.setVersion("3.0.9");
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] queryResultZip(String taskId) {
|
||||
queryResultZipCount++;
|
||||
try {
|
||||
return buildZipResult();
|
||||
} catch (IOException exception) {
|
||||
throw new IllegalStateException("Failed to build test ZIP", exception);
|
||||
}
|
||||
}
|
||||
|
||||
private static byte[] buildZipResult() throws IOException {
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
try (ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream)) {
|
||||
addEntry(zipOutputStream, "demo/vlm/demo.md", "# title");
|
||||
addEntry(zipOutputStream, "demo/vlm/demo_middle.json", middleJson().toJSONString());
|
||||
addEntry(zipOutputStream, "demo/vlm/demo_content_list.json", contentList().toJSONString());
|
||||
}
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private static void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException {
|
||||
zipOutputStream.putNextEntry(new ZipEntry(name));
|
||||
zipOutputStream.write(content.getBytes(StandardCharsets.UTF_8));
|
||||
zipOutputStream.closeEntry();
|
||||
}
|
||||
|
||||
private static JSONObject middleJson() {
|
||||
JSONObject middleJson = new JSONObject();
|
||||
middleJson.put("_backend", "vlm");
|
||||
middleJson.put("_version_name", "3.0.9");
|
||||
middleJson.put("pdf_info", new com.alibaba.fastjson2.JSONArray());
|
||||
return middleJson;
|
||||
}
|
||||
|
||||
private static com.alibaba.fastjson2.JSONArray contentList() {
|
||||
com.alibaba.fastjson2.JSONArray contentList = new com.alibaba.fastjson2.JSONArray();
|
||||
JSONObject text = new JSONObject();
|
||||
text.put("type", "text");
|
||||
text.put("text", "title");
|
||||
text.put("page_idx", 0);
|
||||
text.put("bbox", new com.alibaba.fastjson2.JSONArray());
|
||||
contentList.add(text);
|
||||
return contentList;
|
||||
}
|
||||
}
|
||||
|
||||
private static class InspectingMultipartClient extends MineruClient {
|
||||
|
||||
private String lastMultipartBody;
|
||||
|
||||
private InspectingMultipartClient(MineruProperties properties) {
|
||||
super(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected JSONObject executeJsonRequest(String path, Request request) {
|
||||
try {
|
||||
Buffer buffer = new Buffer();
|
||||
request.body().writeTo(buffer);
|
||||
this.lastMultipartBody = buffer.readUtf8();
|
||||
} catch (IOException exception) {
|
||||
throw new IllegalStateException("Failed to inspect multipart body", exception);
|
||||
}
|
||||
return new JSONObject();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
package com.easyagents.document.pdf;
|
||||
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.entity.PdfParseRequest;
|
||||
|
||||
/**
|
||||
* PDF 文档解析服务。
|
||||
@@ -8,5 +9,5 @@ import com.easyagents.document.core.DocumentParseService;
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
*/
|
||||
public interface PdfDocumentParseService extends DocumentParseService {
|
||||
public interface PdfDocumentParseService extends DocumentParseService<PdfParseRequest> {
|
||||
}
|
||||
|
||||
@@ -1,31 +1,23 @@
|
||||
package com.easyagents.document.pdf.mineru;
|
||||
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.model.ParseRequest;
|
||||
import com.easyagents.document.core.model.ParseResponse;
|
||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
||||
import com.easyagents.document.core.mineru.MineruClient;
|
||||
import com.easyagents.document.core.mineru.MineruDocumentParseService;
|
||||
import com.easyagents.document.core.mineru.MineruMapper;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.PdfParseRequest;
|
||||
import com.easyagents.document.pdf.PdfDocumentProvider;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
|
||||
/**
|
||||
* 基于 MinerU API 的 PDF 解析服务。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
public class MineruPdfDocumentParseService extends MineruDocumentParseService<PdfParseRequest> implements PdfDocumentProvider {
|
||||
|
||||
public static final String PROVIDER_NAME = "mineru";
|
||||
private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class);
|
||||
|
||||
private final MineruProperties properties;
|
||||
private final MineruPdfClient client;
|
||||
private final MineruMapper mapper;
|
||||
|
||||
/**
|
||||
* 创建默认服务实例。
|
||||
@@ -33,7 +25,7 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
* @param properties MinerU 配置
|
||||
*/
|
||||
public MineruPdfDocumentParseService(MineruProperties properties) {
|
||||
this(properties, new MineruMapper(properties));
|
||||
super(properties);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -43,7 +35,7 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
* @param mapper 结果映射器
|
||||
*/
|
||||
public MineruPdfDocumentParseService(MineruProperties properties, MineruMapper mapper) {
|
||||
this(properties, new MineruPdfClient(properties, mapper), mapper);
|
||||
super(properties, mapper);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -53,10 +45,8 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
* @param client HTTP 客户端
|
||||
* @param mapper 结果映射器
|
||||
*/
|
||||
public MineruPdfDocumentParseService(MineruProperties properties, MineruPdfClient client, MineruMapper mapper) {
|
||||
this.properties = properties;
|
||||
this.client = client;
|
||||
this.mapper = mapper;
|
||||
public MineruPdfDocumentParseService(MineruProperties properties, MineruClient client, MineruMapper mapper) {
|
||||
super(properties, client, mapper);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -65,145 +55,21 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||
LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend(),
|
||||
normalizedRequest.getParseMethod());
|
||||
ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
|
||||
LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus submit(ParseRequest request) {
|
||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||
// 异步结果固定走全量 ZIP,调用方无需传入裁剪参数。
|
||||
normalizedRequest.setReturnMarkdown(true);
|
||||
normalizedRequest.setReturnMiddleJson(true);
|
||||
normalizedRequest.setReturnContentList(true);
|
||||
normalizedRequest.setReturnModelOutput(true);
|
||||
normalizedRequest.setReturnImages(true);
|
||||
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend(),
|
||||
normalizedRequest.getParseMethod());
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
||||
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskStatus == null ? null : taskStatus.getTaskId(),
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
|
||||
LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);
|
||||
MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
|
||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
||||
ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(mapper.toParseTaskStatus(taskStatus));
|
||||
if ("completed".equalsIgnoreCase(taskStatus.getStatus())) {
|
||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
taskInfo.setResult(response);
|
||||
}
|
||||
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskInfo == null ? null : taskInfo.getStatus(),
|
||||
taskInfo != null && taskInfo.getResult() != null);
|
||||
return taskInfo;
|
||||
}
|
||||
|
||||
private ParseRequest normalizeRequest(ParseRequest request) {
|
||||
if (request == null) {
|
||||
throw new IllegalArgumentException("ParseRequest must not be null");
|
||||
}
|
||||
if (request.getFiles() == null || request.getFiles().isEmpty()) {
|
||||
throw new IllegalArgumentException("ParseRequest files must not be empty");
|
||||
}
|
||||
ParseRequest normalizedRequest = new ParseRequest();
|
||||
normalizedRequest.setFiles(new ArrayList<>(request.getFiles()));
|
||||
normalizedRequest.setBackend(StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
|
||||
normalizedRequest.setParseMethod(StringUtil.hasText(request.getParseMethod()) ? request.getParseMethod() : properties.getDefaultParseMethod());
|
||||
normalizedRequest.setLanguages(
|
||||
request.getLanguages() == null || request.getLanguages().isEmpty()
|
||||
? new ArrayList<String>(properties.getDefaultLangList())
|
||||
: new ArrayList<String>(request.getLanguages())
|
||||
protected ParseRequest normalizeRequest(ParseRequest request) {
|
||||
PdfParseRequest normalizedRequest = PdfParseRequest.from(request);
|
||||
ParseRequest commonRequest = super.normalizeRequest(normalizedRequest);
|
||||
commonRequest.copyCommonFieldsTo(normalizedRequest);
|
||||
normalizedRequest.setParseMethod(
|
||||
StringUtil.hasText(normalizedRequest.getParseMethod()) ? normalizedRequest.getParseMethod() : getProperties().getDefaultParseMethod()
|
||||
);
|
||||
normalizedRequest.setFormulaEnabled(request.getFormulaEnabled() == null ? properties.getDefaultFormulaEnable() : request.getFormulaEnabled());
|
||||
normalizedRequest.setTableEnabled(request.getTableEnabled() == null ? properties.getDefaultTableEnable() : request.getTableEnabled());
|
||||
normalizedRequest.setStartPageIndex(request.getStartPageIndex() == null ? 0 : request.getStartPageIndex());
|
||||
normalizedRequest.setEndPageIndex(request.getEndPageIndex() == null ? 99999 : request.getEndPageIndex());
|
||||
normalizedRequest.setReturnMarkdown(request.getReturnMarkdown());
|
||||
normalizedRequest.setReturnMiddleJson(request.getReturnMiddleJson());
|
||||
normalizedRequest.setReturnContentList(request.getReturnContentList());
|
||||
normalizedRequest.setReturnModelOutput(request.getReturnModelOutput());
|
||||
normalizedRequest.setReturnImages(request.getReturnImages());
|
||||
normalizedRequest.setFormulaEnabled(
|
||||
normalizedRequest.getFormulaEnabled() == null ? getProperties().getDefaultFormulaEnable() : normalizedRequest.getFormulaEnabled()
|
||||
);
|
||||
normalizedRequest.setTableEnabled(
|
||||
normalizedRequest.getTableEnabled() == null ? getProperties().getDefaultTableEnable() : normalizedRequest.getTableEnabled()
|
||||
);
|
||||
normalizedRequest.setStartPageIndex(normalizedRequest.getStartPageIndex() == null ? 0 : normalizedRequest.getStartPageIndex());
|
||||
normalizedRequest.setEndPageIndex(normalizedRequest.getEndPageIndex() == null ? 99999 : normalizedRequest.getEndPageIndex());
|
||||
return normalizedRequest;
|
||||
}
|
||||
|
||||
private void validateTaskId(String taskId) {
|
||||
if (!StringUtil.hasText(taskId)) {
|
||||
throw new IllegalArgumentException("taskId must not be empty");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 轮询任务状态直到完成或失败。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 已完成的任务状态
|
||||
*/
|
||||
private MineruTaskStatus waitForTaskCompleted(String taskId) {
|
||||
long deadline = System.currentTimeMillis() + properties.getResultTimeoutMs();
|
||||
while (true) {
|
||||
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
||||
if ("completed".equals(taskStatus.getStatus())) {
|
||||
return taskStatus;
|
||||
}
|
||||
if ("failed".equals(taskStatus.getStatus())) {
|
||||
throw new DocumentParseException("MinerU task failed: " + taskStatus.getError());
|
||||
}
|
||||
if (System.currentTimeMillis() >= deadline) {
|
||||
throw new DocumentParseException("MinerU task result timeout: " + taskId);
|
||||
}
|
||||
try {
|
||||
Thread.sleep(properties.getPollIntervalMs());
|
||||
} catch (InterruptedException exception) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new DocumentParseException("Interrupted while waiting for MinerU task: " + taskId, exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,10 +2,13 @@ package com.easyagents.document.pdf.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSONArray;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.document.core.mineru.MineruMapper;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.core.mineru.MineruResultPayload;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.model.ParseRequest;
|
||||
import com.easyagents.document.core.model.ParseResponse;
|
||||
import com.easyagents.document.core.model.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
@@ -41,6 +44,7 @@ public class MineruMapperTest {
|
||||
Assert.assertFalse(result.getBlocks().isEmpty());
|
||||
Assert.assertEquals(1, result.getTables().size());
|
||||
Assert.assertEquals(2, result.getImages().size());
|
||||
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||
Assert.assertNotNull(result.getArtifacts().getMiddleJson());
|
||||
Assert.assertNotNull(result.getArtifacts().getContentList());
|
||||
}
|
||||
@@ -56,6 +60,7 @@ public class MineruMapperTest {
|
||||
Assert.assertEquals("# title", result.getPlainText());
|
||||
Assert.assertEquals(1, result.getTables().size());
|
||||
Assert.assertEquals(2, result.getImages().size());
|
||||
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||
Assert.assertNotNull(result.getArtifacts().getExtraJsonArtifacts().get("contentListV2"));
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,16 @@
|
||||
package com.easyagents.document.pdf.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.document.core.model.ParseFile;
|
||||
import com.easyagents.document.core.model.ParseRequest;
|
||||
import com.easyagents.document.core.model.ParseResponse;
|
||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
||||
import com.easyagents.document.core.mineru.MineruClient;
|
||||
import com.easyagents.document.core.mineru.MineruMapper;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.core.mineru.MineruResultPayload;
|
||||
import com.easyagents.document.core.mineru.MineruTaskStatus;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import okhttp3.Request;
|
||||
import okio.Buffer;
|
||||
import org.junit.Assert;
|
||||
@@ -147,7 +152,7 @@ public class MineruPdfDocumentParseServiceTest {
|
||||
return count;
|
||||
}
|
||||
|
||||
private static class RecordingClient extends MineruPdfClient {
|
||||
private static class RecordingClient extends MineruClient {
|
||||
|
||||
private ParseRequest lastParseRequest;
|
||||
private ParseRequest lastSubmitRequest;
|
||||
@@ -248,7 +253,7 @@ public class MineruPdfDocumentParseServiceTest {
|
||||
}
|
||||
}
|
||||
|
||||
private static class InspectingMultipartClient extends MineruPdfClient {
|
||||
private static class InspectingMultipartClient extends MineruClient {
|
||||
|
||||
private String lastMultipartBody;
|
||||
|
||||
|
||||
44
easy-agents-document/easy-agents-document-pptx/pom.xml
Normal file
44
easy-agents-document/easy-agents-document-pptx/pom.xml
Normal file
@@ -0,0 +1,44 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document</artifactId>
|
||||
<version>${revision}</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>easy-agents-document-pptx</artifactId>
|
||||
<name>easy-agents-document-pptx</name>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.alibaba.fastjson2</groupId>
|
||||
<artifactId>fastjson2</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.easyagents.document.pptx;
|
||||
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.entity.PptxParseRequest;
|
||||
|
||||
/**
 * Document parsing service for PPTX files.
 * <p>
 * Declares no methods of its own; it binds {@link DocumentParseService} to
 * {@link PptxParseRequest} as the request type.
 *
 * @author Codex
 * @since 2026-04-16
 */
|
||||
public interface PptxDocumentParseService extends DocumentParseService<PptxParseRequest> {
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package com.easyagents.document.pptx;
|
||||
|
||||
/**
 * Provider SPI for PPTX parsing: a {@link PptxDocumentParseService} that also
 * reports a provider identifier.
 *
 * @author Codex
 * @since 2026-04-16
 */
|
||||
public interface PptxDocumentProvider extends PptxDocumentParseService {
|
||||
|
||||
/**
 * Returns the identifier of this provider.
 *
 * @return provider name
 */
|
||||
String getProvider();
|
||||
}
|
||||
@@ -0,0 +1,408 @@
|
||||
package com.easyagents.document.pptx.mineru;
|
||||
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskUpdater;
|
||||
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.mineru.MineruClient;
|
||||
import com.easyagents.document.core.mineru.MineruMapper;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.core.entity.DocumentBlock;
|
||||
import com.easyagents.document.core.entity.DocumentImage;
|
||||
import com.easyagents.document.core.entity.DocumentPage;
|
||||
import com.easyagents.document.core.entity.DocumentTable;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.PptxParseRequest;
|
||||
import com.easyagents.document.core.support.AbstractAsyncDocumentParseService;
|
||||
import com.easyagents.document.pptx.PptxDocumentProvider;
|
||||
import com.easyagents.document.pptx.model.PptxParseArtifact;
|
||||
import com.easyagents.document.pptx.model.PptxSlideArtifact;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.Color;
|
||||
import java.awt.Dimension;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.RenderingHints;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
/**
|
||||
* 基于 MinerU 的 PPTX 文档解析服务。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruPptxDocumentParseService extends AbstractAsyncDocumentParseService<PptxParseRequest> implements PptxDocumentProvider {
|
||||
|
||||
public static final String PROVIDER_NAME = "mineru";
|
||||
|
||||
private final MineruProperties properties;
|
||||
private final MineruClient client;
|
||||
private final MineruMapper mapper;
|
||||
|
||||
/**
 * Creates a service instance, building a {@link MineruMapper} from the given configuration.
 *
 * @param properties MinerU configuration
 */
|
||||
public MineruPptxDocumentParseService(MineruProperties properties) {
|
||||
this(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
/**
 * Creates a service instance with a new {@link MineruClient} and the task manager
 * returned by {@code defaultTaskManager()}.
 *
 * @param properties MinerU configuration
 * @param mapper     MinerU mapper
 */
|
||||
public MineruPptxDocumentParseService(MineruProperties properties, MineruMapper mapper) {
|
||||
this(properties, new MineruClient(properties, mapper), mapper, defaultTaskManager());
|
||||
}
|
||||
|
||||
/**
 * Creates a service instance with a caller-supplied task manager, building a
 * {@link MineruMapper} from the given configuration.
 *
 * @param properties  MinerU configuration
 * @param taskManager asynchronous task manager
 */
|
||||
public MineruPptxDocumentParseService(MineruProperties properties, DocumentAsyncTaskManager taskManager) {
|
||||
this(properties, new MineruMapper(properties), taskManager);
|
||||
}
|
||||
|
||||
/**
 * Creates a service instance with a caller-supplied mapper and task manager, building
 * a new {@link MineruClient} from the configuration and mapper.
 *
 * @param properties  MinerU configuration
 * @param mapper      MinerU mapper
 * @param taskManager asynchronous task manager
 */
|
||||
public MineruPptxDocumentParseService(MineruProperties properties,
|
||||
MineruMapper mapper,
|
||||
DocumentAsyncTaskManager taskManager) {
|
||||
this(properties, new MineruClient(properties, mapper), mapper, taskManager);
|
||||
}
|
||||
|
||||
/**
 * Creates a service instance from fully specified collaborators. The task manager is
 * handed to the superclass constructor; the other arguments are stored as-is.
 *
 * @param properties  MinerU configuration
 * @param client      MinerU client
 * @param mapper      MinerU mapper
 * @param taskManager asynchronous task manager
 */
|
||||
public MineruPptxDocumentParseService(MineruProperties properties,
|
||||
MineruClient client,
|
||||
MineruMapper mapper,
|
||||
DocumentAsyncTaskManager taskManager) {
|
||||
super(taskManager);
|
||||
this.properties = properties;
|
||||
this.client = client;
|
||||
this.mapper = mapper;
|
||||
}
|
||||
|
||||
/**
 * Returns the identifier of this provider.
 *
 * @return {@link #PROVIDER_NAME}, i.e. {@code "mineru"}
 */
@Override
|
||||
public String getProvider() {
|
||||
return PROVIDER_NAME;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected PptxParseRequest normalizeRequest(ParseRequest request) {
|
||||
PptxParseRequest normalized = PptxParseRequest.from(request);
|
||||
if (normalized.getFiles() == null || normalized.getFiles().isEmpty()) {
|
||||
throw new IllegalArgumentException("PptxParseRequest files must not be empty");
|
||||
}
|
||||
normalized.setBackend(StringUtil.hasText(normalized.getBackend()) ? normalized.getBackend() : properties.getDefaultBackend());
|
||||
if (normalized.getLanguages() == null || normalized.getLanguages().isEmpty()) {
|
||||
normalized.setLanguages(new ArrayList<String>(properties.getDefaultLangList()));
|
||||
}
|
||||
normalized.setReturnMarkdown(normalized.getReturnMarkdown() == null ? Boolean.TRUE : normalized.getReturnMarkdown());
|
||||
normalized.setReturnMiddleJson(normalized.getReturnMiddleJson() == null ? Boolean.TRUE : normalized.getReturnMiddleJson());
|
||||
normalized.setReturnContentList(normalized.getReturnContentList() == null ? Boolean.TRUE : normalized.getReturnContentList());
|
||||
normalized.setReturnModelOutput(normalized.getReturnModelOutput() == null ? Boolean.FALSE : normalized.getReturnModelOutput());
|
||||
normalized.setReturnImages(normalized.getReturnImages() == null ? Boolean.TRUE : normalized.getReturnImages());
|
||||
normalized.setRenderScale(normalized.getRenderScale() == null || normalized.getRenderScale() <= 0 ? 2.0d : normalized.getRenderScale());
|
||||
normalized.setImageFormat(normalizeImageFormat(normalized.getImageFormat()));
|
||||
normalized.setIncludeSlideImageReference(
|
||||
normalized.getIncludeSlideImageReference() == null ? Boolean.TRUE : normalized.getIncludeSlideImageReference()
|
||||
);
|
||||
return normalized;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ParseResponse doParse(PptxParseRequest request, DocumentAsyncTaskUpdater updater) {
|
||||
ParseResponse response = new ParseResponse();
|
||||
List<ParseResult> results = new ArrayList<ParseResult>();
|
||||
int totalSlides = countSlides(request);
|
||||
int processedSlides = 0;
|
||||
String backend = null;
|
||||
String version = null;
|
||||
|
||||
for (ParseFile file : request.getFiles()) {
|
||||
ParseResult result = parseSinglePptx(file, request, updater, processedSlides, totalSlides);
|
||||
processedSlides += Integer.parseInt(String.valueOf(result.getMetadata().get("slideCount")));
|
||||
if (backend == null) {
|
||||
backend = (String) result.getMetadata().get("ocrBackend");
|
||||
}
|
||||
if (version == null) {
|
||||
version = (String) result.getMetadata().get("ocrVersion");
|
||||
}
|
||||
result.getMetadata().remove("slideCount");
|
||||
result.getMetadata().remove("ocrBackend");
|
||||
result.getMetadata().remove("ocrVersion");
|
||||
results.add(result);
|
||||
}
|
||||
response.setBackend(StringUtil.hasText(backend) ? backend : request.getBackend());
|
||||
response.setVersion(version);
|
||||
response.setResults(results);
|
||||
return response;
|
||||
}
|
||||
|
||||
private ParseResult parseSinglePptx(ParseFile file,
|
||||
PptxParseRequest request,
|
||||
DocumentAsyncTaskUpdater updater,
|
||||
int processedSlidesBefore,
|
||||
int totalSlides) {
|
||||
ParseResult aggregate = new ParseResult();
|
||||
aggregate.setFileName(file.getFileName());
|
||||
StringBuilder markdownBuilder = new StringBuilder();
|
||||
PptxParseArtifact artifact = new PptxParseArtifact();
|
||||
String backend = null;
|
||||
String version = null;
|
||||
int slideCount = 0;
|
||||
|
||||
try (XMLSlideShow slideShow = new XMLSlideShow(new ByteArrayInputStream(file.getContent()))) {
|
||||
List<XSLFSlide> slides = slideShow.getSlides();
|
||||
Dimension pageSize = slideShow.getPageSize();
|
||||
int startSlide = request.getStartSlideIndex() == null ? 0 : Math.max(request.getStartSlideIndex(), 0);
|
||||
int endSlide = request.getEndSlideIndex() == null
|
||||
? slides.size() - 1
|
||||
: Math.min(request.getEndSlideIndex(), slides.size() - 1);
|
||||
if (endSlide < startSlide) {
|
||||
endSlide = startSlide - 1;
|
||||
}
|
||||
|
||||
for (int slideIndex = startSlide; slideIndex <= endSlide; slideIndex++) {
|
||||
XSLFSlide slide = slides.get(slideIndex);
|
||||
slideCount++;
|
||||
updateProgress(updater, "extracting", processedSlidesBefore + slideCount - 1, totalSlides,
|
||||
"正在渲染第 " + (slideIndex + 1) + " 页幻灯片");
|
||||
|
||||
byte[] imageBytes = renderSlide(slide, pageSize, request.getRenderScale(), request.getImageFormat());
|
||||
String imagePath = buildImagePath(slideIndex, request.getImageFormat());
|
||||
String imageName = buildImageName(slideIndex);
|
||||
|
||||
updateProgress(updater, "ocr", processedSlidesBefore + slideCount - 1, totalSlides,
|
||||
"正在识别第 " + (slideIndex + 1) + " 页幻灯片");
|
||||
ParseResult ocrResult = parseSlideImage(slideIndex, imageBytes, request, imagePath);
|
||||
|
||||
if (!StringUtil.hasText(backend)) {
|
||||
backend = (String) ocrResult.getMetadata().get("middleBackend");
|
||||
}
|
||||
if (!StringUtil.hasText(version)) {
|
||||
version = (String) ocrResult.getMetadata().get("middleVersion");
|
||||
}
|
||||
|
||||
appendSlideMarkdown(markdownBuilder, slideIndex, imageName, imagePath, request, ocrResult.getMarkdown());
|
||||
aggregate.getImages().add(buildSlideImage(slideIndex, imageName, imagePath, request.getImageFormat(), imageBytes));
|
||||
aggregate.getPages().add(buildPage(slideIndex, pageSize, request.getRenderScale()));
|
||||
mergeOcrResult(aggregate, slideIndex, ocrResult);
|
||||
artifact.getSlides().add(buildSlideArtifact(slideIndex, slide, imageName, imagePath, ocrResult));
|
||||
}
|
||||
} catch (IOException exception) {
|
||||
throw new IllegalStateException("Failed to parse PPTX file: " + file.getFileName(), exception);
|
||||
}
|
||||
|
||||
updateProgress(updater, "assembling", processedSlidesBefore + slideCount, totalSlides, "正在汇总 PPTX 解析结果");
|
||||
aggregate.setMarkdown(markdownBuilder.toString().trim());
|
||||
aggregate.setPlainText(aggregate.getMarkdown());
|
||||
aggregate.getArtifacts().getExtraJsonArtifacts().put("pptx", artifact);
|
||||
aggregate.getMetadata().put("slideCount", slideCount);
|
||||
aggregate.getMetadata().put("ocrBackend", backend);
|
||||
aggregate.getMetadata().put("ocrVersion", version);
|
||||
return aggregate;
|
||||
}
|
||||
|
||||
private ParseResult parseSlideImage(int slideIndex, byte[] imageBytes, PptxParseRequest request, String imagePath) {
|
||||
ParseRequest imageRequest = new ParseRequest();
|
||||
imageRequest.addFile(ParseFile.of("slide-" + (slideIndex + 1) + "." + request.getImageFormat(), imageBytes, "image/" + request.getImageFormat()));
|
||||
imageRequest.setBackend(request.getBackend());
|
||||
imageRequest.setLanguages(request.getLanguages());
|
||||
imageRequest.setReturnMarkdown(true);
|
||||
imageRequest.setReturnMiddleJson(true);
|
||||
imageRequest.setReturnContentList(true);
|
||||
imageRequest.setReturnModelOutput(false);
|
||||
imageRequest.setReturnImages(false);
|
||||
ParseResponse response = mapper.toParseResponse(client.parse(imageRequest));
|
||||
ParseResult result = response.getResults().isEmpty() ? new ParseResult() : response.getResults().get(0);
|
||||
if (!StringUtil.hasText(result.getMarkdown())) {
|
||||
result.setMarkdown(result.getPlainText());
|
||||
}
|
||||
result.getMetadata().put("slideImagePath", imagePath);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Appends the markdown section of one slide to the document-level builder.
 *
 * <p>The section is a {@code # Slide N} heading, an optional image reference and the
 * trimmed OCR markdown. Sections are separated by a blank line.</p>
 *
 * @param markdownBuilder builder receiving the section
 * @param slideIndex      zero-based slide index (rendered one-based in the heading)
 * @param imageName       image alt text
 * @param imagePath       image link target
 * @param request         request whose {@code includeSlideImageReference} flag controls the image line
 * @param ocrMarkdown     OCR markdown of the slide; skipped when blank
 */
private void appendSlideMarkdown(StringBuilder markdownBuilder,
                                 int slideIndex,
                                 String imageName,
                                 String imagePath,
                                 PptxParseRequest request,
                                 String ocrMarkdown) {
    boolean firstSection = markdownBuilder.length() == 0;
    if (!firstSection) {
        markdownBuilder.append("\n\n");
    }

    markdownBuilder.append("# Slide ");
    markdownBuilder.append(slideIndex + 1);
    markdownBuilder.append("\n\n");

    Boolean withImageReference = request.getIncludeSlideImageReference();
    if (withImageReference != null && withImageReference.booleanValue()) {
        markdownBuilder.append(";
        markdownBuilder.append(")\n\n");
    }

    if (!StringUtil.hasText(ocrMarkdown)) {
        return;
    }
    markdownBuilder.append(ocrMarkdown.trim());
}
|
||||
|
||||
private DocumentImage buildSlideImage(int slideIndex, String imageName, String imagePath, String imageFormat, byte[] imageBytes) {
|
||||
DocumentImage image = new DocumentImage();
|
||||
image.setPageIndex(slideIndex);
|
||||
image.setName(imageName);
|
||||
image.setSourcePath(imagePath);
|
||||
image.setMimeType("image/" + imageFormat);
|
||||
image.setContent(imageBytes);
|
||||
return image;
|
||||
}
|
||||
|
||||
private DocumentPage buildPage(int slideIndex, Dimension pageSize, Double renderScale) {
|
||||
DocumentPage page = new DocumentPage();
|
||||
page.setPageIndex(slideIndex);
|
||||
page.setWidth(pageSize.getWidth() * renderScale);
|
||||
page.setHeight(pageSize.getHeight() * renderScale);
|
||||
return page;
|
||||
}
|
||||
|
||||
private void mergeOcrResult(ParseResult aggregate, int slideIndex, ParseResult ocrResult) {
|
||||
for (DocumentBlock block : ocrResult.getBlocks()) {
|
||||
block.setPageIndex(slideIndex);
|
||||
aggregate.getBlocks().add(block);
|
||||
}
|
||||
for (DocumentTable table : ocrResult.getTables()) {
|
||||
table.setPageIndex(slideIndex);
|
||||
aggregate.getTables().add(table);
|
||||
}
|
||||
for (String warning : ocrResult.getWarnings()) {
|
||||
aggregate.getWarnings().add("Slide " + (slideIndex + 1) + ": " + warning);
|
||||
}
|
||||
}
|
||||
|
||||
private PptxSlideArtifact buildSlideArtifact(int slideIndex,
|
||||
XSLFSlide slide,
|
||||
String imageName,
|
||||
String imagePath,
|
||||
ParseResult ocrResult) {
|
||||
PptxSlideArtifact artifact = new PptxSlideArtifact();
|
||||
artifact.setSlideIndex(slideIndex);
|
||||
artifact.setTitle(slide.getTitle());
|
||||
artifact.setImageName(imageName);
|
||||
artifact.setImagePath(imagePath);
|
||||
artifact.setOcrMarkdown(ocrResult.getMarkdown());
|
||||
artifact.setMiddleJson(ocrResult.getArtifacts().getMiddleJson());
|
||||
artifact.setContentList(ocrResult.getArtifacts().getContentList());
|
||||
artifact.setWarnings(new ArrayList<String>(ocrResult.getWarnings()));
|
||||
return artifact;
|
||||
}
|
||||
|
||||
private byte[] renderSlide(XSLFSlide slide, Dimension pageSize, Double renderScale, String imageFormat) throws IOException {
|
||||
double scale = renderScale == null ? 2.0d : renderScale;
|
||||
int width = Math.max(1, (int) Math.round(pageSize.getWidth() * scale));
|
||||
int height = Math.max(1, (int) Math.round(pageSize.getHeight() * scale));
|
||||
BufferedImage image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
|
||||
Graphics2D graphics = image.createGraphics();
|
||||
try {
|
||||
graphics.setColor(Color.WHITE);
|
||||
graphics.fillRect(0, 0, width, height);
|
||||
graphics.scale(scale, scale);
|
||||
graphics.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
|
||||
graphics.setRenderingHint(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_QUALITY);
|
||||
graphics.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC);
|
||||
slide.draw(graphics);
|
||||
} finally {
|
||||
graphics.dispose();
|
||||
}
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
ImageIO.write(image, imageFormat, outputStream);
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private int countSlides(PptxParseRequest request) {
|
||||
int totalSlides = 0;
|
||||
for (ParseFile file : request.getFiles()) {
|
||||
try (XMLSlideShow slideShow = new XMLSlideShow(new ByteArrayInputStream(file.getContent()))) {
|
||||
int slideSize = slideShow.getSlides().size();
|
||||
int startSlide = request.getStartSlideIndex() == null ? 0 : Math.max(request.getStartSlideIndex(), 0);
|
||||
int endSlide = request.getEndSlideIndex() == null
|
||||
? slideSize - 1
|
||||
: Math.min(request.getEndSlideIndex(), slideSize - 1);
|
||||
if (endSlide >= startSlide) {
|
||||
totalSlides += endSlide - startSlide + 1;
|
||||
}
|
||||
} catch (IOException exception) {
|
||||
throw new IllegalStateException("Failed to inspect PPTX slide count: " + file.getFileName(), exception);
|
||||
}
|
||||
}
|
||||
return totalSlides;
|
||||
}
|
||||
|
||||
private void updateProgress(DocumentAsyncTaskUpdater updater,
|
||||
String stage,
|
||||
int processedItems,
|
||||
int totalItems,
|
||||
String message) {
|
||||
if (updater == null) {
|
||||
return;
|
||||
}
|
||||
int safeTotal = totalItems <= 0 ? 1 : totalItems;
|
||||
int percent = (int) Math.min(99, Math.round(processedItems * 100.0d / safeTotal));
|
||||
updater.update(stage, percent, processedItems, totalItems, message);
|
||||
}
|
||||
|
||||
private String normalizeImageFormat(String imageFormat) {
|
||||
if ("jpg".equalsIgnoreCase(imageFormat) || "jpeg".equalsIgnoreCase(imageFormat)) {
|
||||
return "jpg";
|
||||
}
|
||||
return "png";
|
||||
}
|
||||
|
||||
private String buildImagePath(int slideIndex, String imageFormat) {
|
||||
return "images/slide-" + formatIndex(slideIndex) + "/page." + imageFormat;
|
||||
}
|
||||
|
||||
private String buildImageName(int slideIndex) {
|
||||
return "slide-" + formatIndex(slideIndex) + "-page";
|
||||
}
|
||||
|
||||
private String formatIndex(int slideIndex) {
|
||||
int displayIndex = slideIndex + 1;
|
||||
if (displayIndex < 10) {
|
||||
return "00" + displayIndex;
|
||||
}
|
||||
if (displayIndex < 100) {
|
||||
return "0" + displayIndex;
|
||||
}
|
||||
return String.valueOf(displayIndex);
|
||||
}
|
||||
|
||||
private static DocumentAsyncTaskManager defaultTaskManager() {
|
||||
DocumentAsyncTaskRepository repository = new InMemoryDocumentAsyncTaskRepository();
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(2);
|
||||
return new DocumentAsyncTaskManager(repository, executorService);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
package com.easyagents.document.pptx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* PPTX 结构化工件。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class PptxParseArtifact {
|
||||
|
||||
private List<PptxSlideArtifact> slides = new ArrayList<PptxSlideArtifact>();
|
||||
|
||||
public List<PptxSlideArtifact> getSlides() {
|
||||
return slides;
|
||||
}
|
||||
|
||||
public void setSlides(List<PptxSlideArtifact> slides) {
|
||||
this.slides = slides == null ? new ArrayList<PptxSlideArtifact>() : slides;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,86 @@
|
||||
package com.easyagents.document.pptx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Artifact describing a single slide: its identity, the rendered image reference and
 * the OCR payloads obtained for it.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class PptxSlideArtifact {

    /** Index of the slide within the deck. */
    private Integer slideIndex;
    /** Slide title. */
    private String title;
    /** Logical name of the rendered slide image. */
    private String imageName;
    /** Logical path of the rendered slide image. */
    private String imagePath;
    /** Markdown produced by OCR for this slide. */
    private String ocrMarkdown;
    /** Raw middle-JSON OCR payload. */
    private Object middleJson;
    /** Raw content-list OCR payload. */
    private Object contentList;
    /** Warnings raised for this slide; never {@code null}. */
    private List<String> warnings = new ArrayList<>();

    /** @return index of the slide within the deck */
    public Integer getSlideIndex() {
        return this.slideIndex;
    }

    /** @param slideIndex index of the slide within the deck */
    public void setSlideIndex(Integer slideIndex) {
        this.slideIndex = slideIndex;
    }

    /** @return slide title */
    public String getTitle() {
        return this.title;
    }

    /** @param title slide title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return logical name of the rendered slide image */
    public String getImageName() {
        return this.imageName;
    }

    /** @param imageName logical name of the rendered slide image */
    public void setImageName(String imageName) {
        this.imageName = imageName;
    }

    /** @return logical path of the rendered slide image */
    public String getImagePath() {
        return this.imagePath;
    }

    /** @param imagePath logical path of the rendered slide image */
    public void setImagePath(String imagePath) {
        this.imagePath = imagePath;
    }

    /** @return markdown produced by OCR */
    public String getOcrMarkdown() {
        return this.ocrMarkdown;
    }

    /** @param ocrMarkdown markdown produced by OCR */
    public void setOcrMarkdown(String ocrMarkdown) {
        this.ocrMarkdown = ocrMarkdown;
    }

    /** @return raw middle-JSON OCR payload */
    public Object getMiddleJson() {
        return this.middleJson;
    }

    /** @param middleJson raw middle-JSON OCR payload */
    public void setMiddleJson(Object middleJson) {
        this.middleJson = middleJson;
    }

    /** @return raw content-list OCR payload */
    public Object getContentList() {
        return this.contentList;
    }

    /** @param contentList raw content-list OCR payload */
    public void setContentList(Object contentList) {
        this.contentList = contentList;
    }

    /** @return the live, mutable warning list held by this artifact */
    public List<String> getWarnings() {
        return this.warnings;
    }

    /**
     * Replaces the warning list.
     *
     * @param warnings new list; {@code null} is replaced by an empty list
     */
    public void setWarnings(List<String> warnings) {
        if (warnings == null) {
            this.warnings = new ArrayList<>();
            return;
        }
        this.warnings = warnings;
    }
}
|
||||
@@ -0,0 +1,170 @@
|
||||
package com.easyagents.document.pptx.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSONArray;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.mineru.MineruClient;
|
||||
import com.easyagents.document.core.mineru.MineruMapper;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.core.mineru.MineruResultPayload;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import com.easyagents.document.core.entity.PptxParseRequest;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
||||
import org.apache.poi.xslf.usermodel.XSLFTextBox;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.Executor;
|
||||
|
||||
/**
|
||||
* PPTX MinerU 服务测试。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruPptxDocumentParseServiceTest {
|
||||
|
||||
@Test
|
||||
public void shouldBuildMarkdownAndImagesForSlides() throws IOException {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruPptxDocumentParseService service = new MineruPptxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
PptxParseRequest request = new PptxParseRequest();
|
||||
request.addFile(ParseFile.of("demo.pptx", buildPptxBytes()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
|
||||
Assert.assertEquals(1, response.getResults().size());
|
||||
ParseResult result = response.getResults().get(0);
|
||||
Assert.assertTrue(result.getMarkdown().contains("# Slide 1"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("images/slide-001/page.png"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("slide-ocr-1"));
|
||||
Assert.assertEquals(2, result.getImages().size());
|
||||
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||
Assert.assertNotNull(result.getArtifacts().getExtraJsonArtifacts().get("pptx"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldSupportAsyncTaskFlow() throws IOException {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruPptxDocumentParseService service = new MineruPptxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
PptxParseRequest request = new PptxParseRequest();
|
||||
request.addFile(ParseFile.of("demo.pptx", buildPptxBytes()));
|
||||
|
||||
ParseTaskStatus status = service.submit(request);
|
||||
ParseTaskInfo taskInfo = service.queryTaskInfo(status.getTaskId());
|
||||
|
||||
Assert.assertEquals("completed", taskInfo.getStatus());
|
||||
Assert.assertNotNull(taskInfo.getResult());
|
||||
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
|
||||
}
|
||||
|
||||
private byte[] buildPptxBytes() throws IOException {
|
||||
XMLSlideShow slideShow = new XMLSlideShow();
|
||||
slideShow.setPageSize(new java.awt.Dimension(640, 360));
|
||||
createSlide(slideShow, "第一页");
|
||||
createSlide(slideShow, "第二页");
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
slideShow.write(outputStream);
|
||||
slideShow.close();
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private void createSlide(XMLSlideShow slideShow, String text) {
|
||||
XSLFSlide slide = slideShow.createSlide();
|
||||
XSLFTextBox textBox = slide.createTextBox();
|
||||
textBox.setAnchor(new Rectangle(20, 20, 300, 80));
|
||||
textBox.setText(text);
|
||||
}
|
||||
|
||||
private MineruProperties defaultProperties() {
|
||||
MineruProperties properties = new MineruProperties();
|
||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||
return properties;
|
||||
}
|
||||
|
||||
private Executor directExecutor() {
|
||||
return new Executor() {
|
||||
@Override
|
||||
public void execute(Runnable command) {
|
||||
command.run();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private static class RecordingClient extends MineruClient {
|
||||
|
||||
private int parseCount;
|
||||
|
||||
private RecordingClient(MineruProperties properties) {
|
||||
super(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
@Override
|
||||
public MineruResultPayload parse(com.easyagents.document.core.entity.ParseRequest request) {
|
||||
parseCount++;
|
||||
return new MineruMapper(testProperties()).toResultPayload(syncPayload(parseCount));
|
||||
}
|
||||
|
||||
private JSONObject syncPayload(int index) {
|
||||
JSONObject payload = new JSONObject();
|
||||
payload.put("backend", "vlm-http-client");
|
||||
payload.put("version", "3.0.9");
|
||||
JSONObject result = new JSONObject();
|
||||
result.put("md_content", "slide-ocr-" + index);
|
||||
result.put("middle_json", middleJson());
|
||||
result.put("content_list", contentList(index));
|
||||
JSONObject results = new JSONObject();
|
||||
results.put("slide-" + index, result);
|
||||
payload.put("results", results);
|
||||
return payload;
|
||||
}
|
||||
|
||||
private JSONObject middleJson() {
|
||||
JSONObject middleJson = new JSONObject();
|
||||
middleJson.put("_backend", "vlm-http-client");
|
||||
middleJson.put("_version_name", "3.0.9");
|
||||
middleJson.put("pdf_info", new JSONArray());
|
||||
return middleJson;
|
||||
}
|
||||
|
||||
private JSONArray contentList(int index) {
|
||||
JSONArray contentList = new JSONArray();
|
||||
JSONObject text = new JSONObject();
|
||||
text.put("type", "text");
|
||||
text.put("text", "slide-ocr-" + index);
|
||||
text.put("page_idx", 0);
|
||||
text.put("bbox", new JSONArray());
|
||||
contentList.add(text);
|
||||
return contentList;
|
||||
}
|
||||
|
||||
private static MineruProperties testProperties() {
|
||||
MineruProperties properties = new MineruProperties();
|
||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||
return properties;
|
||||
}
|
||||
}
|
||||
}
|
||||
44
easy-agents-document/easy-agents-document-xlsx/pom.xml
Normal file
44
easy-agents-document/easy-agents-document-xlsx/pom.xml
Normal file
@@ -0,0 +1,44 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document</artifactId>
|
||||
<version>${revision}</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>easy-agents-document-xlsx</artifactId>
|
||||
<name>easy-agents-document-xlsx</name>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.alibaba.fastjson2</groupId>
|
||||
<artifactId>fastjson2</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -0,0 +1,13 @@
|
||||
package com.easyagents.document.xlsx;
|
||||
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||
|
||||
/**
|
||||
* XLSX 文档解析服务。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public interface XlsxDocumentParseService extends DocumentParseService<XlsxParseRequest> {
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package com.easyagents.document.xlsx;
|
||||
|
||||
/**
|
||||
* XLSX provider SPI。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public interface XlsxDocumentProvider extends XlsxDocumentParseService {
|
||||
|
||||
/**
|
||||
* 获取 provider 标识。
|
||||
*
|
||||
* @return provider 名称
|
||||
*/
|
||||
String getProvider();
|
||||
}
|
||||
@@ -0,0 +1,625 @@
|
||||
package com.easyagents.document.xlsx.mineru;
|
||||
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskUpdater;
|
||||
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.mineru.MineruClient;
|
||||
import com.easyagents.document.core.mineru.MineruMapper;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.core.entity.DocumentImage;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||
import com.easyagents.document.core.support.AbstractAsyncDocumentParseService;
|
||||
import com.easyagents.document.xlsx.XlsxDocumentProvider;
|
||||
import com.easyagents.document.xlsx.model.XlsxCellArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxCellImageArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxParseArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxRowArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxSheetArtifact;
|
||||
import com.easyagents.document.xlsx.model.XlsxSheetImagesArtifact;
|
||||
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||
import org.apache.poi.ss.usermodel.FormulaEvaluator;
|
||||
import org.apache.poi.ss.util.CellReference;
|
||||
import org.apache.poi.xssf.usermodel.XSSFClientAnchor;
|
||||
import org.apache.poi.xssf.usermodel.XSSFDrawing;
|
||||
import org.apache.poi.xssf.usermodel.XSSFPicture;
|
||||
import org.apache.poi.xssf.usermodel.XSSFPictureData;
|
||||
import org.apache.poi.xssf.usermodel.XSSFShape;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.net.URLConnection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
/**
|
||||
* XLSX 文档解析服务,OCR 由 mineru 提供支持
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseService<XlsxParseRequest> implements XlsxDocumentProvider {
|
||||
|
||||
public static final String PROVIDER_NAME = "mineru";
|
||||
|
||||
private final MineruProperties properties;
|
||||
private final MineruClient client;
|
||||
private final MineruMapper mapper;
|
||||
|
||||
/**
 * Creates a service from the MinerU configuration alone; a mapper is built from
 * the same configuration and the remaining collaborators use their defaults.
 *
 * @param properties MinerU configuration
 */
public MineruXlsxDocumentParseService(MineruProperties properties) {
    this(properties, new MineruMapper(properties));
}
|
||||
|
||||
/**
 * Creates a service with a caller-supplied mapper, a client built from the
 * configuration and mapper, and the default task manager.
 *
 * @param properties MinerU configuration
 * @param mapper     MinerU response mapper
 */
public MineruXlsxDocumentParseService(MineruProperties properties, MineruMapper mapper) {
    this(properties, new MineruClient(properties, mapper), mapper, defaultTaskManager());
}
|
||||
|
||||
/**
 * Creates a service with a caller-supplied task manager and a mapper built from
 * the configuration.
 *
 * @param properties  MinerU configuration
 * @param taskManager asynchronous task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties, DocumentAsyncTaskManager taskManager) {
    this(properties, new MineruMapper(properties), taskManager);
}
|
||||
|
||||
/**
 * Creates a service with caller-supplied mapper and task manager; the client is
 * built from the configuration and mapper.
 *
 * @param properties  MinerU configuration
 * @param mapper      MinerU response mapper
 * @param taskManager asynchronous task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties,
                                      MineruMapper mapper,
                                      DocumentAsyncTaskManager taskManager) {
    this(properties, new MineruClient(properties, mapper), mapper, taskManager);
}
|
||||
|
||||
/**
 * Creates a service with every collaborator supplied by the caller. All other
 * constructors delegate here.
 *
 * @param properties  MinerU configuration
 * @param client      MinerU client
 * @param mapper      MinerU response mapper
 * @param taskManager asynchronous task manager, handed to the base class
 */
public MineruXlsxDocumentParseService(MineruProperties properties,
                                      MineruClient client,
                                      MineruMapper mapper,
                                      DocumentAsyncTaskManager taskManager) {
    super(taskManager);
    this.mapper = mapper;
    this.client = client;
    this.properties = properties;
}
|
||||
|
||||
@Override
|
||||
public String getProvider() {
|
||||
return PROVIDER_NAME;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected XlsxParseRequest normalizeRequest(ParseRequest request) {
|
||||
XlsxParseRequest normalized = XlsxParseRequest.from(request);
|
||||
if (normalized.getFiles() == null || normalized.getFiles().isEmpty()) {
|
||||
throw new IllegalArgumentException("XlsxParseRequest files must not be empty");
|
||||
}
|
||||
normalized.setBackend(StringUtil.hasText(normalized.getBackend()) ? normalized.getBackend() : properties.getDefaultBackend());
|
||||
if (normalized.getLanguages() == null || normalized.getLanguages().isEmpty()) {
|
||||
normalized.setLanguages(new ArrayList<String>(properties.getDefaultLangList()));
|
||||
}
|
||||
normalized.setReturnMarkdown(normalized.getReturnMarkdown() == null ? Boolean.TRUE : normalized.getReturnMarkdown());
|
||||
normalized.setReturnMiddleJson(normalized.getReturnMiddleJson() == null ? Boolean.FALSE : normalized.getReturnMiddleJson());
|
||||
normalized.setReturnContentList(normalized.getReturnContentList() == null ? Boolean.FALSE : normalized.getReturnContentList());
|
||||
normalized.setReturnModelOutput(normalized.getReturnModelOutput() == null ? Boolean.FALSE : normalized.getReturnModelOutput());
|
||||
normalized.setReturnImages(normalized.getReturnImages() == null ? Boolean.TRUE : normalized.getReturnImages());
|
||||
normalized.setIncludeHiddenSheets(normalized.getIncludeHiddenSheets() == null ? Boolean.FALSE : normalized.getIncludeHiddenSheets());
|
||||
normalized.setOcrEmbeddedImages(normalized.getOcrEmbeddedImages() == null ? Boolean.TRUE : normalized.getOcrEmbeddedImages());
|
||||
normalized.setIncludeImageAppendix(normalized.getIncludeImageAppendix() == null ? Boolean.TRUE : normalized.getIncludeImageAppendix());
|
||||
return normalized;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ParseResponse doParse(XlsxParseRequest request, DocumentAsyncTaskUpdater updater) {
|
||||
ParseResponse response = new ParseResponse();
|
||||
List<ParseResult> results = new ArrayList<ParseResult>();
|
||||
String backend = null;
|
||||
int processedFiles = 0;
|
||||
int totalFiles = request.getFiles().size();
|
||||
|
||||
for (ParseFile file : request.getFiles()) {
|
||||
updateProgress(updater, "extracting", processedFiles, totalFiles, "正在读取工作簿结构");
|
||||
ParseResult result = parseSingleWorkbook(file, request, updater);
|
||||
processedFiles++;
|
||||
if (backend == null) {
|
||||
backend = (String) result.getMetadata().get("ocrBackend");
|
||||
}
|
||||
result.getMetadata().remove("ocrBackend");
|
||||
results.add(result);
|
||||
}
|
||||
|
||||
updateProgress(updater, "assembling", processedFiles, totalFiles, "正在汇总 XLSX 解析结果");
|
||||
response.setBackend(StringUtil.hasText(backend) ? backend : request.getBackend());
|
||||
response.setResults(results);
|
||||
return response;
|
||||
}
|
||||
|
||||
private ParseResult parseSingleWorkbook(ParseFile file, XlsxParseRequest request, DocumentAsyncTaskUpdater updater) {
|
||||
ParseResult aggregate = new ParseResult();
|
||||
aggregate.setFileName(file.getFileName());
|
||||
XlsxParseArtifact artifact = new XlsxParseArtifact();
|
||||
artifact.setWorkbookName(file.getFileName());
|
||||
StringBuilder markdownBuilder = new StringBuilder();
|
||||
String backend = null;
|
||||
|
||||
try (XSSFWorkbook workbook = new XSSFWorkbook(new ByteArrayInputStream(file.getContent()))) {
|
||||
FormulaEvaluator evaluator = workbook.getCreationHelper().createFormulaEvaluator();
|
||||
DataFormatter formatter = new DataFormatter();
|
||||
List<Integer> sheetIndexes = resolveSheetIndexes(workbook, request);
|
||||
int processedSheets = 0;
|
||||
|
||||
for (Integer sheetIndex : sheetIndexes) {
|
||||
XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
|
||||
updateProgress(updater, "extracting", processedSheets, sheetIndexes.size(), "正在读取 Sheet " + sheet.getSheetName());
|
||||
SheetExtraction sheetExtraction = extractSheet(sheet, sheetIndex, formatter, evaluator, request, updater);
|
||||
artifact.getSheets().add(sheetExtraction.sheetArtifact);
|
||||
artifact.getCellImages().addAll(sheetExtraction.imageArtifacts);
|
||||
artifact.getSheetImages().add(sheetExtraction.sheetImagesArtifact);
|
||||
artifact.getMergedRanges().addAll(sheetExtraction.mergedRanges);
|
||||
aggregate.getImages().addAll(sheetExtraction.documentImages);
|
||||
if (markdownBuilder.length() > 0) {
|
||||
markdownBuilder.append("\n\n");
|
||||
}
|
||||
markdownBuilder.append(sheetExtraction.markdown);
|
||||
if (backend == null) {
|
||||
backend = sheetExtraction.ocrBackend;
|
||||
}
|
||||
processedSheets++;
|
||||
}
|
||||
} catch (Exception exception) {
|
||||
throw new IllegalStateException("Failed to parse XLSX file: " + file.getFileName(), exception);
|
||||
}
|
||||
|
||||
aggregate.setMarkdown(markdownBuilder.toString().trim());
|
||||
aggregate.setPlainText(aggregate.getMarkdown());
|
||||
aggregate.getArtifacts().getExtraJsonArtifacts().put("xlsx", artifact);
|
||||
aggregate.getMetadata().put("ocrBackend", backend);
|
||||
return aggregate;
|
||||
}
|
||||
|
||||
private SheetExtraction extractSheet(XSSFSheet sheet,
|
||||
int sheetIndex,
|
||||
DataFormatter formatter,
|
||||
FormulaEvaluator evaluator,
|
||||
XlsxParseRequest request,
|
||||
DocumentAsyncTaskUpdater updater) {
|
||||
SheetExtraction extraction = new SheetExtraction();
|
||||
extraction.sheetArtifact = new XlsxSheetArtifact();
|
||||
extraction.sheetArtifact.setSheetName(sheet.getSheetName());
|
||||
extraction.sheetArtifact.setSheetIndex(sheetIndex);
|
||||
extraction.sheetArtifact.setHidden(Boolean.valueOf(sheet.getWorkbook().isSheetHidden(sheetIndex)
|
||||
|| sheet.getWorkbook().isSheetVeryHidden(sheetIndex)));
|
||||
extraction.sheetImagesArtifact = new XlsxSheetImagesArtifact();
|
||||
extraction.sheetImagesArtifact.setSheetName(sheet.getSheetName());
|
||||
extraction.sheetImagesArtifact.setSheetIndex(sheetIndex);
|
||||
|
||||
Map<String, List<XlsxCellImageArtifact>> imagesByCell = new LinkedHashMap<String, List<XlsxCellImageArtifact>>();
|
||||
List<SheetImageExtraction> sheetImages = extractImages(sheet, sheetIndex, request, updater);
|
||||
List<XlsxCellImageArtifact> imageArtifacts = new ArrayList<XlsxCellImageArtifact>();
|
||||
for (SheetImageExtraction sheetImage : sheetImages) {
|
||||
XlsxCellImageArtifact imageArtifact = sheetImage.imageArtifact;
|
||||
imageArtifacts.add(imageArtifact);
|
||||
extraction.imageArtifacts.add(imageArtifact);
|
||||
extraction.sheetImagesArtifact.getReferenceKeys().add(imageArtifact.getReferenceKey());
|
||||
extraction.sheetImagesArtifact.getSourcePaths().add(imageArtifact.getSourcePath());
|
||||
String anchorCell = imageArtifact.getAnchorCell();
|
||||
List<XlsxCellImageArtifact> cellImages = imagesByCell.get(anchorCell);
|
||||
if (cellImages == null) {
|
||||
cellImages = new ArrayList<XlsxCellImageArtifact>();
|
||||
imagesByCell.put(anchorCell, cellImages);
|
||||
}
|
||||
cellImages.add(imageArtifact);
|
||||
extraction.documentImages.add(sheetImage.documentImage);
|
||||
}
|
||||
|
||||
int maxRow = resolveMaxRow(sheet, request.getMaxRowsPerSheet());
|
||||
int maxCol = resolveMaxCol(sheet, maxRow, imagesByCell);
|
||||
extraction.sheetArtifact.setRowCount(maxRow + 1);
|
||||
extraction.sheetArtifact.setColumnCount(maxCol);
|
||||
appendSheetHeader(extraction.markdown, sheet.getSheetName());
|
||||
|
||||
if (maxRow < 0 || maxCol <= 0) {
|
||||
extraction.markdown.append("_empty sheet_");
|
||||
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
|
||||
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
|
||||
}
|
||||
return extraction;
|
||||
}
|
||||
|
||||
List<List<String>> markdownRows = new ArrayList<List<String>>();
|
||||
for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) {
|
||||
org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex);
|
||||
XlsxRowArtifact rowArtifact = new XlsxRowArtifact();
|
||||
rowArtifact.setRowIndex(rowIndex);
|
||||
List<String> rowValues = new ArrayList<String>();
|
||||
for (int colIndex = 0; colIndex < maxCol; colIndex++) {
|
||||
String cellRef = new CellReference(rowIndex, colIndex).formatAsString();
|
||||
String cellText = readCellText(row, colIndex, formatter, evaluator);
|
||||
List<XlsxCellImageArtifact> cellImages = imagesByCell.get(cellRef);
|
||||
String displayValue = mergeDisplayValue(cellText, cellImages);
|
||||
rowValues.add(escapeMarkdown(displayValue));
|
||||
|
||||
XlsxCellArtifact cellArtifact = new XlsxCellArtifact();
|
||||
cellArtifact.setRowIndex(rowIndex);
|
||||
cellArtifact.setColumnIndex(colIndex);
|
||||
cellArtifact.setCellRef(cellRef);
|
||||
cellArtifact.setText(cellText);
|
||||
if (cellImages != null) {
|
||||
List<String> imageKeys = new ArrayList<String>();
|
||||
for (XlsxCellImageArtifact cellImage : cellImages) {
|
||||
imageKeys.add(cellImage.getReferenceKey());
|
||||
}
|
||||
cellArtifact.setImageKeys(imageKeys);
|
||||
}
|
||||
rowArtifact.getCells().add(cellArtifact);
|
||||
}
|
||||
extraction.sheetArtifact.getRows().add(rowArtifact);
|
||||
markdownRows.add(rowValues);
|
||||
}
|
||||
|
||||
appendMarkdownTable(extraction.markdown, markdownRows);
|
||||
extraction.mergedRanges.addAll(extractMergedRanges(sheet));
|
||||
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
|
||||
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
|
||||
}
|
||||
return extraction;
|
||||
}
|
||||
|
||||
private List<SheetImageExtraction> extractImages(XSSFSheet sheet,
|
||||
int sheetIndex,
|
||||
XlsxParseRequest request,
|
||||
DocumentAsyncTaskUpdater updater) {
|
||||
List<SheetImageExtraction> images = new ArrayList<SheetImageExtraction>();
|
||||
XSSFDrawing drawing = sheet.getDrawingPatriarch();
|
||||
if (drawing == null) {
|
||||
return images;
|
||||
}
|
||||
String sheetKey = buildSheetKey(sheet.getSheetName(), sheetIndex);
|
||||
int imageIndex = 0;
|
||||
for (XSSFShape shape : drawing.getShapes()) {
|
||||
if (!(shape instanceof XSSFPicture)) {
|
||||
continue;
|
||||
}
|
||||
imageIndex++;
|
||||
XSSFPicture picture = (XSSFPicture) shape;
|
||||
XSSFClientAnchor anchor = picture.getPreferredSize();
|
||||
if (anchor == null) {
|
||||
continue;
|
||||
}
|
||||
XSSFPictureData pictureData = picture.getPictureData();
|
||||
String extension = pictureData == null || !StringUtil.hasText(pictureData.suggestFileExtension())
|
||||
? "png"
|
||||
: pictureData.suggestFileExtension();
|
||||
String imageName = buildImageName(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex);
|
||||
String sourcePath = buildImageSourcePath(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex, extension);
|
||||
|
||||
XlsxCellImageArtifact imageArtifact = new XlsxCellImageArtifact();
|
||||
imageArtifact.setSheetName(sheet.getSheetName());
|
||||
imageArtifact.setAnchorCell(new CellReference(anchor.getRow1(), anchor.getCol1()).formatAsString());
|
||||
imageArtifact.setFromRow(anchor.getRow1());
|
||||
imageArtifact.setFromCol((int) anchor.getCol1());
|
||||
imageArtifact.setToRow(anchor.getRow2());
|
||||
imageArtifact.setToCol((int) anchor.getCol2());
|
||||
imageArtifact.setName(imageName);
|
||||
imageArtifact.setReferenceKey(imageName);
|
||||
imageArtifact.setSourcePath(sourcePath);
|
||||
if (Boolean.TRUE.equals(request.getOcrEmbeddedImages()) && pictureData != null) {
|
||||
updateProgress(updater, "ocr", imageIndex - 1, drawing.getShapes().size(), "正在识别 Sheet " + sheet.getSheetName() + " 中的图片");
|
||||
imageArtifact.setOcrText(parseImageOcr(pictureData.getData(), extension, request, imageName));
|
||||
}
|
||||
DocumentImage documentImage = new DocumentImage();
|
||||
documentImage.setName(imageName);
|
||||
documentImage.setSourcePath(sourcePath);
|
||||
documentImage.setMimeType(detectImageMimeType(sourcePath));
|
||||
documentImage.setContent(pictureData == null ? null : pictureData.getData());
|
||||
|
||||
SheetImageExtraction sheetImage = new SheetImageExtraction();
|
||||
sheetImage.imageArtifact = imageArtifact;
|
||||
sheetImage.documentImage = documentImage;
|
||||
images.add(sheetImage);
|
||||
}
|
||||
return images;
|
||||
}
|
||||
|
||||
private String parseImageOcr(byte[] imageBytes, String extension, XlsxParseRequest request, String imageName) {
|
||||
ParseRequest imageRequest = new ParseRequest();
|
||||
imageRequest.addFile(ParseFile.of(imageName + "." + extension, imageBytes, "image/" + extension));
|
||||
imageRequest.setBackend(request.getBackend());
|
||||
imageRequest.setLanguages(request.getLanguages());
|
||||
imageRequest.setReturnMarkdown(true);
|
||||
imageRequest.setReturnMiddleJson(false);
|
||||
imageRequest.setReturnContentList(false);
|
||||
imageRequest.setReturnModelOutput(false);
|
||||
imageRequest.setReturnImages(false);
|
||||
ParseResponse response = mapper.toParseResponse(client.parse(imageRequest));
|
||||
if (response.getResults().isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
ParseResult result = response.getResults().get(0);
|
||||
return StringUtil.hasText(result.getMarkdown()) ? result.getMarkdown() : result.getPlainText();
|
||||
}
|
||||
|
||||
private List<Integer> resolveSheetIndexes(XSSFWorkbook workbook, XlsxParseRequest request) {
|
||||
List<Integer> indexes = new ArrayList<Integer>();
|
||||
for (int index = 0; index < workbook.getNumberOfSheets(); index++) {
|
||||
String sheetName = workbook.getSheetName(index);
|
||||
if (!Boolean.TRUE.equals(request.getIncludeHiddenSheets())
|
||||
&& (workbook.isSheetHidden(index) || workbook.isSheetVeryHidden(index))) {
|
||||
continue;
|
||||
}
|
||||
if (request.getSheetNames() != null && !request.getSheetNames().isEmpty()
|
||||
&& !request.getSheetNames().contains(sheetName)) {
|
||||
continue;
|
||||
}
|
||||
indexes.add(index);
|
||||
}
|
||||
return indexes;
|
||||
}
|
||||
|
||||
private int resolveMaxRow(XSSFSheet sheet, Integer maxRowsPerSheet) {
|
||||
int lastRow = sheet.getLastRowNum();
|
||||
if (lastRow < 0) {
|
||||
return -1;
|
||||
}
|
||||
if (maxRowsPerSheet == null || maxRowsPerSheet <= 0) {
|
||||
return lastRow;
|
||||
}
|
||||
return Math.min(lastRow, maxRowsPerSheet - 1);
|
||||
}
|
||||
|
||||
private int resolveMaxCol(XSSFSheet sheet, int maxRow, Map<String, List<XlsxCellImageArtifact>> imagesByCell) {
|
||||
int maxCol = 0;
|
||||
for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) {
|
||||
org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex);
|
||||
if (row != null && row.getLastCellNum() > maxCol) {
|
||||
maxCol = row.getLastCellNum();
|
||||
}
|
||||
}
|
||||
for (String cellRef : imagesByCell.keySet()) {
|
||||
CellReference reference = new CellReference(cellRef);
|
||||
if (reference.getCol() + 1 > maxCol) {
|
||||
maxCol = reference.getCol() + 1;
|
||||
}
|
||||
}
|
||||
return maxCol;
|
||||
}
|
||||
|
||||
private String readCellText(org.apache.poi.ss.usermodel.Row row, int colIndex, DataFormatter formatter, FormulaEvaluator evaluator) {
|
||||
if (row == null) {
|
||||
return "";
|
||||
}
|
||||
org.apache.poi.ss.usermodel.Cell cell = row.getCell(colIndex);
|
||||
if (cell == null) {
|
||||
return "";
|
||||
}
|
||||
return formatter.formatCellValue(cell, evaluator);
|
||||
}
|
||||
|
||||
private String mergeDisplayValue(String cellText, List<XlsxCellImageArtifact> cellImages) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
if (StringUtil.hasText(cellText)) {
|
||||
builder.append(cellText.trim());
|
||||
}
|
||||
if (cellImages != null && !cellImages.isEmpty()) {
|
||||
for (XlsxCellImageArtifact cellImage : cellImages) {
|
||||
if (builder.length() > 0) {
|
||||
builder.append('\n');
|
||||
}
|
||||
builder.append("[IMG:").append(cellImage.getReferenceKey()).append(']');
|
||||
}
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private void appendSheetHeader(StringBuilder markdownBuilder, String sheetName) {
|
||||
markdownBuilder.append("# ").append(sheetName).append("\n\n");
|
||||
}
|
||||
|
||||
private void appendMarkdownTable(StringBuilder markdownBuilder, List<List<String>> rows) {
|
||||
if (rows.isEmpty()) {
|
||||
markdownBuilder.append("_empty sheet_");
|
||||
return;
|
||||
}
|
||||
List<String> header = rows.get(0);
|
||||
markdownBuilder.append("| ").append(joinCells(header)).append(" |\n");
|
||||
markdownBuilder.append("|");
|
||||
for (int index = 0; index < header.size(); index++) {
|
||||
markdownBuilder.append(" --- |");
|
||||
}
|
||||
markdownBuilder.append("\n");
|
||||
for (int rowIndex = 1; rowIndex < rows.size(); rowIndex++) {
|
||||
markdownBuilder.append("| ").append(joinCells(rows.get(rowIndex))).append(" |\n");
|
||||
}
|
||||
}
|
||||
|
||||
private void appendImageAppendix(StringBuilder markdownBuilder,
|
||||
String sheetName,
|
||||
List<XlsxCellImageArtifact> imageArtifacts) {
|
||||
markdownBuilder.append("\n## ").append(sheetName).append(" 图片说明\n\n");
|
||||
for (XlsxCellImageArtifact imageArtifact : imageArtifacts) {
|
||||
markdownBuilder.append("
|
||||
.append(imageArtifact.getSourcePath())
|
||||
.append(")\n\n");
|
||||
markdownBuilder.append("- 占位符:[IMG:")
|
||||
.append(imageArtifact.getReferenceKey())
|
||||
.append("]\n");
|
||||
markdownBuilder.append("- 锚点:")
|
||||
.append(imageArtifact.getAnchorCell())
|
||||
.append("\n");
|
||||
markdownBuilder.append("- OCR:")
|
||||
.append(StringUtil.hasText(imageArtifact.getOcrText()) ? imageArtifact.getOcrText() : "")
|
||||
.append("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> extractMergedRanges(XSSFSheet sheet) {
|
||||
List<String> mergedRanges = new ArrayList<String>();
|
||||
for (int index = 0; index < sheet.getNumMergedRegions(); index++) {
|
||||
mergedRanges.add(sheet.getMergedRegion(index).formatAsString());
|
||||
}
|
||||
return mergedRanges;
|
||||
}
|
||||
|
||||
private String joinCells(List<String> cells) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int index = 0; index < cells.size(); index++) {
|
||||
if (index > 0) {
|
||||
builder.append(" | ");
|
||||
}
|
||||
builder.append(cells.get(index));
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private String escapeMarkdown(String text) {
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return "";
|
||||
}
|
||||
return text.replace("|", "\\|").replace("\r", " ").replace("\n", "<br/>");
|
||||
}
|
||||
|
||||
private String buildImageName(String sheetKey, int rowIndex, int colIndex, int imageIndex) {
|
||||
return sheetKey + "-r" + (rowIndex + 1) + "c" + (colIndex + 1) + "-" + formatIndex(imageIndex);
|
||||
}
|
||||
|
||||
private String buildImageSourcePath(String sheetKey, int rowIndex, int colIndex, int imageIndex, String extension) {
|
||||
return "images/" + sheetKey + "/r" + (rowIndex + 1) + "c" + (colIndex + 1) + "-" + formatIndex(imageIndex) + "." + extension;
|
||||
}
|
||||
|
||||
private String buildSheetKey(String sheetName, int sheetIndex) {
|
||||
if (!StringUtil.hasText(sheetName)) {
|
||||
return "sheet-" + formatIndex(sheetIndex + 1);
|
||||
}
|
||||
String lowerCaseName = sheetName.toLowerCase(Locale.ROOT);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int index = 0; index < lowerCaseName.length(); index++) {
|
||||
char character = lowerCaseName.charAt(index);
|
||||
if ((character >= 'a' && character <= 'z') || (character >= '0' && character <= '9')) {
|
||||
builder.append(character);
|
||||
continue;
|
||||
}
|
||||
if (builder.length() > 0 && builder.charAt(builder.length() - 1) != '-') {
|
||||
builder.append('-');
|
||||
}
|
||||
builder.append('u').append(String.format(Locale.ROOT, "%04x", (int) character)).append('-');
|
||||
}
|
||||
String normalized = builder.toString();
|
||||
while (normalized.startsWith("-")) {
|
||||
normalized = normalized.substring(1);
|
||||
}
|
||||
while (normalized.endsWith("-")) {
|
||||
normalized = normalized.substring(0, normalized.length() - 1);
|
||||
}
|
||||
return StringUtil.hasText(normalized) ? normalized : "sheet-" + formatIndex(sheetIndex + 1);
|
||||
}
|
||||
|
||||
private String formatIndex(int index) {
|
||||
int displayIndex = index <= 0 ? 1 : index;
|
||||
if (displayIndex < 10) {
|
||||
return "00" + displayIndex;
|
||||
}
|
||||
if (displayIndex < 100) {
|
||||
return "0" + displayIndex;
|
||||
}
|
||||
return String.valueOf(displayIndex);
|
||||
}
|
||||
|
||||
private String detectImageMimeType(String path) {
|
||||
if (!StringUtil.hasText(path)) {
|
||||
return "application/octet-stream";
|
||||
}
|
||||
String mimeType = URLConnection.guessContentTypeFromName(path);
|
||||
if (StringUtil.hasText(mimeType)) {
|
||||
return mimeType;
|
||||
}
|
||||
String lowerCasePath = path.toLowerCase(Locale.ROOT);
|
||||
if (lowerCasePath.endsWith(".jpg") || lowerCasePath.endsWith(".jpeg")) {
|
||||
return "image/jpeg";
|
||||
}
|
||||
if (lowerCasePath.endsWith(".png")) {
|
||||
return "image/png";
|
||||
}
|
||||
if (lowerCasePath.endsWith(".gif")) {
|
||||
return "image/gif";
|
||||
}
|
||||
if (lowerCasePath.endsWith(".bmp")) {
|
||||
return "image/bmp";
|
||||
}
|
||||
if (lowerCasePath.endsWith(".webp")) {
|
||||
return "image/webp";
|
||||
}
|
||||
return "application/octet-stream";
|
||||
}
|
||||
|
||||
private void updateProgress(DocumentAsyncTaskUpdater updater,
|
||||
String stage,
|
||||
int processedItems,
|
||||
int totalItems,
|
||||
String message) {
|
||||
if (updater == null) {
|
||||
return;
|
||||
}
|
||||
int safeTotal = totalItems <= 0 ? 1 : totalItems;
|
||||
int percent = (int) Math.min(99, Math.round(processedItems * 100.0d / safeTotal));
|
||||
updater.update(stage, percent, processedItems, totalItems, message);
|
||||
}
|
||||
|
||||
private static DocumentAsyncTaskManager defaultTaskManager() {
|
||||
DocumentAsyncTaskRepository repository = new InMemoryDocumentAsyncTaskRepository();
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(2);
|
||||
return new DocumentAsyncTaskManager(repository, executorService);
|
||||
}
|
||||
|
||||
private static class SheetExtraction {
|
||||
|
||||
private final StringBuilder markdown = new StringBuilder();
|
||||
private final List<XlsxCellImageArtifact> imageArtifacts = new ArrayList<XlsxCellImageArtifact>();
|
||||
private final List<DocumentImage> documentImages = new ArrayList<DocumentImage>();
|
||||
private final List<String> mergedRanges = new ArrayList<String>();
|
||||
private XlsxSheetArtifact sheetArtifact;
|
||||
private XlsxSheetImagesArtifact sheetImagesArtifact;
|
||||
private String ocrBackend;
|
||||
}
|
||||
|
||||
private static class SheetImageExtraction {
|
||||
|
||||
private XlsxCellImageArtifact imageArtifact;
|
||||
private DocumentImage documentImage;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Cell artifact: the structured description of a single worksheet cell.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxCellArtifact {

    /** Row index of the cell. */
    private Integer rowIndex;

    /** Column index of the cell. */
    private Integer columnIndex;

    /** Cell reference string. */
    private String cellRef;

    /** Text of the cell. */
    private String text;

    /** Keys of the images attached to the cell; never {@code null}. */
    private List<String> imageKeys = new ArrayList<>();

    public Integer getRowIndex() {
        return this.rowIndex;
    }

    public void setRowIndex(Integer rowIndex) {
        this.rowIndex = rowIndex;
    }

    public Integer getColumnIndex() {
        return this.columnIndex;
    }

    public void setColumnIndex(Integer columnIndex) {
        this.columnIndex = columnIndex;
    }

    public String getCellRef() {
        return this.cellRef;
    }

    public void setCellRef(String cellRef) {
        this.cellRef = cellRef;
    }

    public String getText() {
        return this.text;
    }

    public void setText(String text) {
        this.text = text;
    }

    /**
     * @return the live list of image keys
     */
    public List<String> getImageKeys() {
        return this.imageKeys;
    }

    /**
     * Replaces the image keys.
     *
     * @param imageKeys the new list; {@code null} is stored as a fresh empty list
     */
    public void setImageKeys(List<String> imageKeys) {
        if (imageKeys == null) {
            this.imageKeys = new ArrayList<>();
        } else {
            this.imageKeys = imageKeys;
        }
    }
}
|
||||
@@ -0,0 +1,101 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
/**
 * Cell image artifact: the structured description of one picture placed on a sheet.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxCellImageArtifact {

    /** Name of the sheet the picture belongs to. */
    private String sheetName;

    /** Reference of the cell the picture is anchored to. */
    private String anchorCell;

    /** First row of the picture's range. */
    private Integer fromRow;

    /** First column of the picture's range. */
    private Integer fromCol;

    /** Last row of the picture's range. */
    private Integer toRow;

    /** Last column of the picture's range. */
    private Integer toCol;

    /** Name of the picture. */
    private String name;

    /** Key used to reference the picture. */
    private String referenceKey;

    /** Source path of the picture. */
    private String sourcePath;

    /** Text recognised in the picture by OCR. */
    private String ocrText;

    public String getSheetName() {
        return this.sheetName;
    }

    public void setSheetName(String sheetName) {
        this.sheetName = sheetName;
    }

    public String getAnchorCell() {
        return this.anchorCell;
    }

    public void setAnchorCell(String anchorCell) {
        this.anchorCell = anchorCell;
    }

    public Integer getFromRow() {
        return this.fromRow;
    }

    public void setFromRow(Integer fromRow) {
        this.fromRow = fromRow;
    }

    public Integer getFromCol() {
        return this.fromCol;
    }

    public void setFromCol(Integer fromCol) {
        this.fromCol = fromCol;
    }

    public Integer getToRow() {
        return this.toRow;
    }

    public void setToRow(Integer toRow) {
        this.toRow = toRow;
    }

    public Integer getToCol() {
        return this.toCol;
    }

    public void setToCol(Integer toCol) {
        this.toCol = toCol;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getReferenceKey() {
        return this.referenceKey;
    }

    public void setReferenceKey(String referenceKey) {
        this.referenceKey = referenceKey;
    }

    public String getSourcePath() {
        return this.sourcePath;
    }

    public void setSourcePath(String sourcePath) {
        this.sourcePath = sourcePath;
    }

    public String getOcrText() {
        return this.ocrText;
    }

    public void setOcrText(String ocrText) {
        this.ocrText = ocrText;
    }
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* XLSX 结构化工件。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class XlsxParseArtifact {
|
||||
|
||||
private String workbookName;
|
||||
private List<XlsxSheetArtifact> sheets = new ArrayList<XlsxSheetArtifact>();
|
||||
private List<XlsxSheetImagesArtifact> sheetImages = new ArrayList<XlsxSheetImagesArtifact>();
|
||||
private List<String> mergedRanges = new ArrayList<String>();
|
||||
private List<XlsxCellImageArtifact> cellImages = new ArrayList<XlsxCellImageArtifact>();
|
||||
|
||||
public String getWorkbookName() {
|
||||
return workbookName;
|
||||
}
|
||||
|
||||
public void setWorkbookName(String workbookName) {
|
||||
this.workbookName = workbookName;
|
||||
}
|
||||
|
||||
public List<XlsxSheetArtifact> getSheets() {
|
||||
return sheets;
|
||||
}
|
||||
|
||||
public void setSheets(List<XlsxSheetArtifact> sheets) {
|
||||
this.sheets = sheets == null ? new ArrayList<XlsxSheetArtifact>() : sheets;
|
||||
}
|
||||
|
||||
public List<XlsxSheetImagesArtifact> getSheetImages() {
|
||||
return sheetImages;
|
||||
}
|
||||
|
||||
public void setSheetImages(List<XlsxSheetImagesArtifact> sheetImages) {
|
||||
this.sheetImages = sheetImages == null ? new ArrayList<XlsxSheetImagesArtifact>() : sheetImages;
|
||||
}
|
||||
|
||||
public List<String> getMergedRanges() {
|
||||
return mergedRanges;
|
||||
}
|
||||
|
||||
public void setMergedRanges(List<String> mergedRanges) {
|
||||
this.mergedRanges = mergedRanges == null ? new ArrayList<String>() : mergedRanges;
|
||||
}
|
||||
|
||||
public List<XlsxCellImageArtifact> getCellImages() {
|
||||
return cellImages;
|
||||
}
|
||||
|
||||
public void setCellImages(List<XlsxCellImageArtifact> cellImages) {
|
||||
this.cellImages = cellImages == null ? new ArrayList<XlsxCellImageArtifact>() : cellImages;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 行工件。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class XlsxRowArtifact {
|
||||
|
||||
private Integer rowIndex;
|
||||
private List<XlsxCellArtifact> cells = new ArrayList<XlsxCellArtifact>();
|
||||
|
||||
public Integer getRowIndex() {
|
||||
return rowIndex;
|
||||
}
|
||||
|
||||
public void setRowIndex(Integer rowIndex) {
|
||||
this.rowIndex = rowIndex;
|
||||
}
|
||||
|
||||
public List<XlsxCellArtifact> getCells() {
|
||||
return cells;
|
||||
}
|
||||
|
||||
public void setCells(List<XlsxCellArtifact> cells) {
|
||||
this.cells = cells == null ? new ArrayList<XlsxCellArtifact>() : cells;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Sheet 工件。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class XlsxSheetArtifact {
|
||||
|
||||
private String sheetName;
|
||||
private Integer sheetIndex;
|
||||
private Boolean hidden;
|
||||
private Integer rowCount;
|
||||
private Integer columnCount;
|
||||
private List<XlsxRowArtifact> rows = new ArrayList<XlsxRowArtifact>();
|
||||
|
||||
public String getSheetName() {
|
||||
return sheetName;
|
||||
}
|
||||
|
||||
public void setSheetName(String sheetName) {
|
||||
this.sheetName = sheetName;
|
||||
}
|
||||
|
||||
public Integer getSheetIndex() {
|
||||
return sheetIndex;
|
||||
}
|
||||
|
||||
public void setSheetIndex(Integer sheetIndex) {
|
||||
this.sheetIndex = sheetIndex;
|
||||
}
|
||||
|
||||
public Boolean getHidden() {
|
||||
return hidden;
|
||||
}
|
||||
|
||||
public void setHidden(Boolean hidden) {
|
||||
this.hidden = hidden;
|
||||
}
|
||||
|
||||
public Integer getRowCount() {
|
||||
return rowCount;
|
||||
}
|
||||
|
||||
public void setRowCount(Integer rowCount) {
|
||||
this.rowCount = rowCount;
|
||||
}
|
||||
|
||||
public Integer getColumnCount() {
|
||||
return columnCount;
|
||||
}
|
||||
|
||||
public void setColumnCount(Integer columnCount) {
|
||||
this.columnCount = columnCount;
|
||||
}
|
||||
|
||||
public List<XlsxRowArtifact> getRows() {
|
||||
return rows;
|
||||
}
|
||||
|
||||
public void setRows(List<XlsxRowArtifact> rows) {
|
||||
this.rows = rows == null ? new ArrayList<XlsxRowArtifact>() : rows;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
package com.easyagents.document.xlsx.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Sheet-level picture index artifact: the reference keys and source paths of the pictures
 * belonging to one sheet.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxSheetImagesArtifact {

    /** Name of the sheet. */
    private String sheetName;

    /** Index of the sheet. */
    private Integer sheetIndex;

    /** Reference keys of the sheet's pictures; never {@code null}. */
    private List<String> referenceKeys = new ArrayList<>();

    /** Source paths of the sheet's pictures; never {@code null}. */
    private List<String> sourcePaths = new ArrayList<>();

    public String getSheetName() {
        return this.sheetName;
    }

    public void setSheetName(String sheetName) {
        this.sheetName = sheetName;
    }

    public Integer getSheetIndex() {
        return this.sheetIndex;
    }

    public void setSheetIndex(Integer sheetIndex) {
        this.sheetIndex = sheetIndex;
    }

    /**
     * @return the live list of reference keys
     */
    public List<String> getReferenceKeys() {
        return this.referenceKeys;
    }

    /**
     * @param referenceKeys the new list; {@code null} is stored as a fresh empty list
     */
    public void setReferenceKeys(List<String> referenceKeys) {
        if (referenceKeys == null) {
            this.referenceKeys = new ArrayList<>();
        } else {
            this.referenceKeys = referenceKeys;
        }
    }

    /**
     * @return the live list of source paths
     */
    public List<String> getSourcePaths() {
        return this.sourcePaths;
    }

    /**
     * @param sourcePaths the new list; {@code null} is stored as a fresh empty list
     */
    public void setSourcePaths(List<String> sourcePaths) {
        if (sourcePaths == null) {
            this.sourcePaths = new ArrayList<>();
        } else {
            this.sourcePaths = sourcePaths;
        }
    }
}
|
||||
@@ -0,0 +1,333 @@
|
||||
package com.easyagents.document.xlsx.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.mineru.MineruClient;
|
||||
import com.easyagents.document.core.mineru.MineruMapper;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.core.mineru.MineruResultPayload;
|
||||
import com.easyagents.document.core.entity.ParseArtifacts;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.xlsx.model.XlsxParseArtifact;
|
||||
import org.apache.poi.ss.usermodel.ClientAnchor;
|
||||
import org.apache.poi.xssf.usermodel.XSSFDrawing;
|
||||
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.Executor;
|
||||
|
||||
/**
|
||||
* XLSX MinerU 服务测试。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruXlsxDocumentParseServiceTest {
|
||||
|
||||
@Test
|
||||
public void shouldBuildMarkdownAndImageArtifacts() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("demo.xlsx", buildWorkbookBytes()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
|
||||
Assert.assertEquals(1, response.getResults().size());
|
||||
ParseResult result = response.getResults().get(0);
|
||||
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("images/sheet1/r2c2-001.png"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("图片文字描述"));
|
||||
Assert.assertEquals(1, result.getImages().size());
|
||||
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||
|
||||
XlsxParseArtifact artifact = extractXlsxArtifact(result);
|
||||
Assert.assertEquals("demo.xlsx", artifact.getWorkbookName());
|
||||
Assert.assertEquals(1, artifact.getSheets().size());
|
||||
Assert.assertEquals(1, artifact.getSheetImages().size());
|
||||
Assert.assertEquals(1, artifact.getCellImages().size());
|
||||
Assert.assertEquals("sheet1-r2c2-001", artifact.getCellImages().get(0).getReferenceKey());
|
||||
Assert.assertEquals("images/sheet1/r2c2-001.png", artifact.getCellImages().get(0).getSourcePath());
|
||||
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
|
||||
Assert.assertEquals("images/sheet1/r2c2-001.png", artifact.getSheetImages().get(0).getSourcePaths().get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldKeepImageKeysUniqueForNonAsciiSheetNames() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("unicode-sheets.xlsx", buildWorkbookBytesWithUnicodeSheetNames()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
ParseResult result = response.getResults().get(0);
|
||||
|
||||
Assert.assertEquals(2, result.getImages().size());
|
||||
Assert.assertNotEquals(result.getImages().get(0).getName(), result.getImages().get(1).getName());
|
||||
Assert.assertNotEquals(result.getImages().get(0).getSourcePath(), result.getImages().get(1).getSourcePath());
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:" + result.getImages().get(0).getName() + "]"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("[IMG:" + result.getImages().get(1).getName() + "]"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldDetectJpegMimeType() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("jpeg.xlsx", buildWorkbookBytesWithJpegImage()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
ParseResult result = response.getResults().get(0);
|
||||
|
||||
Assert.assertEquals(1, result.getImages().size());
|
||||
Assert.assertEquals("image/jpeg", result.getImages().get(0).getMimeType());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldAppendImageReferenceForImageOnlySheet() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet()));
|
||||
|
||||
ParseResponse response = service.parse(request);
|
||||
ParseResult result = response.getResults().get(0);
|
||||
XlsxParseArtifact artifact = extractXlsxArtifact(result);
|
||||
|
||||
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("_empty sheet_"));
|
||||
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明"));
|
||||
Assert.assertTrue(result.getMarkdown().contains(""));
|
||||
Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]"));
|
||||
Assert.assertEquals(1, result.getImages().size());
|
||||
Assert.assertEquals(1, artifact.getSheetImages().size());
|
||||
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
ManualExecutor executor = new ManualExecutor();
|
||||
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||
defaultProperties(),
|
||||
client,
|
||||
mapper,
|
||||
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executor)
|
||||
);
|
||||
|
||||
XlsxParseRequest request = new XlsxParseRequest();
|
||||
request.addFile(ParseFile.of("async.xlsx", buildWorkbookBytes()));
|
||||
|
||||
ParseTaskStatus submitted = service.submit(request);
|
||||
Assert.assertEquals("queued", submitted.getStatus());
|
||||
Assert.assertEquals("queued", submitted.getCurrentStage());
|
||||
Assert.assertEquals(Integer.valueOf(0), submitted.getProgressPercent());
|
||||
|
||||
ParseTaskInfo queuedInfo = service.queryTaskInfo(submitted.getTaskId());
|
||||
Assert.assertNull(queuedInfo.getResult());
|
||||
try {
|
||||
service.queryResult(submitted.getTaskId());
|
||||
Assert.fail("任务未完成时应抛出异常");
|
||||
} catch (DocumentParseException expected) {
|
||||
Assert.assertTrue(expected.getMessage().contains(submitted.getTaskId()));
|
||||
}
|
||||
|
||||
executor.runNext();
|
||||
|
||||
ParseTaskStatus completed = service.queryTask(submitted.getTaskId());
|
||||
Assert.assertEquals("completed", completed.getStatus());
|
||||
Assert.assertEquals("completed", completed.getCurrentStage());
|
||||
Assert.assertEquals(Integer.valueOf(100), completed.getProgressPercent());
|
||||
Assert.assertEquals("任务执行完成", completed.getStatusMessage());
|
||||
|
||||
ParseTaskInfo completedInfo = service.queryTaskInfo(submitted.getTaskId());
|
||||
Assert.assertNotNull(completedInfo.getResult());
|
||||
Assert.assertTrue(completedInfo.getResult().getResults().get(0).getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||
Assert.assertEquals(completedInfo.getResult(), service.queryResult(submitted.getTaskId()));
|
||||
}
|
||||
|
||||
private byte[] buildWorkbookBytes() throws Exception {
|
||||
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||
XSSFSheet sheet = workbook.createSheet("Sheet1");
|
||||
sheet.createRow(0).createCell(0).setCellValue("商品");
|
||||
sheet.getRow(0).createCell(1).setCellValue("图片");
|
||||
sheet.createRow(1).createCell(0).setCellValue("手机");
|
||||
addPicture(workbook, sheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||
return writeWorkbook(workbook);
|
||||
}
|
||||
|
||||
private byte[] buildWorkbookBytesWithUnicodeSheetNames() throws Exception {
|
||||
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||
|
||||
XSSFSheet detailSheet = workbook.createSheet("明细");
|
||||
detailSheet.createRow(0).createCell(0).setCellValue("图片");
|
||||
addPicture(workbook, detailSheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||
|
||||
XSSFSheet summarySheet = workbook.createSheet("汇总");
|
||||
summarySheet.createRow(0).createCell(0).setCellValue("图片");
|
||||
addPicture(workbook, summarySheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||
|
||||
return writeWorkbook(workbook);
|
||||
}
|
||||
|
||||
private byte[] buildWorkbookBytesWithJpegImage() throws Exception {
|
||||
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||
XSSFSheet sheet = workbook.createSheet("Sheet1");
|
||||
sheet.createRow(0).createCell(0).setCellValue("图片");
|
||||
addPicture(workbook, sheet, 1, 1, createImageBytes("jpg"), XSSFWorkbook.PICTURE_TYPE_JPEG);
|
||||
return writeWorkbook(workbook);
|
||||
}
|
||||
|
||||
private byte[] buildWorkbookBytesWithImageOnlySheet() throws Exception {
|
||||
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||
XSSFSheet sheet = workbook.createSheet("Sheet1");
|
||||
addPicture(workbook, sheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||
return writeWorkbook(workbook);
|
||||
}
|
||||
|
||||
private void addPicture(XSSFWorkbook workbook,
|
||||
XSSFSheet sheet,
|
||||
int rowIndex,
|
||||
int colIndex,
|
||||
byte[] imageBytes,
|
||||
int pictureType) {
|
||||
int pictureIndex = workbook.addPicture(imageBytes, pictureType);
|
||||
XSSFDrawing drawing = sheet.createDrawingPatriarch();
|
||||
ClientAnchor anchor = workbook.getCreationHelper().createClientAnchor();
|
||||
anchor.setRow1(rowIndex);
|
||||
anchor.setCol1(colIndex);
|
||||
anchor.setRow2(rowIndex + 1);
|
||||
anchor.setCol2(colIndex + 1);
|
||||
drawing.createPicture(anchor, pictureIndex);
|
||||
}
|
||||
|
||||
private byte[] writeWorkbook(XSSFWorkbook workbook) throws Exception {
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
workbook.write(outputStream);
|
||||
workbook.close();
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private byte[] createImageBytes(String format) throws Exception {
|
||||
BufferedImage image = new BufferedImage(2, 2, BufferedImage.TYPE_INT_RGB);
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
ImageIO.write(image, format, outputStream);
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private MineruProperties defaultProperties() {
|
||||
MineruProperties properties = new MineruProperties();
|
||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||
return properties;
|
||||
}
|
||||
|
||||
private Executor directExecutor() {
|
||||
return new Executor() {
|
||||
@Override
|
||||
public void execute(Runnable command) {
|
||||
command.run();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private XlsxParseArtifact extractXlsxArtifact(ParseResult result) {
|
||||
ParseArtifacts artifacts = result.getArtifacts();
|
||||
Assert.assertNotNull(artifacts);
|
||||
Object artifact = artifacts.getExtraJsonArtifacts().get("xlsx");
|
||||
Assert.assertTrue(artifact instanceof XlsxParseArtifact);
|
||||
return (XlsxParseArtifact) artifact;
|
||||
}
|
||||
|
||||
/**
|
||||
* 手动执行的测试执行器,用于验证异步任务状态流转。
|
||||
*/
|
||||
private static class ManualExecutor implements Executor {
|
||||
|
||||
private final Queue<Runnable> tasks = new ArrayDeque<Runnable>();
|
||||
|
||||
@Override
|
||||
public void execute(Runnable command) {
|
||||
tasks.offer(command);
|
||||
}
|
||||
|
||||
private void runNext() {
|
||||
Runnable task = tasks.poll();
|
||||
Assert.assertNotNull("应当存在待执行任务", task);
|
||||
task.run();
|
||||
}
|
||||
}
|
||||
|
||||
private static class RecordingClient extends MineruClient {
|
||||
|
||||
private RecordingClient(MineruProperties properties) {
|
||||
super(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
@Override
|
||||
public MineruResultPayload parse(com.easyagents.document.core.entity.ParseRequest request) {
|
||||
return new MineruMapper(testProperties()).toResultPayload(syncPayload());
|
||||
}
|
||||
|
||||
private JSONObject syncPayload() {
|
||||
JSONObject payload = new JSONObject();
|
||||
payload.put("backend", "vlm-http-client");
|
||||
payload.put("version", "3.0.9");
|
||||
JSONObject result = new JSONObject();
|
||||
result.put("md_content", "图片文字描述");
|
||||
JSONObject results = new JSONObject();
|
||||
results.put("image", result);
|
||||
payload.put("results", results);
|
||||
return payload;
|
||||
}
|
||||
|
||||
private static MineruProperties testProperties() {
|
||||
MineruProperties properties = new MineruProperties();
|
||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||
return properties;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -17,5 +17,7 @@
|
||||
<modules>
|
||||
<module>easy-agents-document-core</module>
|
||||
<module>easy-agents-document-pdf</module>
|
||||
<module>easy-agents-document-pptx</module>
|
||||
<module>easy-agents-document-xlsx</module>
|
||||
</modules>
|
||||
</project>
|
||||
|
||||
@@ -61,6 +61,16 @@
|
||||
<artifactId>easy-agents-document-pdf</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document-pptx</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document-xlsx</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-rag-ingestion</artifactId>
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
package com.easyagents.spring.boot.document.mineru;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 通用 MinerU 文档配置。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
@ConfigurationProperties(prefix = "easy-agents.document.mineru")
|
||||
public class CommonMineruDocumentProperties {
|
||||
|
||||
private String baseUrl;
|
||||
private Integer connectTimeoutMs = 3000;
|
||||
private Integer readTimeoutMs = 600000;
|
||||
private Integer writeTimeoutMs = 600000;
|
||||
private Integer pollIntervalMs = 1000;
|
||||
private Integer resultTimeoutMs = 1800000;
|
||||
private String defaultBackend = "vlm-http-client";
|
||||
private String defaultParseMethod = "auto";
|
||||
private List<String> defaultLangList = new ArrayList<String>(Arrays.asList("ch"));
|
||||
private Boolean defaultFormulaEnable = true;
|
||||
private Boolean defaultTableEnable = true;
|
||||
|
||||
public String getBaseUrl() {
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
public void setBaseUrl(String baseUrl) {
|
||||
this.baseUrl = baseUrl;
|
||||
}
|
||||
|
||||
public Integer getConnectTimeoutMs() {
|
||||
return connectTimeoutMs;
|
||||
}
|
||||
|
||||
public void setConnectTimeoutMs(Integer connectTimeoutMs) {
|
||||
this.connectTimeoutMs = connectTimeoutMs;
|
||||
}
|
||||
|
||||
public Integer getReadTimeoutMs() {
|
||||
return readTimeoutMs;
|
||||
}
|
||||
|
||||
public void setReadTimeoutMs(Integer readTimeoutMs) {
|
||||
this.readTimeoutMs = readTimeoutMs;
|
||||
}
|
||||
|
||||
public Integer getWriteTimeoutMs() {
|
||||
return writeTimeoutMs;
|
||||
}
|
||||
|
||||
public void setWriteTimeoutMs(Integer writeTimeoutMs) {
|
||||
this.writeTimeoutMs = writeTimeoutMs;
|
||||
}
|
||||
|
||||
public Integer getPollIntervalMs() {
|
||||
return pollIntervalMs;
|
||||
}
|
||||
|
||||
public void setPollIntervalMs(Integer pollIntervalMs) {
|
||||
this.pollIntervalMs = pollIntervalMs;
|
||||
}
|
||||
|
||||
public Integer getResultTimeoutMs() {
|
||||
return resultTimeoutMs;
|
||||
}
|
||||
|
||||
public void setResultTimeoutMs(Integer resultTimeoutMs) {
|
||||
this.resultTimeoutMs = resultTimeoutMs;
|
||||
}
|
||||
|
||||
public String getDefaultBackend() {
|
||||
return defaultBackend;
|
||||
}
|
||||
|
||||
public void setDefaultBackend(String defaultBackend) {
|
||||
this.defaultBackend = defaultBackend;
|
||||
}
|
||||
|
||||
public String getDefaultParseMethod() {
|
||||
return defaultParseMethod;
|
||||
}
|
||||
|
||||
public void setDefaultParseMethod(String defaultParseMethod) {
|
||||
this.defaultParseMethod = defaultParseMethod;
|
||||
}
|
||||
|
||||
public List<String> getDefaultLangList() {
|
||||
return defaultLangList;
|
||||
}
|
||||
|
||||
public void setDefaultLangList(List<String> defaultLangList) {
|
||||
this.defaultLangList = defaultLangList == null
|
||||
? new ArrayList<String>(Arrays.asList("ch"))
|
||||
: defaultLangList;
|
||||
}
|
||||
|
||||
public Boolean getDefaultFormulaEnable() {
|
||||
return defaultFormulaEnable;
|
||||
}
|
||||
|
||||
public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) {
|
||||
this.defaultFormulaEnable = defaultFormulaEnable;
|
||||
}
|
||||
|
||||
public Boolean getDefaultTableEnable() {
|
||||
return defaultTableEnable;
|
||||
}
|
||||
|
||||
public void setDefaultTableEnable(Boolean defaultTableEnable) {
|
||||
this.defaultTableEnable = defaultTableEnable;
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,11 @@
|
||||
package com.easyagents.spring.boot.document.pdf.mineru;
|
||||
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.pdf.PdfDocumentParseService;
|
||||
import com.easyagents.document.pdf.mineru.MineruPdfDocumentParseService;
|
||||
import com.easyagents.document.pdf.mineru.MineruProperties;
|
||||
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
@@ -20,7 +22,7 @@ import org.springframework.context.annotation.Configuration;
|
||||
@Configuration(proxyBeanMethods = false)
|
||||
@ConditionalOnClass(MineruPdfDocumentParseService.class)
|
||||
@ConditionalOnProperty(prefix = "easy-agents.document.pdf", name = "provider", havingValue = "mineru")
|
||||
@EnableConfigurationProperties(MineruDocumentProperties.class)
|
||||
@EnableConfigurationProperties({MineruDocumentProperties.class, CommonMineruDocumentProperties.class})
|
||||
public class MineruPdfAutoConfiguration {
|
||||
|
||||
/**
|
||||
@@ -31,8 +33,9 @@ public class MineruPdfAutoConfiguration {
|
||||
*/
|
||||
@Bean
|
||||
@ConditionalOnMissingBean(PdfDocumentParseService.class)
|
||||
public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties) {
|
||||
return new MineruPdfDocumentParseService(toMineruProperties(properties));
|
||||
public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties,
|
||||
CommonMineruDocumentProperties commonProperties) {
|
||||
return new MineruPdfDocumentParseService(toMineruProperties(properties, commonProperties));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -47,19 +50,21 @@ public class MineruPdfAutoConfiguration {
|
||||
return pdfDocumentParseService;
|
||||
}
|
||||
|
||||
private MineruProperties toMineruProperties(MineruDocumentProperties properties) {
|
||||
private MineruProperties toMineruProperties(MineruDocumentProperties properties,
|
||||
CommonMineruDocumentProperties commonProperties) {
|
||||
MineruProperties mineruProperties = new MineruProperties();
|
||||
mineruProperties.setBaseUrl(properties.getBaseUrl());
|
||||
mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs());
|
||||
mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs());
|
||||
mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs());
|
||||
mineruProperties.setPollIntervalMs(properties.getPollIntervalMs());
|
||||
mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs());
|
||||
mineruProperties.setDefaultBackend(properties.getDefaultBackend());
|
||||
mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod());
|
||||
mineruProperties.setDefaultLangList(properties.getDefaultLangList());
|
||||
mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable());
|
||||
mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable());
|
||||
boolean useCommon = commonProperties != null && StringUtil.hasText(commonProperties.getBaseUrl());
|
||||
mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : properties.getBaseUrl());
|
||||
mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : properties.getConnectTimeoutMs());
|
||||
mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : properties.getReadTimeoutMs());
|
||||
mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : properties.getWriteTimeoutMs());
|
||||
mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : properties.getPollIntervalMs());
|
||||
mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : properties.getResultTimeoutMs());
|
||||
mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : properties.getDefaultBackend());
|
||||
mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : properties.getDefaultParseMethod());
|
||||
mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : properties.getDefaultLangList());
|
||||
mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : properties.getDefaultFormulaEnable());
|
||||
mineruProperties.setDefaultTableEnable(useCommon ? commonProperties.getDefaultTableEnable() : properties.getDefaultTableEnable());
|
||||
return mineruProperties;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
package com.easyagents.spring.boot.document.pptx;
|
||||
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.pptx.PptxDocumentParseService;
|
||||
import com.easyagents.document.pptx.mineru.MineruPptxDocumentParseService;
|
||||
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
/**
|
||||
* MinerU PPTX 自动装配。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
@Configuration(proxyBeanMethods = false)
|
||||
@ConditionalOnClass(MineruPptxDocumentParseService.class)
|
||||
@ConditionalOnProperty(prefix = "easy-agents.document.pptx", name = "enabled", havingValue = "true")
|
||||
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, PptxDocumentProperties.class})
|
||||
public class MineruPptxAutoConfiguration {
|
||||
|
||||
@Bean
|
||||
@ConditionalOnMissingBean(name = "pptxDocumentAsyncTaskManager")
|
||||
public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager(PptxDocumentProperties properties) {
|
||||
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads();
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
|
||||
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
|
||||
}
|
||||
|
||||
@Bean
|
||||
@ConditionalOnMissingBean(PptxDocumentParseService.class)
|
||||
public PptxDocumentParseService pptxDocumentParseService(CommonMineruDocumentProperties commonProperties,
|
||||
DocumentAsyncTaskManager pptxDocumentAsyncTaskManager) {
|
||||
return new MineruPptxDocumentParseService(toMineruProperties(commonProperties), pptxDocumentAsyncTaskManager);
|
||||
}
|
||||
|
||||
private MineruProperties toMineruProperties(CommonMineruDocumentProperties properties) {
|
||||
MineruProperties mineruProperties = new MineruProperties();
|
||||
mineruProperties.setBaseUrl(properties.getBaseUrl());
|
||||
mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs());
|
||||
mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs());
|
||||
mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs());
|
||||
mineruProperties.setPollIntervalMs(properties.getPollIntervalMs());
|
||||
mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs());
|
||||
mineruProperties.setDefaultBackend(properties.getDefaultBackend());
|
||||
mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod());
|
||||
mineruProperties.setDefaultLangList(properties.getDefaultLangList());
|
||||
mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable());
|
||||
mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable());
|
||||
return mineruProperties;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
package com.easyagents.spring.boot.document.pptx;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
/**
|
||||
* PPTX 文档配置。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
@ConfigurationProperties(prefix = "easy-agents.document.pptx")
|
||||
public class PptxDocumentProperties {
|
||||
|
||||
private Boolean enabled = false;
|
||||
private Integer asyncThreads = 2;
|
||||
|
||||
public Boolean getEnabled() {
|
||||
return enabled;
|
||||
}
|
||||
|
||||
public void setEnabled(Boolean enabled) {
|
||||
this.enabled = enabled;
|
||||
}
|
||||
|
||||
public Integer getAsyncThreads() {
|
||||
return asyncThreads;
|
||||
}
|
||||
|
||||
public void setAsyncThreads(Integer asyncThreads) {
|
||||
this.asyncThreads = asyncThreads;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
package com.easyagents.spring.boot.document.xlsx;
|
||||
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||
import com.easyagents.document.core.mineru.MineruProperties;
|
||||
import com.easyagents.document.xlsx.XlsxDocumentParseService;
|
||||
import com.easyagents.document.xlsx.mineru.MineruXlsxDocumentParseService;
|
||||
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
/**
|
||||
* MinerU XLSX 自动装配。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
@Configuration(proxyBeanMethods = false)
|
||||
@ConditionalOnClass(MineruXlsxDocumentParseService.class)
|
||||
@ConditionalOnProperty(prefix = "easy-agents.document.xlsx", name = "enabled", havingValue = "true")
|
||||
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, XlsxDocumentProperties.class})
|
||||
public class MineruXlsxAutoConfiguration {
|
||||
|
||||
@Bean
|
||||
@ConditionalOnMissingBean(name = "xlsxDocumentAsyncTaskManager")
|
||||
public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager(XlsxDocumentProperties properties) {
|
||||
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads();
|
||||
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
|
||||
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
|
||||
}
|
||||
|
||||
@Bean
|
||||
@ConditionalOnMissingBean(XlsxDocumentParseService.class)
|
||||
public XlsxDocumentParseService xlsxDocumentParseService(CommonMineruDocumentProperties commonProperties,
|
||||
DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager) {
|
||||
return new MineruXlsxDocumentParseService(toMineruProperties(commonProperties), xlsxDocumentAsyncTaskManager);
|
||||
}
|
||||
|
||||
private MineruProperties toMineruProperties(CommonMineruDocumentProperties properties) {
|
||||
MineruProperties mineruProperties = new MineruProperties();
|
||||
mineruProperties.setBaseUrl(properties.getBaseUrl());
|
||||
mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs());
|
||||
mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs());
|
||||
mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs());
|
||||
mineruProperties.setPollIntervalMs(properties.getPollIntervalMs());
|
||||
mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs());
|
||||
mineruProperties.setDefaultBackend(properties.getDefaultBackend());
|
||||
mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod());
|
||||
mineruProperties.setDefaultLangList(properties.getDefaultLangList());
|
||||
mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable());
|
||||
mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable());
|
||||
return mineruProperties;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
package com.easyagents.spring.boot.document.xlsx;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
/**
|
||||
* XLSX 文档配置。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
@ConfigurationProperties(prefix = "easy-agents.document.xlsx")
|
||||
public class XlsxDocumentProperties {
|
||||
|
||||
private Boolean enabled = false;
|
||||
private Integer asyncThreads = 2;
|
||||
|
||||
public Boolean getEnabled() {
|
||||
return enabled;
|
||||
}
|
||||
|
||||
public void setEnabled(Boolean enabled) {
|
||||
this.enabled = enabled;
|
||||
}
|
||||
|
||||
public Integer getAsyncThreads() {
|
||||
return asyncThreads;
|
||||
}
|
||||
|
||||
public void setAsyncThreads(Integer asyncThreads) {
|
||||
this.asyncThreads = asyncThreads;
|
||||
}
|
||||
}
|
||||
@@ -9,3 +9,5 @@ com.easyagents.spring.boot.store.elasticsearch.ElasticSearchAutoConfiguration
|
||||
com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration
|
||||
com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration
|
||||
com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration
|
||||
com.easyagents.spring.boot.document.pptx.MineruPptxAutoConfiguration
|
||||
com.easyagents.spring.boot.document.xlsx.MineruXlsxAutoConfiguration
|
||||
|
||||
@@ -2,8 +2,12 @@ package com.easyagents.spring.boot.autoconfigure;
|
||||
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.pdf.PdfDocumentParseService;
|
||||
import com.easyagents.document.pptx.PptxDocumentParseService;
|
||||
import com.easyagents.document.xlsx.XlsxDocumentParseService;
|
||||
import com.easyagents.llm.ollama.OllamaChatModel;
|
||||
import com.easyagents.spring.boot.document.pptx.MineruPptxAutoConfiguration;
|
||||
import com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration;
|
||||
import com.easyagents.spring.boot.document.xlsx.MineruXlsxAutoConfiguration;
|
||||
import com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration;
|
||||
import com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration;
|
||||
import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration;
|
||||
@@ -18,7 +22,9 @@ public class StarterConditionalAutoConfigurationTest {
|
||||
RagIngestionAutoConfiguration.class,
|
||||
OllamaAutoConfiguration.class,
|
||||
OpenSearchAutoConfiguration.class,
|
||||
MineruPdfAutoConfiguration.class
|
||||
MineruPdfAutoConfiguration.class,
|
||||
MineruPptxAutoConfiguration.class,
|
||||
MineruXlsxAutoConfiguration.class
|
||||
);
|
||||
|
||||
@Test
|
||||
@@ -51,4 +57,19 @@ public class StarterConditionalAutoConfigurationTest {
|
||||
Assert.assertNotNull(context.getBean(DocumentParseService.class));
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldCreatePptxAndXlsxBeansWhenEnabled() {
|
||||
contextRunner
|
||||
.withPropertyValues(
|
||||
"easy-agents.document.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api",
|
||||
"easy-agents.document.pptx.enabled=true",
|
||||
"easy-agents.document.xlsx.enabled=true"
|
||||
)
|
||||
.run(context -> {
|
||||
Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
|
||||
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
|
||||
Assert.assertFalse(context.containsBean("documentParseService"));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
12
pom.xml
12
pom.xml
@@ -132,6 +132,18 @@
|
||||
<version>${revision}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document-pptx</artifactId>
|
||||
<version>${revision}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-document-xlsx</artifactId>
|
||||
<version>${revision}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-rag-core</artifactId>
|
||||
|
||||
Reference in New Issue
Block a user