feat: 扩展 Office 文档解析能力
- 重构 document-core 与 MinerU 公共层,补齐 Office 异步任务基础设施
- 新增 PPTX/XLSX 解析模块与 starter 自动装配
- 补充 README 与相关测试覆盖
This commit is contained in:
@@ -15,7 +15,7 @@ Easy-Agents 是一个轻量、可扩展的 Java AI 应用开发框架,覆盖
|
|||||||
|
|
||||||
- `easy-agents-bom`:依赖版本管理(BOM)。
|
- `easy-agents-bom`:依赖版本管理(BOM)。
|
||||||
- `easy-agents-core`:核心抽象与基础能力。
|
- `easy-agents-core`:核心抽象与基础能力。
|
||||||
- `easy-agents-document`:统一文档解析能力域,当前提供 PDF 解析抽象与 MinerU provider。
|
- `easy-agents-document`:统一文档解析能力域,当前提供 PDF、PPTX、XLSX 解析抽象与 MinerU 复用能力。
|
||||||
- `easy-agents-chat`:对话模型接入实现集合。
|
- `easy-agents-chat`:对话模型接入实现集合。
|
||||||
- `easy-agents-embedding`:向量化模型实现集合。
|
- `easy-agents-embedding`:向量化模型实现集合。
|
||||||
- `easy-agents-rerank`:重排模型实现集合。
|
- `easy-agents-rerank`:重排模型实现集合。
|
||||||
|
|||||||
@@ -66,6 +66,16 @@
|
|||||||
<artifactId>easy-agents-document-pdf</artifactId>
|
<artifactId>easy-agents-document-pdf</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document-pptx</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document-xlsx</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.easyagents</groupId>
|
<groupId>com.easyagents</groupId>
|
||||||
<artifactId>easy-agents-rag-core</artifactId>
|
<artifactId>easy-agents-rag-core</artifactId>
|
||||||
|
|||||||
@@ -24,5 +24,26 @@
|
|||||||
<groupId>com.easyagents</groupId>
|
<groupId>com.easyagents</groupId>
|
||||||
<artifactId>easy-agents-core</artifactId>
|
<artifactId>easy-agents-core</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.alibaba.fastjson2</groupId>
|
||||||
|
<artifactId>fastjson2</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.squareup.okhttp3</groupId>
|
||||||
|
<artifactId>okhttp</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.slf4j</groupId>
|
||||||
|
<artifactId>slf4j-api</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</project>
|
</project>
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
package com.easyagents.document.core;
|
package com.easyagents.document.core;
|
||||||
|
|
||||||
import com.easyagents.document.core.model.ParseRequest;
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
import com.easyagents.document.core.model.ParseResponse;
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 统一文档解析服务抽象。
|
* 统一文档解析服务抽象。
|
||||||
@@ -11,7 +11,7 @@ import com.easyagents.document.core.model.ParseTaskStatus;
|
|||||||
* @author Codex
|
* @author Codex
|
||||||
* @since 2026-04-14
|
* @since 2026-04-14
|
||||||
*/
|
*/
|
||||||
public interface DocumentParseService {
|
public interface DocumentParseService<R extends ParseRequest> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 同步解析文档并直接返回结果。
|
* 同步解析文档并直接返回结果。
|
||||||
|
|||||||
@@ -0,0 +1,173 @@
|
|||||||
|
package com.easyagents.document.core.async;
|
||||||
|
|
||||||
|
import com.easyagents.core.util.StringUtil;
|
||||||
|
import com.easyagents.document.core.exception.DocumentParseException;
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
|
||||||
|
import java.time.Instant;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.UUID;
|
||||||
|
import java.util.concurrent.Executor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 文档异步任务管理器。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class DocumentAsyncTaskManager {
|
||||||
|
|
||||||
|
private final DocumentAsyncTaskRepository repository;
|
||||||
|
private final Executor executor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 创建任务管理器。
|
||||||
|
*
|
||||||
|
* @param repository 任务仓库
|
||||||
|
* @param executor 执行器
|
||||||
|
*/
|
||||||
|
public DocumentAsyncTaskManager(DocumentAsyncTaskRepository repository, Executor executor) {
|
||||||
|
if (repository == null) {
|
||||||
|
throw new IllegalArgumentException("DocumentAsyncTaskRepository must not be null");
|
||||||
|
}
|
||||||
|
if (executor == null) {
|
||||||
|
throw new IllegalArgumentException("Executor must not be null");
|
||||||
|
}
|
||||||
|
this.repository = repository;
|
||||||
|
this.executor = executor;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 提交异步任务。
|
||||||
|
*
|
||||||
|
* @param backend 后端标识
|
||||||
|
* @param fileNames 文件名列表
|
||||||
|
* @param runner 任务执行器
|
||||||
|
* @return 初始任务状态
|
||||||
|
*/
|
||||||
|
public ParseTaskStatus submit(String backend, List<String> fileNames, final DocumentAsyncTaskRunner runner) {
|
||||||
|
final String taskId = UUID.randomUUID().toString();
|
||||||
|
final ParseTaskStatus status = new ParseTaskStatus();
|
||||||
|
status.setTaskId(taskId);
|
||||||
|
status.setStatus("queued");
|
||||||
|
status.setBackend(backend);
|
||||||
|
status.setFileNames(fileNames == null ? new ArrayList<String>() : new ArrayList<String>(fileNames));
|
||||||
|
status.setCreatedAt(Instant.now().toString());
|
||||||
|
status.setCurrentStage("queued");
|
||||||
|
status.setProgressPercent(0);
|
||||||
|
status.setProcessedItems(0);
|
||||||
|
status.setTotalItems(fileNames == null ? 0 : fileNames.size());
|
||||||
|
status.setStatusMessage("任务已进入队列");
|
||||||
|
|
||||||
|
final DocumentAsyncTaskRecord record = new DocumentAsyncTaskRecord(status);
|
||||||
|
repository.save(record);
|
||||||
|
|
||||||
|
executor.execute(new Runnable() {
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
markRunning(record);
|
||||||
|
try {
|
||||||
|
ParseResponse response = runner.run(new RepositoryBackedTaskUpdater(record));
|
||||||
|
ParseTaskStatus completed = record.getStatusSnapshot();
|
||||||
|
completed.setStatus("completed");
|
||||||
|
completed.setCompletedAt(Instant.now().toString());
|
||||||
|
completed.setProgressPercent(100);
|
||||||
|
completed.setCurrentStage("completed");
|
||||||
|
completed.setStatusMessage("任务执行完成");
|
||||||
|
record.setResult(response);
|
||||||
|
record.updateStatus(completed);
|
||||||
|
} catch (Exception exception) {
|
||||||
|
ParseTaskStatus failed = record.getStatusSnapshot();
|
||||||
|
failed.setStatus("failed");
|
||||||
|
failed.setCompletedAt(Instant.now().toString());
|
||||||
|
failed.setCurrentStage("failed");
|
||||||
|
failed.setStatusMessage(exception.getMessage());
|
||||||
|
failed.setError(exception.getMessage());
|
||||||
|
record.updateStatus(failed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
return record.getStatusSnapshot();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 查询任务状态。
|
||||||
|
*
|
||||||
|
* @param taskId 任务 ID
|
||||||
|
* @return 任务状态
|
||||||
|
*/
|
||||||
|
public ParseTaskStatus queryTask(String taskId) {
|
||||||
|
return requireRecord(taskId).getStatusSnapshot();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 查询任务聚合信息。
|
||||||
|
*
|
||||||
|
* @param taskId 任务 ID
|
||||||
|
* @return 聚合信息
|
||||||
|
*/
|
||||||
|
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||||
|
return requireRecord(taskId).getTaskInfoSnapshot();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取任务结果。
|
||||||
|
*
|
||||||
|
* @param taskId 任务 ID
|
||||||
|
* @return 任务结果
|
||||||
|
*/
|
||||||
|
public ParseResponse queryResult(String taskId) {
|
||||||
|
DocumentAsyncTaskRecord record = requireRecord(taskId);
|
||||||
|
ParseTaskStatus status = record.getStatusSnapshot();
|
||||||
|
if (!"completed".equalsIgnoreCase(status.getStatus())) {
|
||||||
|
throw new DocumentParseException("Document async task is not completed: " + taskId);
|
||||||
|
}
|
||||||
|
return record.getResult();
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentAsyncTaskRecord requireRecord(String taskId) {
|
||||||
|
if (!StringUtil.hasText(taskId)) {
|
||||||
|
throw new IllegalArgumentException("taskId must not be empty");
|
||||||
|
}
|
||||||
|
DocumentAsyncTaskRecord record = repository.find(taskId);
|
||||||
|
if (record == null) {
|
||||||
|
throw new DocumentParseException("Document async task not found: " + taskId);
|
||||||
|
}
|
||||||
|
return record;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void markRunning(DocumentAsyncTaskRecord record) {
|
||||||
|
ParseTaskStatus status = record.getStatusSnapshot();
|
||||||
|
status.setStatus("preparing");
|
||||||
|
status.setStartedAt(Instant.now().toString());
|
||||||
|
status.setCurrentStage("preparing");
|
||||||
|
status.setProgressPercent(0);
|
||||||
|
status.setStatusMessage("任务开始执行");
|
||||||
|
record.updateStatus(status);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class RepositoryBackedTaskUpdater implements DocumentAsyncTaskUpdater {
|
||||||
|
|
||||||
|
private final DocumentAsyncTaskRecord record;
|
||||||
|
|
||||||
|
private RepositoryBackedTaskUpdater(DocumentAsyncTaskRecord record) {
|
||||||
|
this.record = record;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void update(String stage, Integer progressPercent, Integer processedItems, Integer totalItems, String statusMessage) {
|
||||||
|
ParseTaskStatus status = record.getStatusSnapshot();
|
||||||
|
status.setStatus("completed".equalsIgnoreCase(stage) ? "completed" : "running");
|
||||||
|
status.setCurrentStage(stage);
|
||||||
|
status.setProgressPercent(progressPercent);
|
||||||
|
status.setProcessedItems(processedItems);
|
||||||
|
status.setTotalItems(totalItems);
|
||||||
|
status.setStatusMessage(statusMessage);
|
||||||
|
record.updateStatus(status);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,101 @@
|
|||||||
|
package com.easyagents.document.core.async;
|
||||||
|
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 文档异步任务记录。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class DocumentAsyncTaskRecord {
|
||||||
|
|
||||||
|
private final ParseTaskStatus status;
|
||||||
|
private ParseResponse result;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 创建任务记录。
|
||||||
|
*
|
||||||
|
* @param status 初始状态
|
||||||
|
*/
|
||||||
|
public DocumentAsyncTaskRecord(ParseTaskStatus status) {
|
||||||
|
this.status = status;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取状态快照。
|
||||||
|
*
|
||||||
|
* @return 状态快照
|
||||||
|
*/
|
||||||
|
public synchronized ParseTaskStatus getStatusSnapshot() {
|
||||||
|
return copyStatus(status);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取聚合信息快照。
|
||||||
|
*
|
||||||
|
* @return 聚合信息
|
||||||
|
*/
|
||||||
|
public synchronized ParseTaskInfo getTaskInfoSnapshot() {
|
||||||
|
ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(status);
|
||||||
|
taskInfo.setResult(result);
|
||||||
|
return taskInfo;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取结果。
|
||||||
|
*
|
||||||
|
* @return 最终结果
|
||||||
|
*/
|
||||||
|
public synchronized ParseResponse getResult() {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 更新任务状态。
|
||||||
|
*
|
||||||
|
* @param newStatus 新状态
|
||||||
|
*/
|
||||||
|
public synchronized void updateStatus(ParseTaskStatus newStatus) {
|
||||||
|
if (newStatus == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
copyInto(newStatus, status);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 更新任务结果。
|
||||||
|
*
|
||||||
|
* @param result 最终结果
|
||||||
|
*/
|
||||||
|
public synchronized void setResult(ParseResponse result) {
|
||||||
|
this.result = result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private ParseTaskStatus copyStatus(ParseTaskStatus source) {
|
||||||
|
ParseTaskStatus copy = new ParseTaskStatus();
|
||||||
|
copyInto(source, copy);
|
||||||
|
return copy;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void copyInto(ParseTaskStatus source, ParseTaskStatus target) {
|
||||||
|
target.setTaskId(source.getTaskId());
|
||||||
|
target.setStatus(source.getStatus());
|
||||||
|
target.setBackend(source.getBackend());
|
||||||
|
target.setFileNames(source.getFileNames());
|
||||||
|
target.setCreatedAt(source.getCreatedAt());
|
||||||
|
target.setStartedAt(source.getStartedAt());
|
||||||
|
target.setCompletedAt(source.getCompletedAt());
|
||||||
|
target.setError(source.getError());
|
||||||
|
target.setStatusUrl(source.getStatusUrl());
|
||||||
|
target.setResultUrl(source.getResultUrl());
|
||||||
|
target.setQueuedAhead(source.getQueuedAhead());
|
||||||
|
target.setProgressPercent(source.getProgressPercent());
|
||||||
|
target.setCurrentStage(source.getCurrentStage());
|
||||||
|
target.setProcessedItems(source.getProcessedItems());
|
||||||
|
target.setTotalItems(source.getTotalItems());
|
||||||
|
target.setStatusMessage(source.getStatusMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,25 @@
|
|||||||
|
package com.easyagents.document.core.async;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 文档异步任务仓库。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public interface DocumentAsyncTaskRepository {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 保存任务记录。
|
||||||
|
*
|
||||||
|
* @param record 任务记录
|
||||||
|
*/
|
||||||
|
void save(DocumentAsyncTaskRecord record);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取任务记录。
|
||||||
|
*
|
||||||
|
* @param taskId 任务 ID
|
||||||
|
* @return 任务记录,不存在时返回 {@code null}
|
||||||
|
*/
|
||||||
|
DocumentAsyncTaskRecord find(String taskId);
|
||||||
|
}
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
package com.easyagents.document.core.async;
|
||||||
|
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 文档异步任务执行器。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public interface DocumentAsyncTaskRunner {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 执行任务。
|
||||||
|
*
|
||||||
|
* @param updater 状态更新器
|
||||||
|
* @return 解析结果
|
||||||
|
* @throws Exception 执行异常
|
||||||
|
*/
|
||||||
|
ParseResponse run(DocumentAsyncTaskUpdater updater) throws Exception;
|
||||||
|
}
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
package com.easyagents.document.core.async;
|
||||||
|
|
||||||
|
/**
 * Callback through which a running document task reports its progress.
 *
 * @author Codex
 * @since 2026-04-16
 */
public interface DocumentAsyncTaskUpdater {

    /**
     * Reports the current progress of the task.
     *
     * @param stage           name of the current stage
     * @param progressPercent progress as a percentage
     * @param processedItems  number of items processed so far
     * @param totalItems      total number of items
     * @param statusMessage   human-readable status description
     */
    void update(String stage, Integer progressPercent, Integer processedItems, Integer totalItems, String statusMessage);
}
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
package com.easyagents.document.core.async;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 基于内存的异步任务仓库。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class InMemoryDocumentAsyncTaskRepository implements DocumentAsyncTaskRepository {
|
||||||
|
|
||||||
|
private final Map<String, DocumentAsyncTaskRecord> records = new ConcurrentHashMap<String, DocumentAsyncTaskRecord>();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void save(DocumentAsyncTaskRecord record) {
|
||||||
|
if (record == null || record.getStatusSnapshot() == null || record.getStatusSnapshot().getTaskId() == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
records.put(record.getStatusSnapshot().getTaskId(), record);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocumentAsyncTaskRecord find(String taskId) {
|
||||||
|
return records.get(taskId);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -16,6 +16,7 @@ public class DocumentImage {
|
|||||||
private String mimeType;
|
private String mimeType;
|
||||||
private String sourcePath;
|
private String sourcePath;
|
||||||
private String dataUrl;
|
private String dataUrl;
|
||||||
|
private byte[] content;
|
||||||
private List<Double> boundingBox = new ArrayList<Double>();
|
private List<Double> boundingBox = new ArrayList<Double>();
|
||||||
private List<String> captions = new ArrayList<String>();
|
private List<String> captions = new ArrayList<String>();
|
||||||
private List<String> footnotes = new ArrayList<String>();
|
private List<String> footnotes = new ArrayList<String>();
|
||||||
@@ -60,6 +61,14 @@ public class DocumentImage {
|
|||||||
this.dataUrl = dataUrl;
|
this.dataUrl = dataUrl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public byte[] getContent() {
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setContent(byte[] content) {
|
||||||
|
this.content = content;
|
||||||
|
}
|
||||||
|
|
||||||
public List<Double> getBoundingBox() {
|
public List<Double> getBoundingBox() {
|
||||||
return boundingBox;
|
return boundingBox;
|
||||||
}
|
}
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -13,12 +13,7 @@ public class ParseRequest {
|
|||||||
|
|
||||||
private List<ParseFile> files = new ArrayList<ParseFile>();
|
private List<ParseFile> files = new ArrayList<ParseFile>();
|
||||||
private String backend;
|
private String backend;
|
||||||
private String parseMethod = "auto";
|
|
||||||
private List<String> languages = new ArrayList<String>();
|
private List<String> languages = new ArrayList<String>();
|
||||||
private Boolean formulaEnabled = true;
|
|
||||||
private Boolean tableEnabled = true;
|
|
||||||
private Integer startPageIndex = 0;
|
|
||||||
private Integer endPageIndex = 99999;
|
|
||||||
private Boolean returnMarkdown = true;
|
private Boolean returnMarkdown = true;
|
||||||
private Boolean returnMiddleJson = true;
|
private Boolean returnMiddleJson = true;
|
||||||
private Boolean returnContentList = true;
|
private Boolean returnContentList = true;
|
||||||
@@ -38,6 +33,25 @@ public class ParseRequest {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 复制通用字段到目标请求。
|
||||||
|
*
|
||||||
|
* @param target 目标请求
|
||||||
|
*/
|
||||||
|
public void copyCommonFieldsTo(ParseRequest target) {
|
||||||
|
if (target == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
target.setFiles(new ArrayList<ParseFile>(getFiles()));
|
||||||
|
target.setBackend(getBackend());
|
||||||
|
target.setLanguages(new ArrayList<String>(getLanguages()));
|
||||||
|
target.setReturnMarkdown(getReturnMarkdown());
|
||||||
|
target.setReturnMiddleJson(getReturnMiddleJson());
|
||||||
|
target.setReturnContentList(getReturnContentList());
|
||||||
|
target.setReturnModelOutput(getReturnModelOutput());
|
||||||
|
target.setReturnImages(getReturnImages());
|
||||||
|
}
|
||||||
|
|
||||||
public List<ParseFile> getFiles() {
|
public List<ParseFile> getFiles() {
|
||||||
return files;
|
return files;
|
||||||
}
|
}
|
||||||
@@ -54,14 +68,6 @@ public class ParseRequest {
|
|||||||
this.backend = backend;
|
this.backend = backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getParseMethod() {
|
|
||||||
return parseMethod;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setParseMethod(String parseMethod) {
|
|
||||||
this.parseMethod = parseMethod;
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<String> getLanguages() {
|
public List<String> getLanguages() {
|
||||||
return languages;
|
return languages;
|
||||||
}
|
}
|
||||||
@@ -70,38 +76,6 @@ public class ParseRequest {
|
|||||||
this.languages = languages == null ? new ArrayList<String>() : languages;
|
this.languages = languages == null ? new ArrayList<String>() : languages;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Boolean getFormulaEnabled() {
|
|
||||||
return formulaEnabled;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setFormulaEnabled(Boolean formulaEnabled) {
|
|
||||||
this.formulaEnabled = formulaEnabled;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Boolean getTableEnabled() {
|
|
||||||
return tableEnabled;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setTableEnabled(Boolean tableEnabled) {
|
|
||||||
this.tableEnabled = tableEnabled;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer getStartPageIndex() {
|
|
||||||
return startPageIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setStartPageIndex(Integer startPageIndex) {
|
|
||||||
this.startPageIndex = startPageIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer getEndPageIndex() {
|
|
||||||
return endPageIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setEndPageIndex(Integer endPageIndex) {
|
|
||||||
this.endPageIndex = endPageIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Boolean getReturnMarkdown() {
|
public Boolean getReturnMarkdown() {
|
||||||
return returnMarkdown;
|
return returnMarkdown;
|
||||||
}
|
}
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 异步任务聚合查询结果。
|
* 异步任务聚合查询结果。
|
||||||
@@ -35,6 +35,11 @@ public class ParseTaskInfo extends ParseTaskStatus {
|
|||||||
taskInfo.setStatusUrl(status.getStatusUrl());
|
taskInfo.setStatusUrl(status.getStatusUrl());
|
||||||
taskInfo.setResultUrl(status.getResultUrl());
|
taskInfo.setResultUrl(status.getResultUrl());
|
||||||
taskInfo.setQueuedAhead(status.getQueuedAhead());
|
taskInfo.setQueuedAhead(status.getQueuedAhead());
|
||||||
|
taskInfo.setProgressPercent(status.getProgressPercent());
|
||||||
|
taskInfo.setCurrentStage(status.getCurrentStage());
|
||||||
|
taskInfo.setProcessedItems(status.getProcessedItems());
|
||||||
|
taskInfo.setTotalItems(status.getTotalItems());
|
||||||
|
taskInfo.setStatusMessage(status.getStatusMessage());
|
||||||
return taskInfo;
|
return taskInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.core.model;
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -22,6 +22,11 @@ public class ParseTaskStatus {
|
|||||||
private String statusUrl;
|
private String statusUrl;
|
||||||
private String resultUrl;
|
private String resultUrl;
|
||||||
private Integer queuedAhead;
|
private Integer queuedAhead;
|
||||||
|
private Integer progressPercent;
|
||||||
|
private String currentStage;
|
||||||
|
private Integer processedItems;
|
||||||
|
private Integer totalItems;
|
||||||
|
private String statusMessage;
|
||||||
|
|
||||||
public String getTaskId() {
|
public String getTaskId() {
|
||||||
return taskId;
|
return taskId;
|
||||||
@@ -110,4 +115,44 @@ public class ParseTaskStatus {
|
|||||||
public void setQueuedAhead(Integer queuedAhead) {
|
public void setQueuedAhead(Integer queuedAhead) {
|
||||||
this.queuedAhead = queuedAhead;
|
this.queuedAhead = queuedAhead;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Integer getProgressPercent() {
|
||||||
|
return progressPercent;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setProgressPercent(Integer progressPercent) {
|
||||||
|
this.progressPercent = progressPercent;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getCurrentStage() {
|
||||||
|
return currentStage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCurrentStage(String currentStage) {
|
||||||
|
this.currentStage = currentStage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getProcessedItems() {
|
||||||
|
return processedItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setProcessedItems(Integer processedItems) {
|
||||||
|
this.processedItems = processedItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getTotalItems() {
|
||||||
|
return totalItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTotalItems(Integer totalItems) {
|
||||||
|
this.totalItems = totalItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getStatusMessage() {
|
||||||
|
return statusMessage;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setStatusMessage(String statusMessage) {
|
||||||
|
this.statusMessage = statusMessage;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,79 @@
|
|||||||
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PDF 解析请求。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class PdfParseRequest extends ParseRequest {
|
||||||
|
|
||||||
|
private String parseMethod = "auto";
|
||||||
|
private Boolean formulaEnabled = true;
|
||||||
|
private Boolean tableEnabled = true;
|
||||||
|
private Integer startPageIndex = 0;
|
||||||
|
private Integer endPageIndex = 99999;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将通用请求转换为 PDF 请求。
|
||||||
|
*
|
||||||
|
* @param request 原始请求
|
||||||
|
* @return PDF 请求
|
||||||
|
*/
|
||||||
|
public static PdfParseRequest from(ParseRequest request) {
|
||||||
|
PdfParseRequest pdfParseRequest = new PdfParseRequest();
|
||||||
|
if (request == null) {
|
||||||
|
return pdfParseRequest;
|
||||||
|
}
|
||||||
|
request.copyCommonFieldsTo(pdfParseRequest);
|
||||||
|
if (request instanceof PdfParseRequest) {
|
||||||
|
PdfParseRequest source = (PdfParseRequest) request;
|
||||||
|
pdfParseRequest.setParseMethod(source.getParseMethod());
|
||||||
|
pdfParseRequest.setFormulaEnabled(source.getFormulaEnabled());
|
||||||
|
pdfParseRequest.setTableEnabled(source.getTableEnabled());
|
||||||
|
pdfParseRequest.setStartPageIndex(source.getStartPageIndex());
|
||||||
|
pdfParseRequest.setEndPageIndex(source.getEndPageIndex());
|
||||||
|
}
|
||||||
|
return pdfParseRequest;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getParseMethod() {
|
||||||
|
return parseMethod;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setParseMethod(String parseMethod) {
|
||||||
|
this.parseMethod = parseMethod;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean getFormulaEnabled() {
|
||||||
|
return formulaEnabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFormulaEnabled(Boolean formulaEnabled) {
|
||||||
|
this.formulaEnabled = formulaEnabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean getTableEnabled() {
|
||||||
|
return tableEnabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTableEnabled(Boolean tableEnabled) {
|
||||||
|
this.tableEnabled = tableEnabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getStartPageIndex() {
|
||||||
|
return startPageIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setStartPageIndex(Integer startPageIndex) {
|
||||||
|
this.startPageIndex = startPageIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getEndPageIndex() {
|
||||||
|
return endPageIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEndPageIndex(Integer endPageIndex) {
|
||||||
|
this.endPageIndex = endPageIndex;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,79 @@
|
|||||||
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PPTX 解析请求。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class PptxParseRequest extends ParseRequest {
|
||||||
|
|
||||||
|
private Integer startSlideIndex = 0;
|
||||||
|
private Integer endSlideIndex;
|
||||||
|
private Double renderScale = 2.0d;
|
||||||
|
private String imageFormat = "png";
|
||||||
|
private Boolean includeSlideImageReference = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将通用请求转换为 PPTX 请求。
|
||||||
|
*
|
||||||
|
* @param request 原始请求
|
||||||
|
* @return PPTX 请求
|
||||||
|
*/
|
||||||
|
public static PptxParseRequest from(ParseRequest request) {
|
||||||
|
PptxParseRequest pptxParseRequest = new PptxParseRequest();
|
||||||
|
if (request == null) {
|
||||||
|
return pptxParseRequest;
|
||||||
|
}
|
||||||
|
request.copyCommonFieldsTo(pptxParseRequest);
|
||||||
|
if (request instanceof PptxParseRequest) {
|
||||||
|
PptxParseRequest source = (PptxParseRequest) request;
|
||||||
|
pptxParseRequest.setStartSlideIndex(source.getStartSlideIndex());
|
||||||
|
pptxParseRequest.setEndSlideIndex(source.getEndSlideIndex());
|
||||||
|
pptxParseRequest.setRenderScale(source.getRenderScale());
|
||||||
|
pptxParseRequest.setImageFormat(source.getImageFormat());
|
||||||
|
pptxParseRequest.setIncludeSlideImageReference(source.getIncludeSlideImageReference());
|
||||||
|
}
|
||||||
|
return pptxParseRequest;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getStartSlideIndex() {
|
||||||
|
return startSlideIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setStartSlideIndex(Integer startSlideIndex) {
|
||||||
|
this.startSlideIndex = startSlideIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getEndSlideIndex() {
|
||||||
|
return endSlideIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEndSlideIndex(Integer endSlideIndex) {
|
||||||
|
this.endSlideIndex = endSlideIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Double getRenderScale() {
|
||||||
|
return renderScale;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRenderScale(Double renderScale) {
|
||||||
|
this.renderScale = renderScale;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getImageFormat() {
|
||||||
|
return imageFormat;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setImageFormat(String imageFormat) {
|
||||||
|
this.imageFormat = imageFormat;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean getIncludeSlideImageReference() {
|
||||||
|
return includeSlideImageReference;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setIncludeSlideImageReference(Boolean includeSlideImageReference) {
|
||||||
|
this.includeSlideImageReference = includeSlideImageReference;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
package com.easyagents.document.core.entity;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XLSX 解析请求。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class XlsxParseRequest extends ParseRequest {
|
||||||
|
|
||||||
|
private List<String> sheetNames = new ArrayList<String>();
|
||||||
|
private Boolean includeHiddenSheets = false;
|
||||||
|
private Boolean ocrEmbeddedImages = true;
|
||||||
|
private Integer maxRowsPerSheet;
|
||||||
|
private Boolean includeImageAppendix = true;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将通用请求转换为 XLSX 请求。
|
||||||
|
*
|
||||||
|
* @param request 原始请求
|
||||||
|
* @return XLSX 请求
|
||||||
|
*/
|
||||||
|
public static XlsxParseRequest from(ParseRequest request) {
|
||||||
|
XlsxParseRequest xlsxParseRequest = new XlsxParseRequest();
|
||||||
|
if (request == null) {
|
||||||
|
return xlsxParseRequest;
|
||||||
|
}
|
||||||
|
request.copyCommonFieldsTo(xlsxParseRequest);
|
||||||
|
if (request instanceof XlsxParseRequest) {
|
||||||
|
XlsxParseRequest source = (XlsxParseRequest) request;
|
||||||
|
xlsxParseRequest.setSheetNames(new ArrayList<String>(source.getSheetNames()));
|
||||||
|
xlsxParseRequest.setIncludeHiddenSheets(source.getIncludeHiddenSheets());
|
||||||
|
xlsxParseRequest.setOcrEmbeddedImages(source.getOcrEmbeddedImages());
|
||||||
|
xlsxParseRequest.setMaxRowsPerSheet(source.getMaxRowsPerSheet());
|
||||||
|
xlsxParseRequest.setIncludeImageAppendix(source.getIncludeImageAppendix());
|
||||||
|
}
|
||||||
|
return xlsxParseRequest;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getSheetNames() {
|
||||||
|
return sheetNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSheetNames(List<String> sheetNames) {
|
||||||
|
this.sheetNames = sheetNames == null ? new ArrayList<String>() : sheetNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean getIncludeHiddenSheets() {
|
||||||
|
return includeHiddenSheets;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setIncludeHiddenSheets(Boolean includeHiddenSheets) {
|
||||||
|
this.includeHiddenSheets = includeHiddenSheets;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean getOcrEmbeddedImages() {
|
||||||
|
return ocrEmbeddedImages;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setOcrEmbeddedImages(Boolean ocrEmbeddedImages) {
|
||||||
|
this.ocrEmbeddedImages = ocrEmbeddedImages;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getMaxRowsPerSheet() {
|
||||||
|
return maxRowsPerSheet;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMaxRowsPerSheet(Integer maxRowsPerSheet) {
|
||||||
|
this.maxRowsPerSheet = maxRowsPerSheet;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean getIncludeImageAppendix() {
|
||||||
|
return includeImageAppendix;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setIncludeImageAppendix(Boolean includeImageAppendix) {
|
||||||
|
this.includeImageAppendix = includeImageAppendix;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,11 +1,11 @@
|
|||||||
package com.easyagents.document.pdf.mineru;
|
package com.easyagents.document.core.mineru;
|
||||||
|
|
||||||
import com.alibaba.fastjson2.JSON;
|
import com.alibaba.fastjson2.JSON;
|
||||||
import com.alibaba.fastjson2.JSONObject;
|
import com.alibaba.fastjson2.JSONObject;
|
||||||
import com.easyagents.core.util.StringUtil;
|
import com.easyagents.core.util.StringUtil;
|
||||||
import com.easyagents.document.core.exception.DocumentParseException;
|
import com.easyagents.document.core.exception.DocumentParseException;
|
||||||
import com.easyagents.document.core.model.ParseFile;
|
import com.easyagents.document.core.entity.ParseFile;
|
||||||
import com.easyagents.document.core.model.ParseRequest;
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
import okhttp3.MediaType;
|
import okhttp3.MediaType;
|
||||||
import okhttp3.MultipartBody;
|
import okhttp3.MultipartBody;
|
||||||
import okhttp3.OkHttpClient;
|
import okhttp3.OkHttpClient;
|
||||||
@@ -15,6 +15,7 @@ import okhttp3.Response;
|
|||||||
import okhttp3.ResponseBody;
|
import okhttp3.ResponseBody;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.net.URLConnection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
@@ -23,11 +24,11 @@ import java.util.concurrent.TimeUnit;
|
|||||||
* MinerU HTTP 客户端。
|
* MinerU HTTP 客户端。
|
||||||
*
|
*
|
||||||
* @author Codex
|
* @author Codex
|
||||||
* @since 2026-04-14
|
* @since 2026-04-16
|
||||||
*/
|
*/
|
||||||
public class MineruPdfClient {
|
public class MineruClient {
|
||||||
|
|
||||||
private static final MediaType DEFAULT_PDF_MEDIA_TYPE = MediaType.parse("application/pdf");
|
private static final MediaType DEFAULT_MEDIA_TYPE = MediaType.parse("application/octet-stream");
|
||||||
|
|
||||||
private final String baseUrl;
|
private final String baseUrl;
|
||||||
private final OkHttpClient okHttpClient;
|
private final OkHttpClient okHttpClient;
|
||||||
@@ -39,7 +40,7 @@ public class MineruPdfClient {
|
|||||||
* @param properties MinerU 配置
|
* @param properties MinerU 配置
|
||||||
* @param mineruMapper DTO 映射器
|
* @param mineruMapper DTO 映射器
|
||||||
*/
|
*/
|
||||||
public MineruPdfClient(MineruProperties properties, MineruMapper mineruMapper) {
|
public MineruClient(MineruProperties properties, MineruMapper mineruMapper) {
|
||||||
this(
|
this(
|
||||||
properties,
|
properties,
|
||||||
new OkHttpClient.Builder()
|
new OkHttpClient.Builder()
|
||||||
@@ -58,7 +59,7 @@ public class MineruPdfClient {
|
|||||||
* @param okHttpClient HTTP 客户端
|
* @param okHttpClient HTTP 客户端
|
||||||
* @param mineruMapper DTO 映射器
|
* @param mineruMapper DTO 映射器
|
||||||
*/
|
*/
|
||||||
public MineruPdfClient(MineruProperties properties, OkHttpClient okHttpClient, MineruMapper mineruMapper) {
|
public MineruClient(MineruProperties properties, OkHttpClient okHttpClient, MineruMapper mineruMapper) {
|
||||||
if (properties == null || !StringUtil.hasText(properties.getBaseUrl())) {
|
if (properties == null || !StringUtil.hasText(properties.getBaseUrl())) {
|
||||||
throw new IllegalArgumentException("MinerU baseUrl must not be empty");
|
throw new IllegalArgumentException("MinerU baseUrl must not be empty");
|
||||||
}
|
}
|
||||||
@@ -165,7 +166,7 @@ public class MineruPdfClient {
|
|||||||
}
|
}
|
||||||
MediaType mediaType = StringUtil.hasText(file.getContentType())
|
MediaType mediaType = StringUtil.hasText(file.getContentType())
|
||||||
? MediaType.parse(file.getContentType())
|
? MediaType.parse(file.getContentType())
|
||||||
: DEFAULT_PDF_MEDIA_TYPE;
|
: detectMediaType(file.getFileName());
|
||||||
formBuilder.addFormDataPart(
|
formBuilder.addFormDataPart(
|
||||||
"files",
|
"files",
|
||||||
file.getFileName(),
|
file.getFileName(),
|
||||||
@@ -208,4 +209,9 @@ public class MineruPdfClient {
|
|||||||
}
|
}
|
||||||
return baseUrl;
|
return baseUrl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private MediaType detectMediaType(String fileName) {
|
||||||
|
String mimeType = URLConnection.guessContentTypeFromName(fileName);
|
||||||
|
return StringUtil.hasText(mimeType) ? MediaType.parse(mimeType) : DEFAULT_MEDIA_TYPE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,218 @@
|
|||||||
|
package com.easyagents.document.core.mineru;
|
||||||
|
|
||||||
|
import com.easyagents.core.util.StringUtil;
|
||||||
|
import com.easyagents.document.core.DocumentParseService;
|
||||||
|
import com.easyagents.document.core.exception.DocumentParseException;
|
||||||
|
import com.easyagents.document.core.entity.ParseFile;
|
||||||
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 基于 MinerU API 的文档解析服务,支持 docx 文档和 pdf 文档。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class MineruDocumentParseService<R extends ParseRequest> implements DocumentParseService<R> {
|
||||||
|
|
||||||
|
public static final String PROVIDER_NAME = "mineru";
|
||||||
|
|
||||||
|
private static final Logger LOG = LoggerFactory.getLogger(MineruDocumentParseService.class);
|
||||||
|
|
||||||
|
private final MineruProperties properties;
|
||||||
|
private final MineruClient client;
|
||||||
|
private final MineruMapper mapper;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 创建默认服务实例。
|
||||||
|
*
|
||||||
|
* @param properties MinerU 配置
|
||||||
|
*/
|
||||||
|
public MineruDocumentParseService(MineruProperties properties) {
|
||||||
|
this(properties, new MineruMapper(properties));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 创建默认服务实例。
|
||||||
|
*
|
||||||
|
* @param properties MinerU 配置
|
||||||
|
* @param mapper 结果映射器
|
||||||
|
*/
|
||||||
|
public MineruDocumentParseService(MineruProperties properties, MineruMapper mapper) {
|
||||||
|
this(properties, new MineruClient(properties, mapper), mapper);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 创建服务实例。
|
||||||
|
*
|
||||||
|
* @param properties MinerU 配置
|
||||||
|
* @param client HTTP 客户端
|
||||||
|
* @param mapper 结果映射器
|
||||||
|
*/
|
||||||
|
public MineruDocumentParseService(MineruProperties properties, MineruClient client, MineruMapper mapper) {
|
||||||
|
this.properties = properties;
|
||||||
|
this.client = client;
|
||||||
|
this.mapper = mapper;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseResponse parse(ParseRequest request) {
|
||||||
|
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||||
|
LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||||
|
normalizedRequest.getBackend());
|
||||||
|
ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
|
||||||
|
LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||||
|
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseTaskStatus submit(ParseRequest request) {
|
||||||
|
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||||
|
normalizedRequest.setReturnMarkdown(true);
|
||||||
|
normalizedRequest.setReturnMiddleJson(true);
|
||||||
|
normalizedRequest.setReturnContentList(true);
|
||||||
|
normalizedRequest.setReturnModelOutput(true);
|
||||||
|
normalizedRequest.setReturnImages(true);
|
||||||
|
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||||
|
normalizedRequest.getBackend());
|
||||||
|
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
||||||
|
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
taskStatus == null ? null : taskStatus.getTaskId(),
|
||||||
|
taskStatus == null ? null : taskStatus.getStatus());
|
||||||
|
return taskStatus;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseTaskStatus queryTask(String taskId) {
|
||||||
|
validateTaskId(taskId);
|
||||||
|
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
|
||||||
|
LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
taskId,
|
||||||
|
taskStatus == null ? null : taskStatus.getStatus());
|
||||||
|
return taskStatus;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseResponse queryResult(String taskId) {
|
||||||
|
validateTaskId(taskId);
|
||||||
|
LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);
|
||||||
|
MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
|
||||||
|
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||||
|
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||||
|
LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
taskId,
|
||||||
|
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||||
|
validateTaskId(taskId);
|
||||||
|
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
||||||
|
ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(mapper.toParseTaskStatus(taskStatus));
|
||||||
|
if ("completed".equalsIgnoreCase(taskStatus.getStatus())) {
|
||||||
|
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||||
|
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||||
|
taskInfo.setResult(response);
|
||||||
|
}
|
||||||
|
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
taskId,
|
||||||
|
taskInfo.getStatus(),
|
||||||
|
taskInfo.getResult() != null);
|
||||||
|
return taskInfo;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取 MinerU 配置。
|
||||||
|
*
|
||||||
|
* @return MinerU 配置
|
||||||
|
*/
|
||||||
|
protected MineruProperties getProperties() {
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 归一化解析请求,补齐默认参数。
|
||||||
|
*
|
||||||
|
* @param request 原始请求
|
||||||
|
* @return 归一化后的请求
|
||||||
|
*/
|
||||||
|
protected ParseRequest normalizeRequest(ParseRequest request) {
|
||||||
|
if (request == null) {
|
||||||
|
throw new IllegalArgumentException("ParseRequest must not be null");
|
||||||
|
}
|
||||||
|
if (request.getFiles() == null || request.getFiles().isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("ParseRequest files must not be empty");
|
||||||
|
}
|
||||||
|
ParseRequest normalizedRequest = new ParseRequest();
|
||||||
|
normalizedRequest.setFiles(new ArrayList<ParseFile>(request.getFiles()));
|
||||||
|
normalizedRequest.setBackend(StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
|
||||||
|
normalizedRequest.setLanguages(
|
||||||
|
request.getLanguages() == null || request.getLanguages().isEmpty()
|
||||||
|
? new ArrayList<String>(properties.getDefaultLangList())
|
||||||
|
: new ArrayList<String>(request.getLanguages())
|
||||||
|
);
|
||||||
|
normalizedRequest.setReturnMarkdown(request.getReturnMarkdown() == null ? Boolean.TRUE : request.getReturnMarkdown());
|
||||||
|
normalizedRequest.setReturnMiddleJson(request.getReturnMiddleJson() == null ? Boolean.TRUE : request.getReturnMiddleJson());
|
||||||
|
normalizedRequest.setReturnContentList(request.getReturnContentList() == null ? Boolean.TRUE : request.getReturnContentList());
|
||||||
|
normalizedRequest.setReturnModelOutput(request.getReturnModelOutput() == null ? Boolean.FALSE : request.getReturnModelOutput());
|
||||||
|
normalizedRequest.setReturnImages(request.getReturnImages() == null ? Boolean.TRUE : request.getReturnImages());
|
||||||
|
return normalizedRequest;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 校验任务 ID。
|
||||||
|
*
|
||||||
|
* @param taskId 任务 ID
|
||||||
|
*/
|
||||||
|
protected void validateTaskId(String taskId) {
|
||||||
|
if (!StringUtil.hasText(taskId)) {
|
||||||
|
throw new IllegalArgumentException("taskId must not be empty");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 轮询任务状态直到完成或失败。
|
||||||
|
*
|
||||||
|
* @param taskId 任务 ID
|
||||||
|
* @return 已完成的任务状态
|
||||||
|
*/
|
||||||
|
protected MineruTaskStatus waitForTaskCompleted(String taskId) {
|
||||||
|
long deadline = System.currentTimeMillis() + properties.getResultTimeoutMs();
|
||||||
|
while (true) {
|
||||||
|
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
||||||
|
if ("completed".equals(taskStatus.getStatus())) {
|
||||||
|
return taskStatus;
|
||||||
|
}
|
||||||
|
if ("failed".equals(taskStatus.getStatus())) {
|
||||||
|
throw new DocumentParseException("MinerU task failed: " + taskStatus.getError());
|
||||||
|
}
|
||||||
|
if (System.currentTimeMillis() >= deadline) {
|
||||||
|
throw new DocumentParseException("MinerU task result timeout: " + taskId);
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
Thread.sleep(properties.getPollIntervalMs());
|
||||||
|
} catch (InterruptedException exception) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
throw new DocumentParseException("Interrupted while waiting for MinerU task: " + taskId, exception);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,19 +1,20 @@
|
|||||||
package com.easyagents.document.pdf.mineru;
|
package com.easyagents.document.core.mineru;
|
||||||
|
|
||||||
import com.alibaba.fastjson2.JSON;
|
import com.alibaba.fastjson2.JSON;
|
||||||
import com.alibaba.fastjson2.JSONArray;
|
import com.alibaba.fastjson2.JSONArray;
|
||||||
import com.alibaba.fastjson2.JSONObject;
|
import com.alibaba.fastjson2.JSONObject;
|
||||||
import com.easyagents.core.util.StringUtil;
|
import com.easyagents.core.util.StringUtil;
|
||||||
import com.easyagents.document.core.exception.DocumentParseException;
|
import com.easyagents.document.core.exception.DocumentParseException;
|
||||||
import com.easyagents.document.core.model.DocumentBlock;
|
import com.easyagents.document.core.entity.DocumentBlock;
|
||||||
import com.easyagents.document.core.model.DocumentImage;
|
import com.easyagents.document.core.entity.DocumentImage;
|
||||||
import com.easyagents.document.core.model.DocumentPage;
|
import com.easyagents.document.core.entity.DocumentPage;
|
||||||
import com.easyagents.document.core.model.DocumentTable;
|
import com.easyagents.document.core.entity.DocumentTable;
|
||||||
import com.easyagents.document.core.model.ParseArtifacts;
|
import com.easyagents.document.core.entity.PdfParseRequest;
|
||||||
import com.easyagents.document.core.model.ParseRequest;
|
import com.easyagents.document.core.entity.ParseArtifacts;
|
||||||
import com.easyagents.document.core.model.ParseResponse;
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
import com.easyagents.document.core.model.ParseResult;
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
import com.easyagents.document.core.entity.ParseResult;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
@@ -31,7 +32,7 @@ import java.util.zip.ZipInputStream;
|
|||||||
* MinerU 原始协议与统一模型之间的映射器。
|
* MinerU 原始协议与统一模型之间的映射器。
|
||||||
*
|
*
|
||||||
* @author Codex
|
* @author Codex
|
||||||
* @since 2026-04-14
|
* @since 2026-04-16
|
||||||
*/
|
*/
|
||||||
public class MineruMapper {
|
public class MineruMapper {
|
||||||
|
|
||||||
@@ -71,7 +72,6 @@ public class MineruMapper {
|
|||||||
*/
|
*/
|
||||||
public Map<String, List<String>> buildAsyncFormFields(ParseRequest request) {
|
public Map<String, List<String>> buildAsyncFormFields(ParseRequest request) {
|
||||||
Map<String, List<String>> fields = buildBaseFormFields(request);
|
Map<String, List<String>> fields = buildBaseFormFields(request);
|
||||||
// 异步结果固定按全量 ZIP 返回,避免超大结果通过 JSON 传输。
|
|
||||||
putSingleValue(fields, "return_md", "true");
|
putSingleValue(fields, "return_md", "true");
|
||||||
putSingleValue(fields, "return_middle_json", "true");
|
putSingleValue(fields, "return_middle_json", "true");
|
||||||
putSingleValue(fields, "return_content_list", "true");
|
putSingleValue(fields, "return_content_list", "true");
|
||||||
@@ -205,19 +205,24 @@ public class MineruMapper {
|
|||||||
private Map<String, List<String>> buildBaseFormFields(ParseRequest request) {
|
private Map<String, List<String>> buildBaseFormFields(ParseRequest request) {
|
||||||
Map<String, List<String>> fields = new LinkedHashMap<String, List<String>>();
|
Map<String, List<String>> fields = new LinkedHashMap<String, List<String>>();
|
||||||
putSingleValue(fields, "backend", StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
|
putSingleValue(fields, "backend", StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
|
||||||
putSingleValue(fields, "parse_method", StringUtil.hasText(request.getParseMethod()) ? request.getParseMethod() : properties.getDefaultParseMethod());
|
|
||||||
putSingleValue(fields, "formula_enable", String.valueOf(boolOrDefault(request.getFormulaEnabled(), properties.getDefaultFormulaEnable())));
|
|
||||||
putSingleValue(fields, "table_enable", String.valueOf(boolOrDefault(request.getTableEnabled(), properties.getDefaultTableEnable())));
|
|
||||||
putSingleValue(fields, "start_page_id", String.valueOf(intOrDefault(request.getStartPageIndex(), 0)));
|
|
||||||
putSingleValue(fields, "end_page_id", String.valueOf(intOrDefault(request.getEndPageIndex(), 99999)));
|
|
||||||
List<String> languages = request.getLanguages();
|
List<String> languages = request.getLanguages();
|
||||||
if (languages == null || languages.isEmpty()) {
|
if (languages == null || languages.isEmpty()) {
|
||||||
languages = properties.getDefaultLangList();
|
languages = properties.getDefaultLangList();
|
||||||
}
|
}
|
||||||
if (languages != null && !languages.isEmpty()) {
|
if (languages != null && !languages.isEmpty()) {
|
||||||
// MinerU 通过重复的 lang_list 表单字段接收多语言参数。
|
|
||||||
fields.put("lang_list", new ArrayList<String>(languages));
|
fields.put("lang_list", new ArrayList<String>(languages));
|
||||||
}
|
}
|
||||||
|
if (request instanceof PdfParseRequest) {
|
||||||
|
PdfParseRequest pdfParseRequest = (PdfParseRequest) request;
|
||||||
|
putSingleValue(fields, "parse_method",
|
||||||
|
StringUtil.hasText(pdfParseRequest.getParseMethod()) ? pdfParseRequest.getParseMethod() : properties.getDefaultParseMethod());
|
||||||
|
putSingleValue(fields, "formula_enable",
|
||||||
|
String.valueOf(boolOrDefault(pdfParseRequest.getFormulaEnabled(), properties.getDefaultFormulaEnable())));
|
||||||
|
putSingleValue(fields, "table_enable",
|
||||||
|
String.valueOf(boolOrDefault(pdfParseRequest.getTableEnabled(), properties.getDefaultTableEnable())));
|
||||||
|
putSingleValue(fields, "start_page_id", String.valueOf(intOrDefault(pdfParseRequest.getStartPageIndex(), 0)));
|
||||||
|
putSingleValue(fields, "end_page_id", String.valueOf(intOrDefault(pdfParseRequest.getEndPageIndex(), 99999)));
|
||||||
|
}
|
||||||
return fields;
|
return fields;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -240,7 +245,8 @@ public class MineruMapper {
|
|||||||
result.setArtifacts(artifacts);
|
result.setArtifacts(artifacts);
|
||||||
|
|
||||||
Map<String, String> imageDataUrls = toStringMap(fileResult.getJSONObject("images"));
|
Map<String, String> imageDataUrls = toStringMap(fileResult.getJSONObject("images"));
|
||||||
applyStructuredArtifacts(result, imageDataUrls);
|
Map<String, byte[]> imageContents = toBinaryMap(imageDataUrls);
|
||||||
|
applyStructuredArtifacts(result, imageDataUrls, imageContents);
|
||||||
if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) {
|
if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) {
|
||||||
result.getWarnings().add("MinerU did not return markdown, middle_json or content_list");
|
result.getWarnings().add("MinerU did not return markdown, middle_json or content_list");
|
||||||
}
|
}
|
||||||
@@ -264,7 +270,6 @@ public class MineruMapper {
|
|||||||
JSONArray contentList = asArray(contentListArtifact);
|
JSONArray contentList = asArray(contentListArtifact);
|
||||||
Object modelOutput = modelOutputArtifact;
|
Object modelOutput = modelOutputArtifact;
|
||||||
|
|
||||||
// MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里,并且直接返回数组。
|
|
||||||
if (contentList == null && middleArtifact instanceof JSONArray) {
|
if (contentList == null && middleArtifact instanceof JSONArray) {
|
||||||
contentList = (JSONArray) middleArtifact;
|
contentList = (JSONArray) middleArtifact;
|
||||||
middleJson = null;
|
middleJson = null;
|
||||||
@@ -289,10 +294,12 @@ public class MineruMapper {
|
|||||||
result.setArtifacts(artifacts);
|
result.setArtifacts(artifacts);
|
||||||
|
|
||||||
Map<String, String> imageDataUrls = new LinkedHashMap<String, String>();
|
Map<String, String> imageDataUrls = new LinkedHashMap<String, String>();
|
||||||
|
Map<String, byte[]> imageContents = new LinkedHashMap<String, byte[]>();
|
||||||
for (Map.Entry<String, byte[]> imageEntry : bundle.images.entrySet()) {
|
for (Map.Entry<String, byte[]> imageEntry : bundle.images.entrySet()) {
|
||||||
imageDataUrls.put(imageEntry.getKey(), toDataUrl(imageEntry.getKey(), imageEntry.getValue()));
|
imageDataUrls.put(imageEntry.getKey(), toDataUrl(imageEntry.getKey(), imageEntry.getValue()));
|
||||||
|
imageContents.put(imageEntry.getKey(), imageEntry.getValue());
|
||||||
}
|
}
|
||||||
applyStructuredArtifacts(result, imageDataUrls);
|
applyStructuredArtifacts(result, imageDataUrls, imageContents);
|
||||||
|
|
||||||
if (markdown == null && middleJson == null && contentList == null) {
|
if (markdown == null && middleJson == null && contentList == null) {
|
||||||
throw new DocumentParseException("MinerU ZIP result missing critical artifacts for file: " + fileName);
|
throw new DocumentParseException("MinerU ZIP result missing critical artifacts for file: " + fileName);
|
||||||
@@ -300,7 +307,7 @@ public class MineruMapper {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls) {
|
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls, Map<String, byte[]> imageContents) {
|
||||||
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson());
|
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson());
|
||||||
JSONArray contentList = asArray(result.getArtifacts().getContentList());
|
JSONArray contentList = asArray(result.getArtifacts().getContentList());
|
||||||
|
|
||||||
@@ -311,9 +318,9 @@ public class MineruMapper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (contentList != null) {
|
if (contentList != null) {
|
||||||
fillFromContentList(result, contentList, imageDataUrls);
|
fillFromContentList(result, contentList, imageDataUrls, imageContents);
|
||||||
} else if (middleJson != null) {
|
} else if (middleJson != null) {
|
||||||
fillFromMiddleJson(result, middleJson, imageDataUrls);
|
fillFromMiddleJson(result, middleJson, imageDataUrls, imageContents);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((result.getImages() == null || result.getImages().isEmpty()) && imageDataUrls != null && !imageDataUrls.isEmpty()) {
|
if ((result.getImages() == null || result.getImages().isEmpty()) && imageDataUrls != null && !imageDataUrls.isEmpty()) {
|
||||||
@@ -322,6 +329,7 @@ public class MineruMapper {
|
|||||||
image.setName(baseName(entry.getKey()));
|
image.setName(baseName(entry.getKey()));
|
||||||
image.setSourcePath(entry.getKey());
|
image.setSourcePath(entry.getKey());
|
||||||
image.setDataUrl(entry.getValue());
|
image.setDataUrl(entry.getValue());
|
||||||
|
image.setContent(matchBinaryContent(entry.getKey(), imageContents));
|
||||||
image.setMimeType(detectMimeType(entry.getKey()));
|
image.setMimeType(detectMimeType(entry.getKey()));
|
||||||
result.getImages().add(image);
|
result.getImages().add(image);
|
||||||
}
|
}
|
||||||
@@ -349,7 +357,10 @@ public class MineruMapper {
|
|||||||
result.setPages(pages);
|
result.setPages(pages);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void fillFromContentList(ParseResult result, JSONArray contentList, Map<String, String> imageDataUrls) {
|
private void fillFromContentList(ParseResult result,
|
||||||
|
JSONArray contentList,
|
||||||
|
Map<String, String> imageDataUrls,
|
||||||
|
Map<String, byte[]> imageContents) {
|
||||||
for (int index = 0; index < contentList.size(); index++) {
|
for (int index = 0; index < contentList.size(); index++) {
|
||||||
JSONObject item = contentList.getJSONObject(index);
|
JSONObject item = contentList.getJSONObject(index);
|
||||||
if (item == null) {
|
if (item == null) {
|
||||||
@@ -391,12 +402,16 @@ public class MineruMapper {
|
|||||||
image.setCaptions(extractCaptions(item));
|
image.setCaptions(extractCaptions(item));
|
||||||
image.setFootnotes(extractFootnotes(item));
|
image.setFootnotes(extractFootnotes(item));
|
||||||
image.setDataUrl(matchDataUrl(item.getString("img_path"), imageDataUrls));
|
image.setDataUrl(matchDataUrl(item.getString("img_path"), imageDataUrls));
|
||||||
|
image.setContent(matchBinaryContent(item.getString("img_path"), imageContents));
|
||||||
result.getImages().add(image);
|
result.getImages().add(image);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void fillFromMiddleJson(ParseResult result, JSONObject middleJson, Map<String, String> imageDataUrls) {
|
private void fillFromMiddleJson(ParseResult result,
|
||||||
|
JSONObject middleJson,
|
||||||
|
Map<String, String> imageDataUrls,
|
||||||
|
Map<String, byte[]> imageContents) {
|
||||||
JSONArray pages = middleJson.getJSONArray("pdf_info");
|
JSONArray pages = middleJson.getJSONArray("pdf_info");
|
||||||
if (pages == null) {
|
if (pages == null) {
|
||||||
return;
|
return;
|
||||||
@@ -404,8 +419,8 @@ public class MineruMapper {
|
|||||||
for (int pageIndex = 0; pageIndex < pages.size(); pageIndex++) {
|
for (int pageIndex = 0; pageIndex < pages.size(); pageIndex++) {
|
||||||
JSONObject page = pages.getJSONObject(pageIndex);
|
JSONObject page = pages.getJSONObject(pageIndex);
|
||||||
fillBlocksFromMiddlePage(result, page.getJSONArray("para_blocks"), page.getInteger("page_idx"));
|
fillBlocksFromMiddlePage(result, page.getJSONArray("para_blocks"), page.getInteger("page_idx"));
|
||||||
fillVisualsFromMiddlePage(result, page.getJSONArray("tables"), page.getInteger("page_idx"), true, imageDataUrls);
|
fillVisualsFromMiddlePage(result, page.getJSONArray("tables"), page.getInteger("page_idx"), true, imageDataUrls, imageContents);
|
||||||
fillVisualsFromMiddlePage(result, page.getJSONArray("images"), page.getInteger("page_idx"), false, imageDataUrls);
|
fillVisualsFromMiddlePage(result, page.getJSONArray("images"), page.getInteger("page_idx"), false, imageDataUrls, imageContents);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -429,7 +444,12 @@ public class MineruMapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void fillVisualsFromMiddlePage(ParseResult result, JSONArray blocks, Integer pageIndex, boolean table, Map<String, String> imageDataUrls) {
|
private void fillVisualsFromMiddlePage(ParseResult result,
|
||||||
|
JSONArray blocks,
|
||||||
|
Integer pageIndex,
|
||||||
|
boolean table,
|
||||||
|
Map<String, String> imageDataUrls,
|
||||||
|
Map<String, byte[]> imageContents) {
|
||||||
if (blocks == null) {
|
if (blocks == null) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -456,6 +476,7 @@ public class MineruMapper {
|
|||||||
documentImage.setName(baseName(documentImage.getSourcePath()));
|
documentImage.setName(baseName(documentImage.getSourcePath()));
|
||||||
documentImage.setMimeType(detectMimeType(documentImage.getSourcePath()));
|
documentImage.setMimeType(detectMimeType(documentImage.getSourcePath()));
|
||||||
documentImage.setDataUrl(matchDataUrl(documentImage.getSourcePath(), imageDataUrls));
|
documentImage.setDataUrl(matchDataUrl(documentImage.getSourcePath(), imageDataUrls));
|
||||||
|
documentImage.setContent(matchBinaryContent(documentImage.getSourcePath(), imageContents));
|
||||||
result.getImages().add(documentImage);
|
result.getImages().add(documentImage);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -607,6 +628,20 @@ public class MineruMapper {
|
|||||||
return values;
|
return values;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Map<String, byte[]> toBinaryMap(Map<String, String> dataUrls) {
|
||||||
|
Map<String, byte[]> values = new LinkedHashMap<String, byte[]>();
|
||||||
|
if (dataUrls == null || dataUrls.isEmpty()) {
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
for (Map.Entry<String, String> entry : dataUrls.entrySet()) {
|
||||||
|
byte[] content = decodeDataUrl(entry.getValue());
|
||||||
|
if (content != null) {
|
||||||
|
values.put(entry.getKey(), content);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
|
||||||
private List<Double> toDoubleList(JSONArray jsonArray) {
|
private List<Double> toDoubleList(JSONArray jsonArray) {
|
||||||
if (jsonArray == null || jsonArray.isEmpty()) {
|
if (jsonArray == null || jsonArray.isEmpty()) {
|
||||||
return new ArrayList<Double>();
|
return new ArrayList<Double>();
|
||||||
@@ -800,6 +835,25 @@ public class MineruMapper {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private byte[] matchBinaryContent(String imagePath, Map<String, byte[]> imageContents) {
|
||||||
|
if (imageContents == null || imageContents.isEmpty()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (StringUtil.hasText(imagePath) && imageContents.containsKey(imagePath)) {
|
||||||
|
return imageContents.get(imagePath);
|
||||||
|
}
|
||||||
|
String currentBaseName = baseName(imagePath);
|
||||||
|
if (!StringUtil.hasText(currentBaseName)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
for (Map.Entry<String, byte[]> entry : imageContents.entrySet()) {
|
||||||
|
if (currentBaseName.equals(baseName(entry.getKey()))) {
|
||||||
|
return entry.getValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
private String baseName(String path) {
|
private String baseName(String path) {
|
||||||
if (!StringUtil.hasText(path)) {
|
if (!StringUtil.hasText(path)) {
|
||||||
return null;
|
return null;
|
||||||
@@ -820,6 +874,21 @@ public class MineruMapper {
|
|||||||
return "data:" + detectMimeType(path) + ";base64," + Base64.getEncoder().encodeToString(content);
|
return "data:" + detectMimeType(path) + ";base64," + Base64.getEncoder().encodeToString(content);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private byte[] decodeDataUrl(String dataUrl) {
|
||||||
|
if (!StringUtil.hasText(dataUrl)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
int commaIndex = dataUrl.indexOf(',');
|
||||||
|
if (commaIndex < 0 || commaIndex == dataUrl.length() - 1) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
return Base64.getDecoder().decode(dataUrl.substring(commaIndex + 1));
|
||||||
|
} catch (IllegalArgumentException exception) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private String joinList(List<String> values) {
|
private String joinList(List<String> values) {
|
||||||
if (values == null || values.isEmpty()) {
|
if (values == null || values.isEmpty()) {
|
||||||
return null;
|
return null;
|
||||||
@@ -1,14 +1,14 @@
|
|||||||
package com.easyagents.document.pdf.mineru;
|
package com.easyagents.document.core.mineru;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* MinerU PDF 解析配置。
|
* MinerU 文档解析配置。
|
||||||
*
|
*
|
||||||
* @author Codex
|
* @author Codex
|
||||||
* @since 2026-04-14
|
* @since 2026-04-16
|
||||||
*/
|
*/
|
||||||
public class MineruProperties {
|
public class MineruProperties {
|
||||||
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.pdf.mineru;
|
package com.easyagents.document.core.mineru;
|
||||||
|
|
||||||
import com.alibaba.fastjson2.JSONObject;
|
import com.alibaba.fastjson2.JSONObject;
|
||||||
|
|
||||||
@@ -9,7 +9,7 @@ import java.util.Map;
|
|||||||
* MinerU 结果载荷。
|
* MinerU 结果载荷。
|
||||||
*
|
*
|
||||||
* @author Codex
|
* @author Codex
|
||||||
* @since 2026-04-14
|
* @since 2026-04-16
|
||||||
*/
|
*/
|
||||||
public class MineruResultPayload {
|
public class MineruResultPayload {
|
||||||
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.easyagents.document.pdf.mineru;
|
package com.easyagents.document.core.mineru;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -7,7 +7,7 @@ import java.util.List;
|
|||||||
* MinerU 原始任务状态。
|
* MinerU 原始任务状态。
|
||||||
*
|
*
|
||||||
* @author Codex
|
* @author Codex
|
||||||
* @since 2026-04-14
|
* @since 2026-04-16
|
||||||
*/
|
*/
|
||||||
public class MineruTaskStatus {
|
public class MineruTaskStatus {
|
||||||
|
|
||||||
@@ -0,0 +1,98 @@
|
|||||||
|
package com.easyagents.document.core.support;
|
||||||
|
|
||||||
|
import com.easyagents.document.core.DocumentParseService;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskUpdater;
|
||||||
|
import com.easyagents.document.core.entity.ParseFile;
|
||||||
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 带统一异步任务能力的文档解析服务基类。
|
||||||
|
* 支持 ppt 和 excel,pdf 和 word 文档使用 mineru 自带异步能力
|
||||||
|
*
|
||||||
|
* @param <R> 请求类型
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public abstract class AbstractAsyncDocumentParseService<R extends ParseRequest> implements DocumentParseService<R> {
|
||||||
|
|
||||||
|
private final DocumentAsyncTaskManager taskManager;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 创建服务基类。
|
||||||
|
*
|
||||||
|
* @param taskManager 异步任务管理器
|
||||||
|
*/
|
||||||
|
protected AbstractAsyncDocumentParseService(DocumentAsyncTaskManager taskManager) {
|
||||||
|
if (taskManager == null) {
|
||||||
|
throw new IllegalArgumentException("DocumentAsyncTaskManager must not be null");
|
||||||
|
}
|
||||||
|
this.taskManager = taskManager;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseResponse parse(ParseRequest request) {
|
||||||
|
return doParse(normalizeRequest(request), null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseTaskStatus submit(ParseRequest request) {
|
||||||
|
final R normalizedRequest = normalizeRequest(request);
|
||||||
|
return taskManager.submit(
|
||||||
|
normalizedRequest.getBackend(),
|
||||||
|
collectFileNames(normalizedRequest),
|
||||||
|
updater -> doParse(normalizedRequest, updater)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseTaskStatus queryTask(String taskId) {
|
||||||
|
return taskManager.queryTask(taskId);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseResponse queryResult(String taskId) {
|
||||||
|
return taskManager.queryResult(taskId);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||||
|
return taskManager.queryTaskInfo(taskId);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 归一化请求。
|
||||||
|
*
|
||||||
|
* @param request 原始请求
|
||||||
|
* @return 归一化结果
|
||||||
|
*/
|
||||||
|
protected abstract R normalizeRequest(ParseRequest request);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 执行解析。
|
||||||
|
*
|
||||||
|
* @param request 归一化请求
|
||||||
|
* @param updater 进度更新器;同步解析时可能为 {@code null}
|
||||||
|
* @return 解析结果
|
||||||
|
*/
|
||||||
|
protected abstract ParseResponse doParse(R request, DocumentAsyncTaskUpdater updater);
|
||||||
|
|
||||||
|
private List<String> collectFileNames(ParseRequest request) {
|
||||||
|
List<String> fileNames = new ArrayList<String>();
|
||||||
|
if (request == null || request.getFiles() == null) {
|
||||||
|
return fileNames;
|
||||||
|
}
|
||||||
|
for (ParseFile file : request.getFiles()) {
|
||||||
|
if (file != null && file.getFileName() != null) {
|
||||||
|
fileNames.add(file.getFileName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fileNames;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
package com.easyagents.document.core.async;
|
||||||
|
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseResult;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.concurrent.Executor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 异步任务管理器测试。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class DocumentAsyncTaskManagerTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldTrackTaskLifecycleAndResult() {
|
||||||
|
Executor directExecutor = new Executor() {
|
||||||
|
@Override
|
||||||
|
public void execute(Runnable command) {
|
||||||
|
command.run();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
DocumentAsyncTaskManager manager = new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor);
|
||||||
|
|
||||||
|
ParseTaskStatus status = manager.submit("mineru", Collections.singletonList("demo.pptx"), updater -> {
|
||||||
|
updater.update("ocr", 50, 1, 2, "处理中");
|
||||||
|
ParseResponse response = new ParseResponse();
|
||||||
|
ParseResult result = new ParseResult();
|
||||||
|
result.setFileName("demo.pptx");
|
||||||
|
result.setMarkdown("# Slide 1");
|
||||||
|
response.setResults(Collections.singletonList(result));
|
||||||
|
return response;
|
||||||
|
});
|
||||||
|
|
||||||
|
ParseTaskInfo taskInfo = manager.queryTaskInfo(status.getTaskId());
|
||||||
|
|
||||||
|
Assert.assertEquals("completed", taskInfo.getStatus());
|
||||||
|
Assert.assertEquals(Integer.valueOf(100), taskInfo.getProgressPercent());
|
||||||
|
Assert.assertEquals("completed", taskInfo.getCurrentStage());
|
||||||
|
Assert.assertNotNull(taskInfo.getResult());
|
||||||
|
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,210 @@
|
|||||||
|
package com.easyagents.document.core.mineru;
|
||||||
|
|
||||||
|
import com.alibaba.fastjson2.JSONObject;
|
||||||
|
import com.easyagents.document.core.entity.ParseFile;
|
||||||
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
import okhttp3.Request;
|
||||||
|
import okio.Buffer;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.zip.ZipEntry;
|
||||||
|
import java.util.zip.ZipOutputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MinerU 通用文档解析服务测试。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class MineruDocumentParseServiceTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldForceAsyncResultArtifacts() {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper);
|
||||||
|
|
||||||
|
ParseRequest request = buildRequest();
|
||||||
|
request.setReturnMarkdown(false);
|
||||||
|
request.setReturnMiddleJson(false);
|
||||||
|
request.setReturnContentList(false);
|
||||||
|
request.setReturnModelOutput(false);
|
||||||
|
request.setReturnImages(false);
|
||||||
|
|
||||||
|
ParseTaskStatus status = service.submit(request);
|
||||||
|
|
||||||
|
Assert.assertEquals("task-1", status.getTaskId());
|
||||||
|
Assert.assertTrue(client.lastSubmitRequest.getReturnMarkdown());
|
||||||
|
Assert.assertTrue(client.lastSubmitRequest.getReturnMiddleJson());
|
||||||
|
Assert.assertTrue(client.lastSubmitRequest.getReturnContentList());
|
||||||
|
Assert.assertTrue(client.lastSubmitRequest.getReturnModelOutput());
|
||||||
|
Assert.assertTrue(client.lastSubmitRequest.getReturnImages());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldUseTaskMetadataWhenQueryingAsyncZipResult() {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper);
|
||||||
|
|
||||||
|
ParseResponse response = service.queryResult("task-1");
|
||||||
|
|
||||||
|
Assert.assertEquals("vlm-http-client", response.getBackend());
|
||||||
|
Assert.assertEquals("3.0.9", response.getVersion());
|
||||||
|
Assert.assertEquals(1, response.getResults().size());
|
||||||
|
Assert.assertEquals("demo", response.getResults().get(0).getFileName());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldReturnCompletedResultInTaskInfo() {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper);
|
||||||
|
|
||||||
|
ParseTaskInfo taskInfo = service.queryTaskInfo("task-1");
|
||||||
|
|
||||||
|
Assert.assertEquals("completed", taskInfo.getStatus());
|
||||||
|
Assert.assertNotNull(taskInfo.getResult());
|
||||||
|
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
|
||||||
|
Assert.assertEquals(1, client.queryResultZipCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldSendRepeatedLangListFields() {
|
||||||
|
InspectingMultipartClient client = new InspectingMultipartClient(defaultProperties());
|
||||||
|
ParseRequest request = buildRequest();
|
||||||
|
request.setLanguages(java.util.Arrays.asList("zh", "en"));
|
||||||
|
|
||||||
|
client.parse(request);
|
||||||
|
|
||||||
|
Assert.assertEquals(2, countOccurrences(client.lastMultipartBody, "name=\"lang_list\""));
|
||||||
|
Assert.assertTrue(client.lastMultipartBody.contains("\r\nzh\r\n"));
|
||||||
|
Assert.assertTrue(client.lastMultipartBody.contains("\r\nen\r\n"));
|
||||||
|
}
|
||||||
|
|
||||||
|
private ParseRequest buildRequest() {
|
||||||
|
ParseRequest request = new ParseRequest();
|
||||||
|
request.addFile(ParseFile.of("demo.pptx", "ppt".getBytes(StandardCharsets.UTF_8)));
|
||||||
|
return request;
|
||||||
|
}
|
||||||
|
|
||||||
|
private MineruProperties defaultProperties() {
|
||||||
|
MineruProperties properties = new MineruProperties();
|
||||||
|
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||||
|
properties.setResultTimeoutMs(50);
|
||||||
|
properties.setPollIntervalMs(1);
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int countOccurrences(String source, String token) {
|
||||||
|
int count = 0;
|
||||||
|
int index = 0;
|
||||||
|
while (source != null && token != null && !token.isEmpty() && (index = source.indexOf(token, index)) >= 0) {
|
||||||
|
count++;
|
||||||
|
index += token.length();
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class RecordingClient extends MineruClient {
|
||||||
|
|
||||||
|
private ParseRequest lastSubmitRequest;
|
||||||
|
private int queryResultZipCount;
|
||||||
|
|
||||||
|
private RecordingClient(MineruProperties properties) {
|
||||||
|
super(properties, new MineruMapper(properties));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MineruTaskStatus submit(ParseRequest request) {
|
||||||
|
this.lastSubmitRequest = request;
|
||||||
|
MineruTaskStatus taskStatus = new MineruTaskStatus();
|
||||||
|
taskStatus.setTaskId("task-1");
|
||||||
|
taskStatus.setStatus("pending");
|
||||||
|
return taskStatus;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MineruTaskStatus queryTask(String taskId) {
|
||||||
|
MineruTaskStatus taskStatus = new MineruTaskStatus();
|
||||||
|
taskStatus.setTaskId(taskId);
|
||||||
|
taskStatus.setStatus("completed");
|
||||||
|
taskStatus.setBackend("vlm-http-client");
|
||||||
|
taskStatus.setVersion("3.0.9");
|
||||||
|
return taskStatus;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] queryResultZip(String taskId) {
|
||||||
|
queryResultZipCount++;
|
||||||
|
try {
|
||||||
|
return buildZipResult();
|
||||||
|
} catch (IOException exception) {
|
||||||
|
throw new IllegalStateException("Failed to build test ZIP", exception);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static byte[] buildZipResult() throws IOException {
|
||||||
|
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
try (ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream)) {
|
||||||
|
addEntry(zipOutputStream, "demo/vlm/demo.md", "# title");
|
||||||
|
addEntry(zipOutputStream, "demo/vlm/demo_middle.json", middleJson().toJSONString());
|
||||||
|
addEntry(zipOutputStream, "demo/vlm/demo_content_list.json", contentList().toJSONString());
|
||||||
|
}
|
||||||
|
return outputStream.toByteArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException {
|
||||||
|
zipOutputStream.putNextEntry(new ZipEntry(name));
|
||||||
|
zipOutputStream.write(content.getBytes(StandardCharsets.UTF_8));
|
||||||
|
zipOutputStream.closeEntry();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static JSONObject middleJson() {
|
||||||
|
JSONObject middleJson = new JSONObject();
|
||||||
|
middleJson.put("_backend", "vlm");
|
||||||
|
middleJson.put("_version_name", "3.0.9");
|
||||||
|
middleJson.put("pdf_info", new com.alibaba.fastjson2.JSONArray());
|
||||||
|
return middleJson;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static com.alibaba.fastjson2.JSONArray contentList() {
|
||||||
|
com.alibaba.fastjson2.JSONArray contentList = new com.alibaba.fastjson2.JSONArray();
|
||||||
|
JSONObject text = new JSONObject();
|
||||||
|
text.put("type", "text");
|
||||||
|
text.put("text", "title");
|
||||||
|
text.put("page_idx", 0);
|
||||||
|
text.put("bbox", new com.alibaba.fastjson2.JSONArray());
|
||||||
|
contentList.add(text);
|
||||||
|
return contentList;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class InspectingMultipartClient extends MineruClient {
|
||||||
|
|
||||||
|
private String lastMultipartBody;
|
||||||
|
|
||||||
|
private InspectingMultipartClient(MineruProperties properties) {
|
||||||
|
super(properties, new MineruMapper(properties));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected JSONObject executeJsonRequest(String path, Request request) {
|
||||||
|
try {
|
||||||
|
Buffer buffer = new Buffer();
|
||||||
|
request.body().writeTo(buffer);
|
||||||
|
this.lastMultipartBody = buffer.readUtf8();
|
||||||
|
} catch (IOException exception) {
|
||||||
|
throw new IllegalStateException("Failed to inspect multipart body", exception);
|
||||||
|
}
|
||||||
|
return new JSONObject();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
package com.easyagents.document.pdf;
|
package com.easyagents.document.pdf;
|
||||||
|
|
||||||
import com.easyagents.document.core.DocumentParseService;
|
import com.easyagents.document.core.DocumentParseService;
|
||||||
|
import com.easyagents.document.core.entity.PdfParseRequest;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* PDF 文档解析服务。
|
* PDF 文档解析服务。
|
||||||
@@ -8,5 +9,5 @@ import com.easyagents.document.core.DocumentParseService;
|
|||||||
* @author Codex
|
* @author Codex
|
||||||
* @since 2026-04-14
|
* @since 2026-04-14
|
||||||
*/
|
*/
|
||||||
public interface PdfDocumentParseService extends DocumentParseService {
|
public interface PdfDocumentParseService extends DocumentParseService<PdfParseRequest> {
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,31 +1,23 @@
|
|||||||
package com.easyagents.document.pdf.mineru;
|
package com.easyagents.document.pdf.mineru;
|
||||||
|
|
||||||
import com.easyagents.core.util.StringUtil;
|
import com.easyagents.document.core.mineru.MineruClient;
|
||||||
import com.easyagents.document.core.exception.DocumentParseException;
|
import com.easyagents.document.core.mineru.MineruDocumentParseService;
|
||||||
import com.easyagents.document.core.model.ParseRequest;
|
import com.easyagents.document.core.mineru.MineruMapper;
|
||||||
import com.easyagents.document.core.model.ParseResponse;
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
import com.easyagents.document.core.entity.PdfParseRequest;
|
||||||
import com.easyagents.document.pdf.PdfDocumentProvider;
|
import com.easyagents.document.pdf.PdfDocumentProvider;
|
||||||
import org.slf4j.Logger;
|
import com.easyagents.core.util.StringUtil;
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 基于 MinerU API 的 PDF 解析服务。
|
* 基于 MinerU API 的 PDF 解析服务。
|
||||||
*
|
*
|
||||||
* @author Codex
|
* @author Codex
|
||||||
* @since 2026-04-14
|
* @since 2026-04-16
|
||||||
*/
|
*/
|
||||||
public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
public class MineruPdfDocumentParseService extends MineruDocumentParseService<PdfParseRequest> implements PdfDocumentProvider {
|
||||||
|
|
||||||
public static final String PROVIDER_NAME = "mineru";
|
public static final String PROVIDER_NAME = "mineru";
|
||||||
private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class);
|
|
||||||
|
|
||||||
private final MineruProperties properties;
|
|
||||||
private final MineruPdfClient client;
|
|
||||||
private final MineruMapper mapper;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 创建默认服务实例。
|
* 创建默认服务实例。
|
||||||
@@ -33,7 +25,7 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
|||||||
* @param properties MinerU 配置
|
* @param properties MinerU 配置
|
||||||
*/
|
*/
|
||||||
public MineruPdfDocumentParseService(MineruProperties properties) {
|
public MineruPdfDocumentParseService(MineruProperties properties) {
|
||||||
this(properties, new MineruMapper(properties));
|
super(properties);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -43,7 +35,7 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
|||||||
* @param mapper 结果映射器
|
* @param mapper 结果映射器
|
||||||
*/
|
*/
|
||||||
public MineruPdfDocumentParseService(MineruProperties properties, MineruMapper mapper) {
|
public MineruPdfDocumentParseService(MineruProperties properties, MineruMapper mapper) {
|
||||||
this(properties, new MineruPdfClient(properties, mapper), mapper);
|
super(properties, mapper);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -53,10 +45,8 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
|||||||
* @param client HTTP 客户端
|
* @param client HTTP 客户端
|
||||||
* @param mapper 结果映射器
|
* @param mapper 结果映射器
|
||||||
*/
|
*/
|
||||||
public MineruPdfDocumentParseService(MineruProperties properties, MineruPdfClient client, MineruMapper mapper) {
|
public MineruPdfDocumentParseService(MineruProperties properties, MineruClient client, MineruMapper mapper) {
|
||||||
this.properties = properties;
|
super(properties, client, mapper);
|
||||||
this.client = client;
|
|
||||||
this.mapper = mapper;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -65,145 +55,21 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ParseResponse parse(ParseRequest request) {
|
protected ParseRequest normalizeRequest(ParseRequest request) {
|
||||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
PdfParseRequest normalizedRequest = PdfParseRequest.from(request);
|
||||||
LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}",
|
ParseRequest commonRequest = super.normalizeRequest(normalizedRequest);
|
||||||
PROVIDER_NAME,
|
commonRequest.copyCommonFieldsTo(normalizedRequest);
|
||||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
normalizedRequest.setParseMethod(
|
||||||
normalizedRequest.getBackend(),
|
StringUtil.hasText(normalizedRequest.getParseMethod()) ? normalizedRequest.getParseMethod() : getProperties().getDefaultParseMethod()
|
||||||
normalizedRequest.getParseMethod());
|
|
||||||
ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
|
|
||||||
LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
|
|
||||||
PROVIDER_NAME,
|
|
||||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
|
||||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
|
||||||
return response;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public ParseTaskStatus submit(ParseRequest request) {
|
|
||||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
|
||||||
// 异步结果固定走全量 ZIP,调用方无需传入裁剪参数。
|
|
||||||
normalizedRequest.setReturnMarkdown(true);
|
|
||||||
normalizedRequest.setReturnMiddleJson(true);
|
|
||||||
normalizedRequest.setReturnContentList(true);
|
|
||||||
normalizedRequest.setReturnModelOutput(true);
|
|
||||||
normalizedRequest.setReturnImages(true);
|
|
||||||
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}",
|
|
||||||
PROVIDER_NAME,
|
|
||||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
|
||||||
normalizedRequest.getBackend(),
|
|
||||||
normalizedRequest.getParseMethod());
|
|
||||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
|
||||||
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
|
|
||||||
PROVIDER_NAME,
|
|
||||||
taskStatus == null ? null : taskStatus.getTaskId(),
|
|
||||||
taskStatus == null ? null : taskStatus.getStatus());
|
|
||||||
return taskStatus;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public ParseTaskStatus queryTask(String taskId) {
|
|
||||||
validateTaskId(taskId);
|
|
||||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
|
|
||||||
LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
|
|
||||||
PROVIDER_NAME,
|
|
||||||
taskId,
|
|
||||||
taskStatus == null ? null : taskStatus.getStatus());
|
|
||||||
return taskStatus;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public ParseResponse queryResult(String taskId) {
|
|
||||||
validateTaskId(taskId);
|
|
||||||
LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);
|
|
||||||
MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
|
|
||||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
|
||||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
|
||||||
LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
|
|
||||||
PROVIDER_NAME,
|
|
||||||
taskId,
|
|
||||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
|
||||||
return response;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
|
||||||
validateTaskId(taskId);
|
|
||||||
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
|
||||||
ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(mapper.toParseTaskStatus(taskStatus));
|
|
||||||
if ("completed".equalsIgnoreCase(taskStatus.getStatus())) {
|
|
||||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
|
||||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
|
||||||
taskInfo.setResult(response);
|
|
||||||
}
|
|
||||||
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
|
|
||||||
PROVIDER_NAME,
|
|
||||||
taskId,
|
|
||||||
taskInfo == null ? null : taskInfo.getStatus(),
|
|
||||||
taskInfo != null && taskInfo.getResult() != null);
|
|
||||||
return taskInfo;
|
|
||||||
}
|
|
||||||
|
|
||||||
private ParseRequest normalizeRequest(ParseRequest request) {
|
|
||||||
if (request == null) {
|
|
||||||
throw new IllegalArgumentException("ParseRequest must not be null");
|
|
||||||
}
|
|
||||||
if (request.getFiles() == null || request.getFiles().isEmpty()) {
|
|
||||||
throw new IllegalArgumentException("ParseRequest files must not be empty");
|
|
||||||
}
|
|
||||||
ParseRequest normalizedRequest = new ParseRequest();
|
|
||||||
normalizedRequest.setFiles(new ArrayList<>(request.getFiles()));
|
|
||||||
normalizedRequest.setBackend(StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
|
|
||||||
normalizedRequest.setParseMethod(StringUtil.hasText(request.getParseMethod()) ? request.getParseMethod() : properties.getDefaultParseMethod());
|
|
||||||
normalizedRequest.setLanguages(
|
|
||||||
request.getLanguages() == null || request.getLanguages().isEmpty()
|
|
||||||
? new ArrayList<String>(properties.getDefaultLangList())
|
|
||||||
: new ArrayList<String>(request.getLanguages())
|
|
||||||
);
|
);
|
||||||
normalizedRequest.setFormulaEnabled(request.getFormulaEnabled() == null ? properties.getDefaultFormulaEnable() : request.getFormulaEnabled());
|
normalizedRequest.setFormulaEnabled(
|
||||||
normalizedRequest.setTableEnabled(request.getTableEnabled() == null ? properties.getDefaultTableEnable() : request.getTableEnabled());
|
normalizedRequest.getFormulaEnabled() == null ? getProperties().getDefaultFormulaEnable() : normalizedRequest.getFormulaEnabled()
|
||||||
normalizedRequest.setStartPageIndex(request.getStartPageIndex() == null ? 0 : request.getStartPageIndex());
|
);
|
||||||
normalizedRequest.setEndPageIndex(request.getEndPageIndex() == null ? 99999 : request.getEndPageIndex());
|
normalizedRequest.setTableEnabled(
|
||||||
normalizedRequest.setReturnMarkdown(request.getReturnMarkdown());
|
normalizedRequest.getTableEnabled() == null ? getProperties().getDefaultTableEnable() : normalizedRequest.getTableEnabled()
|
||||||
normalizedRequest.setReturnMiddleJson(request.getReturnMiddleJson());
|
);
|
||||||
normalizedRequest.setReturnContentList(request.getReturnContentList());
|
normalizedRequest.setStartPageIndex(normalizedRequest.getStartPageIndex() == null ? 0 : normalizedRequest.getStartPageIndex());
|
||||||
normalizedRequest.setReturnModelOutput(request.getReturnModelOutput());
|
normalizedRequest.setEndPageIndex(normalizedRequest.getEndPageIndex() == null ? 99999 : normalizedRequest.getEndPageIndex());
|
||||||
normalizedRequest.setReturnImages(request.getReturnImages());
|
|
||||||
return normalizedRequest;
|
return normalizedRequest;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void validateTaskId(String taskId) {
|
|
||||||
if (!StringUtil.hasText(taskId)) {
|
|
||||||
throw new IllegalArgumentException("taskId must not be empty");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 轮询任务状态直到完成或失败。
|
|
||||||
*
|
|
||||||
* @param taskId 任务 ID
|
|
||||||
* @return 已完成的任务状态
|
|
||||||
*/
|
|
||||||
private MineruTaskStatus waitForTaskCompleted(String taskId) {
|
|
||||||
long deadline = System.currentTimeMillis() + properties.getResultTimeoutMs();
|
|
||||||
while (true) {
|
|
||||||
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
|
||||||
if ("completed".equals(taskStatus.getStatus())) {
|
|
||||||
return taskStatus;
|
|
||||||
}
|
|
||||||
if ("failed".equals(taskStatus.getStatus())) {
|
|
||||||
throw new DocumentParseException("MinerU task failed: " + taskStatus.getError());
|
|
||||||
}
|
|
||||||
if (System.currentTimeMillis() >= deadline) {
|
|
||||||
throw new DocumentParseException("MinerU task result timeout: " + taskId);
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
Thread.sleep(properties.getPollIntervalMs());
|
|
||||||
} catch (InterruptedException exception) {
|
|
||||||
Thread.currentThread().interrupt();
|
|
||||||
throw new DocumentParseException("Interrupted while waiting for MinerU task: " + taskId, exception);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,10 +2,13 @@ package com.easyagents.document.pdf.mineru;
|
|||||||
|
|
||||||
import com.alibaba.fastjson2.JSONArray;
|
import com.alibaba.fastjson2.JSONArray;
|
||||||
import com.alibaba.fastjson2.JSONObject;
|
import com.alibaba.fastjson2.JSONObject;
|
||||||
|
import com.easyagents.document.core.mineru.MineruMapper;
|
||||||
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
|
import com.easyagents.document.core.mineru.MineruResultPayload;
|
||||||
import com.easyagents.document.core.exception.DocumentParseException;
|
import com.easyagents.document.core.exception.DocumentParseException;
|
||||||
import com.easyagents.document.core.model.ParseRequest;
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
import com.easyagents.document.core.model.ParseResponse;
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
import com.easyagents.document.core.model.ParseResult;
|
import com.easyagents.document.core.entity.ParseResult;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
@@ -41,6 +44,7 @@ public class MineruMapperTest {
|
|||||||
Assert.assertFalse(result.getBlocks().isEmpty());
|
Assert.assertFalse(result.getBlocks().isEmpty());
|
||||||
Assert.assertEquals(1, result.getTables().size());
|
Assert.assertEquals(1, result.getTables().size());
|
||||||
Assert.assertEquals(2, result.getImages().size());
|
Assert.assertEquals(2, result.getImages().size());
|
||||||
|
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||||
Assert.assertNotNull(result.getArtifacts().getMiddleJson());
|
Assert.assertNotNull(result.getArtifacts().getMiddleJson());
|
||||||
Assert.assertNotNull(result.getArtifacts().getContentList());
|
Assert.assertNotNull(result.getArtifacts().getContentList());
|
||||||
}
|
}
|
||||||
@@ -56,6 +60,7 @@ public class MineruMapperTest {
|
|||||||
Assert.assertEquals("# title", result.getPlainText());
|
Assert.assertEquals("# title", result.getPlainText());
|
||||||
Assert.assertEquals(1, result.getTables().size());
|
Assert.assertEquals(1, result.getTables().size());
|
||||||
Assert.assertEquals(2, result.getImages().size());
|
Assert.assertEquals(2, result.getImages().size());
|
||||||
|
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||||
Assert.assertNotNull(result.getArtifacts().getExtraJsonArtifacts().get("contentListV2"));
|
Assert.assertNotNull(result.getArtifacts().getExtraJsonArtifacts().get("contentListV2"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,11 +1,16 @@
|
|||||||
package com.easyagents.document.pdf.mineru;
|
package com.easyagents.document.pdf.mineru;
|
||||||
|
|
||||||
import com.alibaba.fastjson2.JSONObject;
|
import com.alibaba.fastjson2.JSONObject;
|
||||||
import com.easyagents.document.core.model.ParseFile;
|
import com.easyagents.document.core.mineru.MineruClient;
|
||||||
import com.easyagents.document.core.model.ParseRequest;
|
import com.easyagents.document.core.mineru.MineruMapper;
|
||||||
import com.easyagents.document.core.model.ParseResponse;
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
import com.easyagents.document.core.mineru.MineruResultPayload;
|
||||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
import com.easyagents.document.core.mineru.MineruTaskStatus;
|
||||||
|
import com.easyagents.document.core.entity.ParseFile;
|
||||||
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
import okhttp3.Request;
|
import okhttp3.Request;
|
||||||
import okio.Buffer;
|
import okio.Buffer;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
@@ -147,7 +152,7 @@ public class MineruPdfDocumentParseServiceTest {
|
|||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class RecordingClient extends MineruPdfClient {
|
private static class RecordingClient extends MineruClient {
|
||||||
|
|
||||||
private ParseRequest lastParseRequest;
|
private ParseRequest lastParseRequest;
|
||||||
private ParseRequest lastSubmitRequest;
|
private ParseRequest lastSubmitRequest;
|
||||||
@@ -248,7 +253,7 @@ public class MineruPdfDocumentParseServiceTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class InspectingMultipartClient extends MineruPdfClient {
|
private static class InspectingMultipartClient extends MineruClient {
|
||||||
|
|
||||||
private String lastMultipartBody;
|
private String lastMultipartBody;
|
||||||
|
|
||||||
|
|||||||
44
easy-agents-document/easy-agents-document-pptx/pom.xml
Normal file
44
easy-agents-document/easy-agents-document-pptx/pom.xml
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<parent>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document</artifactId>
|
||||||
|
<version>${revision}</version>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<artifactId>easy-agents-document-pptx</artifactId>
|
||||||
|
<name>easy-agents-document-pptx</name>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<maven.compiler.source>8</maven.compiler.source>
|
||||||
|
<maven.compiler.target>8</maven.compiler.target>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.alibaba.fastjson2</groupId>
|
||||||
|
<artifactId>fastjson2</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</project>
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
package com.easyagents.document.pptx;
|
||||||
|
|
||||||
|
import com.easyagents.document.core.DocumentParseService;
|
||||||
|
import com.easyagents.document.core.entity.PptxParseRequest;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PPTX 文档解析服务。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public interface PptxDocumentParseService extends DocumentParseService<PptxParseRequest> {
|
||||||
|
}
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
package com.easyagents.document.pptx;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PPTX provider SPI。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public interface PptxDocumentProvider extends PptxDocumentParseService {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取 provider 标识。
|
||||||
|
*
|
||||||
|
* @return provider 名称
|
||||||
|
*/
|
||||||
|
String getProvider();
|
||||||
|
}
|
||||||
@@ -0,0 +1,408 @@
|
|||||||
|
package com.easyagents.document.pptx.mineru;
|
||||||
|
|
||||||
|
import com.easyagents.core.util.StringUtil;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskRepository;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskUpdater;
|
||||||
|
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||||
|
import com.easyagents.document.core.mineru.MineruClient;
|
||||||
|
import com.easyagents.document.core.mineru.MineruMapper;
|
||||||
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
|
import com.easyagents.document.core.entity.DocumentBlock;
|
||||||
|
import com.easyagents.document.core.entity.DocumentImage;
|
||||||
|
import com.easyagents.document.core.entity.DocumentPage;
|
||||||
|
import com.easyagents.document.core.entity.DocumentTable;
|
||||||
|
import com.easyagents.document.core.entity.ParseFile;
|
||||||
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseResult;
|
||||||
|
import com.easyagents.document.core.entity.PptxParseRequest;
|
||||||
|
import com.easyagents.document.core.support.AbstractAsyncDocumentParseService;
|
||||||
|
import com.easyagents.document.pptx.PptxDocumentProvider;
|
||||||
|
import com.easyagents.document.pptx.model.PptxParseArtifact;
|
||||||
|
import com.easyagents.document.pptx.model.PptxSlideArtifact;
|
||||||
|
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||||
|
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
||||||
|
|
||||||
|
import javax.imageio.ImageIO;
|
||||||
|
import java.awt.Color;
|
||||||
|
import java.awt.Dimension;
|
||||||
|
import java.awt.Graphics2D;
|
||||||
|
import java.awt.RenderingHints;
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 基于 MinerU 的 PPTX 文档解析服务。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class MineruPptxDocumentParseService extends AbstractAsyncDocumentParseService<PptxParseRequest> implements PptxDocumentProvider {
|
||||||
|
|
||||||
|
public static final String PROVIDER_NAME = "mineru";
|
||||||
|
|
||||||
|
private final MineruProperties properties;
|
||||||
|
private final MineruClient client;
|
||||||
|
private final MineruMapper mapper;
|
||||||
|
|
||||||
|
/**
 * Creates a service from MinerU settings only.
 *
 * <p>A {@link MineruMapper} is built from the same settings and the two-argument
 * constructor is used for the rest of the wiring.
 *
 * @param properties MinerU configuration
 */
public MineruPptxDocumentParseService(MineruProperties properties) {
    this(properties, new MineruMapper(properties));
}
|
||||||
|
|
||||||
|
/**
 * Creates a service with a caller-supplied mapper.
 *
 * <p>The MinerU client is built from {@code properties} and {@code mapper}; the async task
 * manager comes from {@code defaultTaskManager()}.
 *
 * @param properties MinerU configuration
 * @param mapper     MinerU mapper
 */
public MineruPptxDocumentParseService(MineruProperties properties, MineruMapper mapper) {
    this(properties, new MineruClient(properties, mapper), mapper, defaultTaskManager());
}
|
||||||
|
|
||||||
|
/**
 * Creates a service with a caller-supplied async task manager.
 *
 * <p>A {@link MineruMapper} is built from {@code properties}.
 *
 * @param properties  MinerU configuration
 * @param taskManager async task manager
 */
public MineruPptxDocumentParseService(MineruProperties properties, DocumentAsyncTaskManager taskManager) {
    this(properties, new MineruMapper(properties), taskManager);
}
|
||||||
|
|
||||||
|
/**
 * Creates a service with a caller-supplied mapper and async task manager.
 *
 * <p>The MinerU client is built from {@code properties} and {@code mapper}.
 *
 * @param properties  MinerU configuration
 * @param mapper      MinerU mapper
 * @param taskManager async task manager
 */
public MineruPptxDocumentParseService(MineruProperties properties,
                                      MineruMapper mapper,
                                      DocumentAsyncTaskManager taskManager) {
    this(properties, new MineruClient(properties, mapper), mapper, taskManager);
}
|
||||||
|
|
||||||
|
/**
 * Creates a service from fully supplied collaborators; every other constructor ends up here.
 *
 * @param properties  MinerU configuration
 * @param client      MinerU client
 * @param mapper      MinerU mapper
 * @param taskManager async task manager, handed to the superclass
 */
public MineruPptxDocumentParseService(MineruProperties properties,
                                      MineruClient client,
                                      MineruMapper mapper,
                                      DocumentAsyncTaskManager taskManager) {
    super(taskManager);
    this.mapper = mapper;
    this.client = client;
    this.properties = properties;
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getProvider() {
|
||||||
|
return PROVIDER_NAME;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected PptxParseRequest normalizeRequest(ParseRequest request) {
|
||||||
|
PptxParseRequest normalized = PptxParseRequest.from(request);
|
||||||
|
if (normalized.getFiles() == null || normalized.getFiles().isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("PptxParseRequest files must not be empty");
|
||||||
|
}
|
||||||
|
normalized.setBackend(StringUtil.hasText(normalized.getBackend()) ? normalized.getBackend() : properties.getDefaultBackend());
|
||||||
|
if (normalized.getLanguages() == null || normalized.getLanguages().isEmpty()) {
|
||||||
|
normalized.setLanguages(new ArrayList<String>(properties.getDefaultLangList()));
|
||||||
|
}
|
||||||
|
normalized.setReturnMarkdown(normalized.getReturnMarkdown() == null ? Boolean.TRUE : normalized.getReturnMarkdown());
|
||||||
|
normalized.setReturnMiddleJson(normalized.getReturnMiddleJson() == null ? Boolean.TRUE : normalized.getReturnMiddleJson());
|
||||||
|
normalized.setReturnContentList(normalized.getReturnContentList() == null ? Boolean.TRUE : normalized.getReturnContentList());
|
||||||
|
normalized.setReturnModelOutput(normalized.getReturnModelOutput() == null ? Boolean.FALSE : normalized.getReturnModelOutput());
|
||||||
|
normalized.setReturnImages(normalized.getReturnImages() == null ? Boolean.TRUE : normalized.getReturnImages());
|
||||||
|
normalized.setRenderScale(normalized.getRenderScale() == null || normalized.getRenderScale() <= 0 ? 2.0d : normalized.getRenderScale());
|
||||||
|
normalized.setImageFormat(normalizeImageFormat(normalized.getImageFormat()));
|
||||||
|
normalized.setIncludeSlideImageReference(
|
||||||
|
normalized.getIncludeSlideImageReference() == null ? Boolean.TRUE : normalized.getIncludeSlideImageReference()
|
||||||
|
);
|
||||||
|
return normalized;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected ParseResponse doParse(PptxParseRequest request, DocumentAsyncTaskUpdater updater) {
|
||||||
|
ParseResponse response = new ParseResponse();
|
||||||
|
List<ParseResult> results = new ArrayList<ParseResult>();
|
||||||
|
int totalSlides = countSlides(request);
|
||||||
|
int processedSlides = 0;
|
||||||
|
String backend = null;
|
||||||
|
String version = null;
|
||||||
|
|
||||||
|
for (ParseFile file : request.getFiles()) {
|
||||||
|
ParseResult result = parseSinglePptx(file, request, updater, processedSlides, totalSlides);
|
||||||
|
processedSlides += Integer.parseInt(String.valueOf(result.getMetadata().get("slideCount")));
|
||||||
|
if (backend == null) {
|
||||||
|
backend = (String) result.getMetadata().get("ocrBackend");
|
||||||
|
}
|
||||||
|
if (version == null) {
|
||||||
|
version = (String) result.getMetadata().get("ocrVersion");
|
||||||
|
}
|
||||||
|
result.getMetadata().remove("slideCount");
|
||||||
|
result.getMetadata().remove("ocrBackend");
|
||||||
|
result.getMetadata().remove("ocrVersion");
|
||||||
|
results.add(result);
|
||||||
|
}
|
||||||
|
response.setBackend(StringUtil.hasText(backend) ? backend : request.getBackend());
|
||||||
|
response.setVersion(version);
|
||||||
|
response.setResults(results);
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
private ParseResult parseSinglePptx(ParseFile file,
|
||||||
|
PptxParseRequest request,
|
||||||
|
DocumentAsyncTaskUpdater updater,
|
||||||
|
int processedSlidesBefore,
|
||||||
|
int totalSlides) {
|
||||||
|
ParseResult aggregate = new ParseResult();
|
||||||
|
aggregate.setFileName(file.getFileName());
|
||||||
|
StringBuilder markdownBuilder = new StringBuilder();
|
||||||
|
PptxParseArtifact artifact = new PptxParseArtifact();
|
||||||
|
String backend = null;
|
||||||
|
String version = null;
|
||||||
|
int slideCount = 0;
|
||||||
|
|
||||||
|
try (XMLSlideShow slideShow = new XMLSlideShow(new ByteArrayInputStream(file.getContent()))) {
|
||||||
|
List<XSLFSlide> slides = slideShow.getSlides();
|
||||||
|
Dimension pageSize = slideShow.getPageSize();
|
||||||
|
int startSlide = request.getStartSlideIndex() == null ? 0 : Math.max(request.getStartSlideIndex(), 0);
|
||||||
|
int endSlide = request.getEndSlideIndex() == null
|
||||||
|
? slides.size() - 1
|
||||||
|
: Math.min(request.getEndSlideIndex(), slides.size() - 1);
|
||||||
|
if (endSlide < startSlide) {
|
||||||
|
endSlide = startSlide - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int slideIndex = startSlide; slideIndex <= endSlide; slideIndex++) {
|
||||||
|
XSLFSlide slide = slides.get(slideIndex);
|
||||||
|
slideCount++;
|
||||||
|
updateProgress(updater, "extracting", processedSlidesBefore + slideCount - 1, totalSlides,
|
||||||
|
"正在渲染第 " + (slideIndex + 1) + " 页幻灯片");
|
||||||
|
|
||||||
|
byte[] imageBytes = renderSlide(slide, pageSize, request.getRenderScale(), request.getImageFormat());
|
||||||
|
String imagePath = buildImagePath(slideIndex, request.getImageFormat());
|
||||||
|
String imageName = buildImageName(slideIndex);
|
||||||
|
|
||||||
|
updateProgress(updater, "ocr", processedSlidesBefore + slideCount - 1, totalSlides,
|
||||||
|
"正在识别第 " + (slideIndex + 1) + " 页幻灯片");
|
||||||
|
ParseResult ocrResult = parseSlideImage(slideIndex, imageBytes, request, imagePath);
|
||||||
|
|
||||||
|
if (!StringUtil.hasText(backend)) {
|
||||||
|
backend = (String) ocrResult.getMetadata().get("middleBackend");
|
||||||
|
}
|
||||||
|
if (!StringUtil.hasText(version)) {
|
||||||
|
version = (String) ocrResult.getMetadata().get("middleVersion");
|
||||||
|
}
|
||||||
|
|
||||||
|
appendSlideMarkdown(markdownBuilder, slideIndex, imageName, imagePath, request, ocrResult.getMarkdown());
|
||||||
|
aggregate.getImages().add(buildSlideImage(slideIndex, imageName, imagePath, request.getImageFormat(), imageBytes));
|
||||||
|
aggregate.getPages().add(buildPage(slideIndex, pageSize, request.getRenderScale()));
|
||||||
|
mergeOcrResult(aggregate, slideIndex, ocrResult);
|
||||||
|
artifact.getSlides().add(buildSlideArtifact(slideIndex, slide, imageName, imagePath, ocrResult));
|
||||||
|
}
|
||||||
|
} catch (IOException exception) {
|
||||||
|
throw new IllegalStateException("Failed to parse PPTX file: " + file.getFileName(), exception);
|
||||||
|
}
|
||||||
|
|
||||||
|
updateProgress(updater, "assembling", processedSlidesBefore + slideCount, totalSlides, "正在汇总 PPTX 解析结果");
|
||||||
|
aggregate.setMarkdown(markdownBuilder.toString().trim());
|
||||||
|
aggregate.setPlainText(aggregate.getMarkdown());
|
||||||
|
aggregate.getArtifacts().getExtraJsonArtifacts().put("pptx", artifact);
|
||||||
|
aggregate.getMetadata().put("slideCount", slideCount);
|
||||||
|
aggregate.getMetadata().put("ocrBackend", backend);
|
||||||
|
aggregate.getMetadata().put("ocrVersion", version);
|
||||||
|
return aggregate;
|
||||||
|
}
|
||||||
|
|
||||||
|
private ParseResult parseSlideImage(int slideIndex, byte[] imageBytes, PptxParseRequest request, String imagePath) {
|
||||||
|
ParseRequest imageRequest = new ParseRequest();
|
||||||
|
imageRequest.addFile(ParseFile.of("slide-" + (slideIndex + 1) + "." + request.getImageFormat(), imageBytes, "image/" + request.getImageFormat()));
|
||||||
|
imageRequest.setBackend(request.getBackend());
|
||||||
|
imageRequest.setLanguages(request.getLanguages());
|
||||||
|
imageRequest.setReturnMarkdown(true);
|
||||||
|
imageRequest.setReturnMiddleJson(true);
|
||||||
|
imageRequest.setReturnContentList(true);
|
||||||
|
imageRequest.setReturnModelOutput(false);
|
||||||
|
imageRequest.setReturnImages(false);
|
||||||
|
ParseResponse response = mapper.toParseResponse(client.parse(imageRequest));
|
||||||
|
ParseResult result = response.getResults().isEmpty() ? new ParseResult() : response.getResults().get(0);
|
||||||
|
if (!StringUtil.hasText(result.getMarkdown())) {
|
||||||
|
result.setMarkdown(result.getPlainText());
|
||||||
|
}
|
||||||
|
result.getMetadata().put("slideImagePath", imagePath);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void appendSlideMarkdown(StringBuilder markdownBuilder,
|
||||||
|
int slideIndex,
|
||||||
|
String imageName,
|
||||||
|
String imagePath,
|
||||||
|
PptxParseRequest request,
|
||||||
|
String ocrMarkdown) {
|
||||||
|
if (markdownBuilder.length() > 0) {
|
||||||
|
markdownBuilder.append("\n\n");
|
||||||
|
}
|
||||||
|
markdownBuilder.append("# Slide ").append(slideIndex + 1).append("\n\n");
|
||||||
|
if (Boolean.TRUE.equals(request.getIncludeSlideImageReference())) {
|
||||||
|
markdownBuilder.append(".append(imagePath).append(")\n\n");
|
||||||
|
}
|
||||||
|
if (StringUtil.hasText(ocrMarkdown)) {
|
||||||
|
markdownBuilder.append(ocrMarkdown.trim());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentImage buildSlideImage(int slideIndex, String imageName, String imagePath, String imageFormat, byte[] imageBytes) {
|
||||||
|
DocumentImage image = new DocumentImage();
|
||||||
|
image.setPageIndex(slideIndex);
|
||||||
|
image.setName(imageName);
|
||||||
|
image.setSourcePath(imagePath);
|
||||||
|
image.setMimeType("image/" + imageFormat);
|
||||||
|
image.setContent(imageBytes);
|
||||||
|
return image;
|
||||||
|
}
|
||||||
|
|
||||||
|
private DocumentPage buildPage(int slideIndex, Dimension pageSize, Double renderScale) {
|
||||||
|
DocumentPage page = new DocumentPage();
|
||||||
|
page.setPageIndex(slideIndex);
|
||||||
|
page.setWidth(pageSize.getWidth() * renderScale);
|
||||||
|
page.setHeight(pageSize.getHeight() * renderScale);
|
||||||
|
return page;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void mergeOcrResult(ParseResult aggregate, int slideIndex, ParseResult ocrResult) {
|
||||||
|
for (DocumentBlock block : ocrResult.getBlocks()) {
|
||||||
|
block.setPageIndex(slideIndex);
|
||||||
|
aggregate.getBlocks().add(block);
|
||||||
|
}
|
||||||
|
for (DocumentTable table : ocrResult.getTables()) {
|
||||||
|
table.setPageIndex(slideIndex);
|
||||||
|
aggregate.getTables().add(table);
|
||||||
|
}
|
||||||
|
for (String warning : ocrResult.getWarnings()) {
|
||||||
|
aggregate.getWarnings().add("Slide " + (slideIndex + 1) + ": " + warning);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private PptxSlideArtifact buildSlideArtifact(int slideIndex,
|
||||||
|
XSLFSlide slide,
|
||||||
|
String imageName,
|
||||||
|
String imagePath,
|
||||||
|
ParseResult ocrResult) {
|
||||||
|
PptxSlideArtifact artifact = new PptxSlideArtifact();
|
||||||
|
artifact.setSlideIndex(slideIndex);
|
||||||
|
artifact.setTitle(slide.getTitle());
|
||||||
|
artifact.setImageName(imageName);
|
||||||
|
artifact.setImagePath(imagePath);
|
||||||
|
artifact.setOcrMarkdown(ocrResult.getMarkdown());
|
||||||
|
artifact.setMiddleJson(ocrResult.getArtifacts().getMiddleJson());
|
||||||
|
artifact.setContentList(ocrResult.getArtifacts().getContentList());
|
||||||
|
artifact.setWarnings(new ArrayList<String>(ocrResult.getWarnings()));
|
||||||
|
return artifact;
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] renderSlide(XSLFSlide slide, Dimension pageSize, Double renderScale, String imageFormat) throws IOException {
|
||||||
|
double scale = renderScale == null ? 2.0d : renderScale;
|
||||||
|
int width = Math.max(1, (int) Math.round(pageSize.getWidth() * scale));
|
||||||
|
int height = Math.max(1, (int) Math.round(pageSize.getHeight() * scale));
|
||||||
|
BufferedImage image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
|
||||||
|
Graphics2D graphics = image.createGraphics();
|
||||||
|
try {
|
||||||
|
graphics.setColor(Color.WHITE);
|
||||||
|
graphics.fillRect(0, 0, width, height);
|
||||||
|
graphics.scale(scale, scale);
|
||||||
|
graphics.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
|
||||||
|
graphics.setRenderingHint(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_QUALITY);
|
||||||
|
graphics.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC);
|
||||||
|
slide.draw(graphics);
|
||||||
|
} finally {
|
||||||
|
graphics.dispose();
|
||||||
|
}
|
||||||
|
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
ImageIO.write(image, imageFormat, outputStream);
|
||||||
|
return outputStream.toByteArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
private int countSlides(PptxParseRequest request) {
|
||||||
|
int totalSlides = 0;
|
||||||
|
for (ParseFile file : request.getFiles()) {
|
||||||
|
try (XMLSlideShow slideShow = new XMLSlideShow(new ByteArrayInputStream(file.getContent()))) {
|
||||||
|
int slideSize = slideShow.getSlides().size();
|
||||||
|
int startSlide = request.getStartSlideIndex() == null ? 0 : Math.max(request.getStartSlideIndex(), 0);
|
||||||
|
int endSlide = request.getEndSlideIndex() == null
|
||||||
|
? slideSize - 1
|
||||||
|
: Math.min(request.getEndSlideIndex(), slideSize - 1);
|
||||||
|
if (endSlide >= startSlide) {
|
||||||
|
totalSlides += endSlide - startSlide + 1;
|
||||||
|
}
|
||||||
|
} catch (IOException exception) {
|
||||||
|
throw new IllegalStateException("Failed to inspect PPTX slide count: " + file.getFileName(), exception);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return totalSlides;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateProgress(DocumentAsyncTaskUpdater updater,
|
||||||
|
String stage,
|
||||||
|
int processedItems,
|
||||||
|
int totalItems,
|
||||||
|
String message) {
|
||||||
|
if (updater == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int safeTotal = totalItems <= 0 ? 1 : totalItems;
|
||||||
|
int percent = (int) Math.min(99, Math.round(processedItems * 100.0d / safeTotal));
|
||||||
|
updater.update(stage, percent, processedItems, totalItems, message);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String normalizeImageFormat(String imageFormat) {
|
||||||
|
if ("jpg".equalsIgnoreCase(imageFormat) || "jpeg".equalsIgnoreCase(imageFormat)) {
|
||||||
|
return "jpg";
|
||||||
|
}
|
||||||
|
return "png";
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildImagePath(int slideIndex, String imageFormat) {
|
||||||
|
return "images/slide-" + formatIndex(slideIndex) + "/page." + imageFormat;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildImageName(int slideIndex) {
|
||||||
|
return "slide-" + formatIndex(slideIndex) + "-page";
|
||||||
|
}
|
||||||
|
|
||||||
|
private String formatIndex(int slideIndex) {
|
||||||
|
int displayIndex = slideIndex + 1;
|
||||||
|
if (displayIndex < 10) {
|
||||||
|
return "00" + displayIndex;
|
||||||
|
}
|
||||||
|
if (displayIndex < 100) {
|
||||||
|
return "0" + displayIndex;
|
||||||
|
}
|
||||||
|
return String.valueOf(displayIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static DocumentAsyncTaskManager defaultTaskManager() {
|
||||||
|
DocumentAsyncTaskRepository repository = new InMemoryDocumentAsyncTaskRepository();
|
||||||
|
ExecutorService executorService = Executors.newFixedThreadPool(2);
|
||||||
|
return new DocumentAsyncTaskManager(repository, executorService);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
package com.easyagents.document.pptx.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PPTX 结构化工件。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class PptxParseArtifact {
|
||||||
|
|
||||||
|
private List<PptxSlideArtifact> slides = new ArrayList<PptxSlideArtifact>();
|
||||||
|
|
||||||
|
public List<PptxSlideArtifact> getSlides() {
|
||||||
|
return slides;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSlides(List<PptxSlideArtifact> slides) {
|
||||||
|
this.slides = slides == null ? new ArrayList<PptxSlideArtifact>() : slides;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,86 @@
|
|||||||
|
package com.easyagents.document.pptx.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Artifact of a single slide: its position, title, rendered image and OCR output.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class PptxSlideArtifact {

    /** Index of the slide. */
    private Integer slideIndex;
    /** Slide title. */
    private String title;
    /** Name of the rendered slide image. */
    private String imageName;
    /** Path of the rendered slide image. */
    private String imagePath;
    /** Markdown produced by OCR for this slide. */
    private String ocrMarkdown;
    /** Middle JSON artifact of the OCR run. */
    private Object middleJson;
    /** Content list artifact of the OCR run. */
    private Object contentList;
    /** Warnings for this slide; never {@code null}. */
    private List<String> warnings = new ArrayList<>();

    public Integer getSlideIndex() {
        return this.slideIndex;
    }

    public void setSlideIndex(Integer slideIndex) {
        this.slideIndex = slideIndex;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getImageName() {
        return this.imageName;
    }

    public void setImageName(String imageName) {
        this.imageName = imageName;
    }

    public String getImagePath() {
        return this.imagePath;
    }

    public void setImagePath(String imagePath) {
        this.imagePath = imagePath;
    }

    public String getOcrMarkdown() {
        return this.ocrMarkdown;
    }

    public void setOcrMarkdown(String ocrMarkdown) {
        this.ocrMarkdown = ocrMarkdown;
    }

    public Object getMiddleJson() {
        return this.middleJson;
    }

    public void setMiddleJson(Object middleJson) {
        this.middleJson = middleJson;
    }

    public Object getContentList() {
        return this.contentList;
    }

    public void setContentList(Object contentList) {
        this.contentList = contentList;
    }

    public List<String> getWarnings() {
        return this.warnings;
    }

    /**
     * Replaces the warnings.
     *
     * @param warnings new list; {@code null} is replaced by an empty list
     */
    public void setWarnings(List<String> warnings) {
        if (warnings == null) {
            this.warnings = new ArrayList<>();
        } else {
            this.warnings = warnings;
        }
    }
}
|
||||||
@@ -0,0 +1,170 @@
|
|||||||
|
package com.easyagents.document.pptx.mineru;
|
||||||
|
|
||||||
|
import com.alibaba.fastjson2.JSONArray;
|
||||||
|
import com.alibaba.fastjson2.JSONObject;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||||
|
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||||
|
import com.easyagents.document.core.mineru.MineruClient;
|
||||||
|
import com.easyagents.document.core.mineru.MineruMapper;
|
||||||
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
|
import com.easyagents.document.core.mineru.MineruResultPayload;
|
||||||
|
import com.easyagents.document.core.entity.ParseFile;
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseResult;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
import com.easyagents.document.core.entity.PptxParseRequest;
|
||||||
|
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||||
|
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
||||||
|
import org.apache.poi.xslf.usermodel.XSLFTextBox;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.awt.Rectangle;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.concurrent.Executor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PPTX MinerU 服务测试。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class MineruPptxDocumentParseServiceTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldBuildMarkdownAndImagesForSlides() throws IOException {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
MineruPptxDocumentParseService service = new MineruPptxDocumentParseService(
|
||||||
|
defaultProperties(),
|
||||||
|
client,
|
||||||
|
mapper,
|
||||||
|
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||||
|
);
|
||||||
|
|
||||||
|
PptxParseRequest request = new PptxParseRequest();
|
||||||
|
request.addFile(ParseFile.of("demo.pptx", buildPptxBytes()));
|
||||||
|
|
||||||
|
ParseResponse response = service.parse(request);
|
||||||
|
|
||||||
|
Assert.assertEquals(1, response.getResults().size());
|
||||||
|
ParseResult result = response.getResults().get(0);
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("# Slide 1"));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("images/slide-001/page.png"));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("slide-ocr-1"));
|
||||||
|
Assert.assertEquals(2, result.getImages().size());
|
||||||
|
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||||
|
Assert.assertNotNull(result.getArtifacts().getExtraJsonArtifacts().get("pptx"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldSupportAsyncTaskFlow() throws IOException {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
MineruPptxDocumentParseService service = new MineruPptxDocumentParseService(
|
||||||
|
defaultProperties(),
|
||||||
|
client,
|
||||||
|
mapper,
|
||||||
|
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||||
|
);
|
||||||
|
|
||||||
|
PptxParseRequest request = new PptxParseRequest();
|
||||||
|
request.addFile(ParseFile.of("demo.pptx", buildPptxBytes()));
|
||||||
|
|
||||||
|
ParseTaskStatus status = service.submit(request);
|
||||||
|
ParseTaskInfo taskInfo = service.queryTaskInfo(status.getTaskId());
|
||||||
|
|
||||||
|
Assert.assertEquals("completed", taskInfo.getStatus());
|
||||||
|
Assert.assertNotNull(taskInfo.getResult());
|
||||||
|
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] buildPptxBytes() throws IOException {
|
||||||
|
XMLSlideShow slideShow = new XMLSlideShow();
|
||||||
|
slideShow.setPageSize(new java.awt.Dimension(640, 360));
|
||||||
|
createSlide(slideShow, "第一页");
|
||||||
|
createSlide(slideShow, "第二页");
|
||||||
|
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
slideShow.write(outputStream);
|
||||||
|
slideShow.close();
|
||||||
|
return outputStream.toByteArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createSlide(XMLSlideShow slideShow, String text) {
|
||||||
|
XSLFSlide slide = slideShow.createSlide();
|
||||||
|
XSLFTextBox textBox = slide.createTextBox();
|
||||||
|
textBox.setAnchor(new Rectangle(20, 20, 300, 80));
|
||||||
|
textBox.setText(text);
|
||||||
|
}
|
||||||
|
|
||||||
|
private MineruProperties defaultProperties() {
|
||||||
|
MineruProperties properties = new MineruProperties();
|
||||||
|
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Executor directExecutor() {
|
||||||
|
return new Executor() {
|
||||||
|
@Override
|
||||||
|
public void execute(Runnable command) {
|
||||||
|
command.run();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class RecordingClient extends MineruClient {
|
||||||
|
|
||||||
|
private int parseCount;
|
||||||
|
|
||||||
|
private RecordingClient(MineruProperties properties) {
|
||||||
|
super(properties, new MineruMapper(properties));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MineruResultPayload parse(com.easyagents.document.core.entity.ParseRequest request) {
|
||||||
|
parseCount++;
|
||||||
|
return new MineruMapper(testProperties()).toResultPayload(syncPayload(parseCount));
|
||||||
|
}
|
||||||
|
|
||||||
|
private JSONObject syncPayload(int index) {
|
||||||
|
JSONObject payload = new JSONObject();
|
||||||
|
payload.put("backend", "vlm-http-client");
|
||||||
|
payload.put("version", "3.0.9");
|
||||||
|
JSONObject result = new JSONObject();
|
||||||
|
result.put("md_content", "slide-ocr-" + index);
|
||||||
|
result.put("middle_json", middleJson());
|
||||||
|
result.put("content_list", contentList(index));
|
||||||
|
JSONObject results = new JSONObject();
|
||||||
|
results.put("slide-" + index, result);
|
||||||
|
payload.put("results", results);
|
||||||
|
return payload;
|
||||||
|
}
|
||||||
|
|
||||||
|
private JSONObject middleJson() {
|
||||||
|
JSONObject middleJson = new JSONObject();
|
||||||
|
middleJson.put("_backend", "vlm-http-client");
|
||||||
|
middleJson.put("_version_name", "3.0.9");
|
||||||
|
middleJson.put("pdf_info", new JSONArray());
|
||||||
|
return middleJson;
|
||||||
|
}
|
||||||
|
|
||||||
|
private JSONArray contentList(int index) {
|
||||||
|
JSONArray contentList = new JSONArray();
|
||||||
|
JSONObject text = new JSONObject();
|
||||||
|
text.put("type", "text");
|
||||||
|
text.put("text", "slide-ocr-" + index);
|
||||||
|
text.put("page_idx", 0);
|
||||||
|
text.put("bbox", new JSONArray());
|
||||||
|
contentList.add(text);
|
||||||
|
return contentList;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static MineruProperties testProperties() {
|
||||||
|
MineruProperties properties = new MineruProperties();
|
||||||
|
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
44
easy-agents-document/easy-agents-document-xlsx/pom.xml
Normal file
44
easy-agents-document/easy-agents-document-xlsx/pom.xml
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<parent>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document</artifactId>
|
||||||
|
<version>${revision}</version>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<artifactId>easy-agents-document-xlsx</artifactId>
|
||||||
|
<name>easy-agents-document-xlsx</name>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<maven.compiler.source>8</maven.compiler.source>
|
||||||
|
<maven.compiler.target>8</maven.compiler.target>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-core</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.alibaba.fastjson2</groupId>
|
||||||
|
<artifactId>fastjson2</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</project>
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
package com.easyagents.document.xlsx;
|
||||||
|
|
||||||
|
import com.easyagents.document.core.DocumentParseService;
|
||||||
|
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XLSX 文档解析服务。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public interface XlsxDocumentParseService extends DocumentParseService<XlsxParseRequest> {
|
||||||
|
}
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
package com.easyagents.document.xlsx;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XLSX provider SPI。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public interface XlsxDocumentProvider extends XlsxDocumentParseService {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 获取 provider 标识。
|
||||||
|
*
|
||||||
|
* @return provider 名称
|
||||||
|
*/
|
||||||
|
String getProvider();
|
||||||
|
}
|
||||||
@@ -0,0 +1,625 @@
|
|||||||
|
package com.easyagents.document.xlsx.mineru;
|
||||||
|
|
||||||
|
import com.easyagents.core.util.StringUtil;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskRepository;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskUpdater;
|
||||||
|
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||||
|
import com.easyagents.document.core.mineru.MineruClient;
|
||||||
|
import com.easyagents.document.core.mineru.MineruMapper;
|
||||||
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
|
import com.easyagents.document.core.entity.DocumentImage;
|
||||||
|
import com.easyagents.document.core.entity.ParseFile;
|
||||||
|
import com.easyagents.document.core.entity.ParseRequest;
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseResult;
|
||||||
|
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||||
|
import com.easyagents.document.core.support.AbstractAsyncDocumentParseService;
|
||||||
|
import com.easyagents.document.xlsx.XlsxDocumentProvider;
|
||||||
|
import com.easyagents.document.xlsx.model.XlsxCellArtifact;
|
||||||
|
import com.easyagents.document.xlsx.model.XlsxCellImageArtifact;
|
||||||
|
import com.easyagents.document.xlsx.model.XlsxParseArtifact;
|
||||||
|
import com.easyagents.document.xlsx.model.XlsxRowArtifact;
|
||||||
|
import com.easyagents.document.xlsx.model.XlsxSheetArtifact;
|
||||||
|
import com.easyagents.document.xlsx.model.XlsxSheetImagesArtifact;
|
||||||
|
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||||
|
import org.apache.poi.ss.usermodel.FormulaEvaluator;
|
||||||
|
import org.apache.poi.ss.util.CellReference;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFClientAnchor;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFDrawing;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFPicture;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFPictureData;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFShape;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.net.URLConnection;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XLSX 文档解析服务,OCR 由 mineru 提供支持
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseService<XlsxParseRequest> implements XlsxDocumentProvider {
|
||||||
|
|
||||||
|
public static final String PROVIDER_NAME = "mineru";
|
||||||
|
|
||||||
|
private final MineruProperties properties;
|
||||||
|
private final MineruClient client;
|
||||||
|
private final MineruMapper mapper;
|
||||||
|
|
||||||
|
/**
 * Creates a service from the MinerU configuration alone.
 *
 * <p>A new {@link MineruMapper} is built from {@code properties} and passed on to the
 * {@code (properties, mapper)} constructor.
 *
 * @param properties MinerU configuration
 */
public MineruXlsxDocumentParseService(MineruProperties properties) {
    this(
            properties,
            new MineruMapper(properties)
    );
}
|
||||||
|
|
||||||
|
/**
 * Creates a service with a caller-supplied mapper.
 *
 * <p>A new {@link MineruClient} is built from {@code properties} and {@code mapper}, and the task
 * manager comes from {@code defaultTaskManager()}.
 *
 * @param properties MinerU configuration
 * @param mapper     MinerU mapper
 */
public MineruXlsxDocumentParseService(MineruProperties properties, MineruMapper mapper) {
    this(
            properties,
            new MineruClient(properties, mapper),
            mapper,
            defaultTaskManager()
    );
}
|
||||||
|
|
||||||
|
/**
 * Creates a service with a caller-supplied async task manager.
 *
 * <p>A new {@link MineruMapper} is built from {@code properties} and passed on to the
 * {@code (properties, mapper, taskManager)} constructor.
 *
 * @param properties  MinerU configuration
 * @param taskManager async task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties, DocumentAsyncTaskManager taskManager) {
    this(
            properties,
            new MineruMapper(properties),
            taskManager
    );
}
|
||||||
|
|
||||||
|
/**
 * Creates a service with a caller-supplied mapper and async task manager.
 *
 * <p>A new {@link MineruClient} is built from {@code properties} and {@code mapper}.
 *
 * @param properties  MinerU configuration
 * @param mapper      MinerU mapper
 * @param taskManager async task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties,
                                      MineruMapper mapper,
                                      DocumentAsyncTaskManager taskManager) {
    this(
            properties,
            new MineruClient(properties, mapper),
            mapper,
            taskManager
    );
}
|
||||||
|
|
||||||
|
/**
 * Creates a service from fully specified collaborators.
 *
 * <p>This is the constructor that actually initializes the instance: the task manager goes to the
 * superclass and the other three arguments are stored as-is, without validation.
 *
 * @param properties  MinerU configuration
 * @param client      MinerU client
 * @param mapper      MinerU mapper
 * @param taskManager async task manager
 */
public MineruXlsxDocumentParseService(MineruProperties properties,
                                      MineruClient client,
                                      MineruMapper mapper,
                                      DocumentAsyncTaskManager taskManager) {
    super(taskManager);
    this.mapper = mapper;
    this.client = client;
    this.properties = properties;
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getProvider() {
|
||||||
|
return PROVIDER_NAME;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected XlsxParseRequest normalizeRequest(ParseRequest request) {
|
||||||
|
XlsxParseRequest normalized = XlsxParseRequest.from(request);
|
||||||
|
if (normalized.getFiles() == null || normalized.getFiles().isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("XlsxParseRequest files must not be empty");
|
||||||
|
}
|
||||||
|
normalized.setBackend(StringUtil.hasText(normalized.getBackend()) ? normalized.getBackend() : properties.getDefaultBackend());
|
||||||
|
if (normalized.getLanguages() == null || normalized.getLanguages().isEmpty()) {
|
||||||
|
normalized.setLanguages(new ArrayList<String>(properties.getDefaultLangList()));
|
||||||
|
}
|
||||||
|
normalized.setReturnMarkdown(normalized.getReturnMarkdown() == null ? Boolean.TRUE : normalized.getReturnMarkdown());
|
||||||
|
normalized.setReturnMiddleJson(normalized.getReturnMiddleJson() == null ? Boolean.FALSE : normalized.getReturnMiddleJson());
|
||||||
|
normalized.setReturnContentList(normalized.getReturnContentList() == null ? Boolean.FALSE : normalized.getReturnContentList());
|
||||||
|
normalized.setReturnModelOutput(normalized.getReturnModelOutput() == null ? Boolean.FALSE : normalized.getReturnModelOutput());
|
||||||
|
normalized.setReturnImages(normalized.getReturnImages() == null ? Boolean.TRUE : normalized.getReturnImages());
|
||||||
|
normalized.setIncludeHiddenSheets(normalized.getIncludeHiddenSheets() == null ? Boolean.FALSE : normalized.getIncludeHiddenSheets());
|
||||||
|
normalized.setOcrEmbeddedImages(normalized.getOcrEmbeddedImages() == null ? Boolean.TRUE : normalized.getOcrEmbeddedImages());
|
||||||
|
normalized.setIncludeImageAppendix(normalized.getIncludeImageAppendix() == null ? Boolean.TRUE : normalized.getIncludeImageAppendix());
|
||||||
|
return normalized;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected ParseResponse doParse(XlsxParseRequest request, DocumentAsyncTaskUpdater updater) {
|
||||||
|
ParseResponse response = new ParseResponse();
|
||||||
|
List<ParseResult> results = new ArrayList<ParseResult>();
|
||||||
|
String backend = null;
|
||||||
|
int processedFiles = 0;
|
||||||
|
int totalFiles = request.getFiles().size();
|
||||||
|
|
||||||
|
for (ParseFile file : request.getFiles()) {
|
||||||
|
updateProgress(updater, "extracting", processedFiles, totalFiles, "正在读取工作簿结构");
|
||||||
|
ParseResult result = parseSingleWorkbook(file, request, updater);
|
||||||
|
processedFiles++;
|
||||||
|
if (backend == null) {
|
||||||
|
backend = (String) result.getMetadata().get("ocrBackend");
|
||||||
|
}
|
||||||
|
result.getMetadata().remove("ocrBackend");
|
||||||
|
results.add(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
updateProgress(updater, "assembling", processedFiles, totalFiles, "正在汇总 XLSX 解析结果");
|
||||||
|
response.setBackend(StringUtil.hasText(backend) ? backend : request.getBackend());
|
||||||
|
response.setResults(results);
|
||||||
|
return response;
|
||||||
|
}
|
||||||
|
|
||||||
|
private ParseResult parseSingleWorkbook(ParseFile file, XlsxParseRequest request, DocumentAsyncTaskUpdater updater) {
|
||||||
|
ParseResult aggregate = new ParseResult();
|
||||||
|
aggregate.setFileName(file.getFileName());
|
||||||
|
XlsxParseArtifact artifact = new XlsxParseArtifact();
|
||||||
|
artifact.setWorkbookName(file.getFileName());
|
||||||
|
StringBuilder markdownBuilder = new StringBuilder();
|
||||||
|
String backend = null;
|
||||||
|
|
||||||
|
try (XSSFWorkbook workbook = new XSSFWorkbook(new ByteArrayInputStream(file.getContent()))) {
|
||||||
|
FormulaEvaluator evaluator = workbook.getCreationHelper().createFormulaEvaluator();
|
||||||
|
DataFormatter formatter = new DataFormatter();
|
||||||
|
List<Integer> sheetIndexes = resolveSheetIndexes(workbook, request);
|
||||||
|
int processedSheets = 0;
|
||||||
|
|
||||||
|
for (Integer sheetIndex : sheetIndexes) {
|
||||||
|
XSSFSheet sheet = workbook.getSheetAt(sheetIndex);
|
||||||
|
updateProgress(updater, "extracting", processedSheets, sheetIndexes.size(), "正在读取 Sheet " + sheet.getSheetName());
|
||||||
|
SheetExtraction sheetExtraction = extractSheet(sheet, sheetIndex, formatter, evaluator, request, updater);
|
||||||
|
artifact.getSheets().add(sheetExtraction.sheetArtifact);
|
||||||
|
artifact.getCellImages().addAll(sheetExtraction.imageArtifacts);
|
||||||
|
artifact.getSheetImages().add(sheetExtraction.sheetImagesArtifact);
|
||||||
|
artifact.getMergedRanges().addAll(sheetExtraction.mergedRanges);
|
||||||
|
aggregate.getImages().addAll(sheetExtraction.documentImages);
|
||||||
|
if (markdownBuilder.length() > 0) {
|
||||||
|
markdownBuilder.append("\n\n");
|
||||||
|
}
|
||||||
|
markdownBuilder.append(sheetExtraction.markdown);
|
||||||
|
if (backend == null) {
|
||||||
|
backend = sheetExtraction.ocrBackend;
|
||||||
|
}
|
||||||
|
processedSheets++;
|
||||||
|
}
|
||||||
|
} catch (Exception exception) {
|
||||||
|
throw new IllegalStateException("Failed to parse XLSX file: " + file.getFileName(), exception);
|
||||||
|
}
|
||||||
|
|
||||||
|
aggregate.setMarkdown(markdownBuilder.toString().trim());
|
||||||
|
aggregate.setPlainText(aggregate.getMarkdown());
|
||||||
|
aggregate.getArtifacts().getExtraJsonArtifacts().put("xlsx", artifact);
|
||||||
|
aggregate.getMetadata().put("ocrBackend", backend);
|
||||||
|
return aggregate;
|
||||||
|
}
|
||||||
|
|
||||||
|
private SheetExtraction extractSheet(XSSFSheet sheet,
|
||||||
|
int sheetIndex,
|
||||||
|
DataFormatter formatter,
|
||||||
|
FormulaEvaluator evaluator,
|
||||||
|
XlsxParseRequest request,
|
||||||
|
DocumentAsyncTaskUpdater updater) {
|
||||||
|
SheetExtraction extraction = new SheetExtraction();
|
||||||
|
extraction.sheetArtifact = new XlsxSheetArtifact();
|
||||||
|
extraction.sheetArtifact.setSheetName(sheet.getSheetName());
|
||||||
|
extraction.sheetArtifact.setSheetIndex(sheetIndex);
|
||||||
|
extraction.sheetArtifact.setHidden(Boolean.valueOf(sheet.getWorkbook().isSheetHidden(sheetIndex)
|
||||||
|
|| sheet.getWorkbook().isSheetVeryHidden(sheetIndex)));
|
||||||
|
extraction.sheetImagesArtifact = new XlsxSheetImagesArtifact();
|
||||||
|
extraction.sheetImagesArtifact.setSheetName(sheet.getSheetName());
|
||||||
|
extraction.sheetImagesArtifact.setSheetIndex(sheetIndex);
|
||||||
|
|
||||||
|
Map<String, List<XlsxCellImageArtifact>> imagesByCell = new LinkedHashMap<String, List<XlsxCellImageArtifact>>();
|
||||||
|
List<SheetImageExtraction> sheetImages = extractImages(sheet, sheetIndex, request, updater);
|
||||||
|
List<XlsxCellImageArtifact> imageArtifacts = new ArrayList<XlsxCellImageArtifact>();
|
||||||
|
for (SheetImageExtraction sheetImage : sheetImages) {
|
||||||
|
XlsxCellImageArtifact imageArtifact = sheetImage.imageArtifact;
|
||||||
|
imageArtifacts.add(imageArtifact);
|
||||||
|
extraction.imageArtifacts.add(imageArtifact);
|
||||||
|
extraction.sheetImagesArtifact.getReferenceKeys().add(imageArtifact.getReferenceKey());
|
||||||
|
extraction.sheetImagesArtifact.getSourcePaths().add(imageArtifact.getSourcePath());
|
||||||
|
String anchorCell = imageArtifact.getAnchorCell();
|
||||||
|
List<XlsxCellImageArtifact> cellImages = imagesByCell.get(anchorCell);
|
||||||
|
if (cellImages == null) {
|
||||||
|
cellImages = new ArrayList<XlsxCellImageArtifact>();
|
||||||
|
imagesByCell.put(anchorCell, cellImages);
|
||||||
|
}
|
||||||
|
cellImages.add(imageArtifact);
|
||||||
|
extraction.documentImages.add(sheetImage.documentImage);
|
||||||
|
}
|
||||||
|
|
||||||
|
int maxRow = resolveMaxRow(sheet, request.getMaxRowsPerSheet());
|
||||||
|
int maxCol = resolveMaxCol(sheet, maxRow, imagesByCell);
|
||||||
|
extraction.sheetArtifact.setRowCount(maxRow + 1);
|
||||||
|
extraction.sheetArtifact.setColumnCount(maxCol);
|
||||||
|
appendSheetHeader(extraction.markdown, sheet.getSheetName());
|
||||||
|
|
||||||
|
if (maxRow < 0 || maxCol <= 0) {
|
||||||
|
extraction.markdown.append("_empty sheet_");
|
||||||
|
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
|
||||||
|
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
|
||||||
|
}
|
||||||
|
return extraction;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<List<String>> markdownRows = new ArrayList<List<String>>();
|
||||||
|
for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) {
|
||||||
|
org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex);
|
||||||
|
XlsxRowArtifact rowArtifact = new XlsxRowArtifact();
|
||||||
|
rowArtifact.setRowIndex(rowIndex);
|
||||||
|
List<String> rowValues = new ArrayList<String>();
|
||||||
|
for (int colIndex = 0; colIndex < maxCol; colIndex++) {
|
||||||
|
String cellRef = new CellReference(rowIndex, colIndex).formatAsString();
|
||||||
|
String cellText = readCellText(row, colIndex, formatter, evaluator);
|
||||||
|
List<XlsxCellImageArtifact> cellImages = imagesByCell.get(cellRef);
|
||||||
|
String displayValue = mergeDisplayValue(cellText, cellImages);
|
||||||
|
rowValues.add(escapeMarkdown(displayValue));
|
||||||
|
|
||||||
|
XlsxCellArtifact cellArtifact = new XlsxCellArtifact();
|
||||||
|
cellArtifact.setRowIndex(rowIndex);
|
||||||
|
cellArtifact.setColumnIndex(colIndex);
|
||||||
|
cellArtifact.setCellRef(cellRef);
|
||||||
|
cellArtifact.setText(cellText);
|
||||||
|
if (cellImages != null) {
|
||||||
|
List<String> imageKeys = new ArrayList<String>();
|
||||||
|
for (XlsxCellImageArtifact cellImage : cellImages) {
|
||||||
|
imageKeys.add(cellImage.getReferenceKey());
|
||||||
|
}
|
||||||
|
cellArtifact.setImageKeys(imageKeys);
|
||||||
|
}
|
||||||
|
rowArtifact.getCells().add(cellArtifact);
|
||||||
|
}
|
||||||
|
extraction.sheetArtifact.getRows().add(rowArtifact);
|
||||||
|
markdownRows.add(rowValues);
|
||||||
|
}
|
||||||
|
|
||||||
|
appendMarkdownTable(extraction.markdown, markdownRows);
|
||||||
|
extraction.mergedRanges.addAll(extractMergedRanges(sheet));
|
||||||
|
if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) {
|
||||||
|
appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts);
|
||||||
|
}
|
||||||
|
return extraction;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<SheetImageExtraction> extractImages(XSSFSheet sheet,
|
||||||
|
int sheetIndex,
|
||||||
|
XlsxParseRequest request,
|
||||||
|
DocumentAsyncTaskUpdater updater) {
|
||||||
|
List<SheetImageExtraction> images = new ArrayList<SheetImageExtraction>();
|
||||||
|
XSSFDrawing drawing = sheet.getDrawingPatriarch();
|
||||||
|
if (drawing == null) {
|
||||||
|
return images;
|
||||||
|
}
|
||||||
|
String sheetKey = buildSheetKey(sheet.getSheetName(), sheetIndex);
|
||||||
|
int imageIndex = 0;
|
||||||
|
for (XSSFShape shape : drawing.getShapes()) {
|
||||||
|
if (!(shape instanceof XSSFPicture)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
imageIndex++;
|
||||||
|
XSSFPicture picture = (XSSFPicture) shape;
|
||||||
|
XSSFClientAnchor anchor = picture.getPreferredSize();
|
||||||
|
if (anchor == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
XSSFPictureData pictureData = picture.getPictureData();
|
||||||
|
String extension = pictureData == null || !StringUtil.hasText(pictureData.suggestFileExtension())
|
||||||
|
? "png"
|
||||||
|
: pictureData.suggestFileExtension();
|
||||||
|
String imageName = buildImageName(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex);
|
||||||
|
String sourcePath = buildImageSourcePath(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex, extension);
|
||||||
|
|
||||||
|
XlsxCellImageArtifact imageArtifact = new XlsxCellImageArtifact();
|
||||||
|
imageArtifact.setSheetName(sheet.getSheetName());
|
||||||
|
imageArtifact.setAnchorCell(new CellReference(anchor.getRow1(), anchor.getCol1()).formatAsString());
|
||||||
|
imageArtifact.setFromRow(anchor.getRow1());
|
||||||
|
imageArtifact.setFromCol((int) anchor.getCol1());
|
||||||
|
imageArtifact.setToRow(anchor.getRow2());
|
||||||
|
imageArtifact.setToCol((int) anchor.getCol2());
|
||||||
|
imageArtifact.setName(imageName);
|
||||||
|
imageArtifact.setReferenceKey(imageName);
|
||||||
|
imageArtifact.setSourcePath(sourcePath);
|
||||||
|
if (Boolean.TRUE.equals(request.getOcrEmbeddedImages()) && pictureData != null) {
|
||||||
|
updateProgress(updater, "ocr", imageIndex - 1, drawing.getShapes().size(), "正在识别 Sheet " + sheet.getSheetName() + " 中的图片");
|
||||||
|
imageArtifact.setOcrText(parseImageOcr(pictureData.getData(), extension, request, imageName));
|
||||||
|
}
|
||||||
|
DocumentImage documentImage = new DocumentImage();
|
||||||
|
documentImage.setName(imageName);
|
||||||
|
documentImage.setSourcePath(sourcePath);
|
||||||
|
documentImage.setMimeType(detectImageMimeType(sourcePath));
|
||||||
|
documentImage.setContent(pictureData == null ? null : pictureData.getData());
|
||||||
|
|
||||||
|
SheetImageExtraction sheetImage = new SheetImageExtraction();
|
||||||
|
sheetImage.imageArtifact = imageArtifact;
|
||||||
|
sheetImage.documentImage = documentImage;
|
||||||
|
images.add(sheetImage);
|
||||||
|
}
|
||||||
|
return images;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String parseImageOcr(byte[] imageBytes, String extension, XlsxParseRequest request, String imageName) {
|
||||||
|
ParseRequest imageRequest = new ParseRequest();
|
||||||
|
imageRequest.addFile(ParseFile.of(imageName + "." + extension, imageBytes, "image/" + extension));
|
||||||
|
imageRequest.setBackend(request.getBackend());
|
||||||
|
imageRequest.setLanguages(request.getLanguages());
|
||||||
|
imageRequest.setReturnMarkdown(true);
|
||||||
|
imageRequest.setReturnMiddleJson(false);
|
||||||
|
imageRequest.setReturnContentList(false);
|
||||||
|
imageRequest.setReturnModelOutput(false);
|
||||||
|
imageRequest.setReturnImages(false);
|
||||||
|
ParseResponse response = mapper.toParseResponse(client.parse(imageRequest));
|
||||||
|
if (response.getResults().isEmpty()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
ParseResult result = response.getResults().get(0);
|
||||||
|
return StringUtil.hasText(result.getMarkdown()) ? result.getMarkdown() : result.getPlainText();
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Integer> resolveSheetIndexes(XSSFWorkbook workbook, XlsxParseRequest request) {
|
||||||
|
List<Integer> indexes = new ArrayList<Integer>();
|
||||||
|
for (int index = 0; index < workbook.getNumberOfSheets(); index++) {
|
||||||
|
String sheetName = workbook.getSheetName(index);
|
||||||
|
if (!Boolean.TRUE.equals(request.getIncludeHiddenSheets())
|
||||||
|
&& (workbook.isSheetHidden(index) || workbook.isSheetVeryHidden(index))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (request.getSheetNames() != null && !request.getSheetNames().isEmpty()
|
||||||
|
&& !request.getSheetNames().contains(sheetName)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
indexes.add(index);
|
||||||
|
}
|
||||||
|
return indexes;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int resolveMaxRow(XSSFSheet sheet, Integer maxRowsPerSheet) {
|
||||||
|
int lastRow = sheet.getLastRowNum();
|
||||||
|
if (lastRow < 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (maxRowsPerSheet == null || maxRowsPerSheet <= 0) {
|
||||||
|
return lastRow;
|
||||||
|
}
|
||||||
|
return Math.min(lastRow, maxRowsPerSheet - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int resolveMaxCol(XSSFSheet sheet, int maxRow, Map<String, List<XlsxCellImageArtifact>> imagesByCell) {
|
||||||
|
int maxCol = 0;
|
||||||
|
for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) {
|
||||||
|
org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex);
|
||||||
|
if (row != null && row.getLastCellNum() > maxCol) {
|
||||||
|
maxCol = row.getLastCellNum();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (String cellRef : imagesByCell.keySet()) {
|
||||||
|
CellReference reference = new CellReference(cellRef);
|
||||||
|
if (reference.getCol() + 1 > maxCol) {
|
||||||
|
maxCol = reference.getCol() + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return maxCol;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String readCellText(org.apache.poi.ss.usermodel.Row row, int colIndex, DataFormatter formatter, FormulaEvaluator evaluator) {
|
||||||
|
if (row == null) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
org.apache.poi.ss.usermodel.Cell cell = row.getCell(colIndex);
|
||||||
|
if (cell == null) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
return formatter.formatCellValue(cell, evaluator);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String mergeDisplayValue(String cellText, List<XlsxCellImageArtifact> cellImages) {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
if (StringUtil.hasText(cellText)) {
|
||||||
|
builder.append(cellText.trim());
|
||||||
|
}
|
||||||
|
if (cellImages != null && !cellImages.isEmpty()) {
|
||||||
|
for (XlsxCellImageArtifact cellImage : cellImages) {
|
||||||
|
if (builder.length() > 0) {
|
||||||
|
builder.append('\n');
|
||||||
|
}
|
||||||
|
builder.append("[IMG:").append(cellImage.getReferenceKey()).append(']');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void appendSheetHeader(StringBuilder markdownBuilder, String sheetName) {
|
||||||
|
markdownBuilder.append("# ").append(sheetName).append("\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void appendMarkdownTable(StringBuilder markdownBuilder, List<List<String>> rows) {
|
||||||
|
if (rows.isEmpty()) {
|
||||||
|
markdownBuilder.append("_empty sheet_");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
List<String> header = rows.get(0);
|
||||||
|
markdownBuilder.append("| ").append(joinCells(header)).append(" |\n");
|
||||||
|
markdownBuilder.append("|");
|
||||||
|
for (int index = 0; index < header.size(); index++) {
|
||||||
|
markdownBuilder.append(" --- |");
|
||||||
|
}
|
||||||
|
markdownBuilder.append("\n");
|
||||||
|
for (int rowIndex = 1; rowIndex < rows.size(); rowIndex++) {
|
||||||
|
markdownBuilder.append("| ").append(joinCells(rows.get(rowIndex))).append(" |\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void appendImageAppendix(StringBuilder markdownBuilder,
|
||||||
|
String sheetName,
|
||||||
|
List<XlsxCellImageArtifact> imageArtifacts) {
|
||||||
|
markdownBuilder.append("\n## ").append(sheetName).append(" 图片说明\n\n");
|
||||||
|
for (XlsxCellImageArtifact imageArtifact : imageArtifacts) {
|
||||||
|
markdownBuilder.append("
|
||||||
|
.append(imageArtifact.getSourcePath())
|
||||||
|
.append(")\n\n");
|
||||||
|
markdownBuilder.append("- 占位符:[IMG:")
|
||||||
|
.append(imageArtifact.getReferenceKey())
|
||||||
|
.append("]\n");
|
||||||
|
markdownBuilder.append("- 锚点:")
|
||||||
|
.append(imageArtifact.getAnchorCell())
|
||||||
|
.append("\n");
|
||||||
|
markdownBuilder.append("- OCR:")
|
||||||
|
.append(StringUtil.hasText(imageArtifact.getOcrText()) ? imageArtifact.getOcrText() : "")
|
||||||
|
.append("\n\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> extractMergedRanges(XSSFSheet sheet) {
|
||||||
|
List<String> mergedRanges = new ArrayList<String>();
|
||||||
|
for (int index = 0; index < sheet.getNumMergedRegions(); index++) {
|
||||||
|
mergedRanges.add(sheet.getMergedRegion(index).formatAsString());
|
||||||
|
}
|
||||||
|
return mergedRanges;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String joinCells(List<String> cells) {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
for (int index = 0; index < cells.size(); index++) {
|
||||||
|
if (index > 0) {
|
||||||
|
builder.append(" | ");
|
||||||
|
}
|
||||||
|
builder.append(cells.get(index));
|
||||||
|
}
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String escapeMarkdown(String text) {
|
||||||
|
if (!StringUtil.hasText(text)) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
return text.replace("|", "\\|").replace("\r", " ").replace("\n", "<br/>");
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildImageName(String sheetKey, int rowIndex, int colIndex, int imageIndex) {
|
||||||
|
return sheetKey + "-r" + (rowIndex + 1) + "c" + (colIndex + 1) + "-" + formatIndex(imageIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildImageSourcePath(String sheetKey, int rowIndex, int colIndex, int imageIndex, String extension) {
|
||||||
|
return "images/" + sheetKey + "/r" + (rowIndex + 1) + "c" + (colIndex + 1) + "-" + formatIndex(imageIndex) + "." + extension;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String buildSheetKey(String sheetName, int sheetIndex) {
|
||||||
|
if (!StringUtil.hasText(sheetName)) {
|
||||||
|
return "sheet-" + formatIndex(sheetIndex + 1);
|
||||||
|
}
|
||||||
|
String lowerCaseName = sheetName.toLowerCase(Locale.ROOT);
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
for (int index = 0; index < lowerCaseName.length(); index++) {
|
||||||
|
char character = lowerCaseName.charAt(index);
|
||||||
|
if ((character >= 'a' && character <= 'z') || (character >= '0' && character <= '9')) {
|
||||||
|
builder.append(character);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (builder.length() > 0 && builder.charAt(builder.length() - 1) != '-') {
|
||||||
|
builder.append('-');
|
||||||
|
}
|
||||||
|
builder.append('u').append(String.format(Locale.ROOT, "%04x", (int) character)).append('-');
|
||||||
|
}
|
||||||
|
String normalized = builder.toString();
|
||||||
|
while (normalized.startsWith("-")) {
|
||||||
|
normalized = normalized.substring(1);
|
||||||
|
}
|
||||||
|
while (normalized.endsWith("-")) {
|
||||||
|
normalized = normalized.substring(0, normalized.length() - 1);
|
||||||
|
}
|
||||||
|
return StringUtil.hasText(normalized) ? normalized : "sheet-" + formatIndex(sheetIndex + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String formatIndex(int index) {
|
||||||
|
int displayIndex = index <= 0 ? 1 : index;
|
||||||
|
if (displayIndex < 10) {
|
||||||
|
return "00" + displayIndex;
|
||||||
|
}
|
||||||
|
if (displayIndex < 100) {
|
||||||
|
return "0" + displayIndex;
|
||||||
|
}
|
||||||
|
return String.valueOf(displayIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String detectImageMimeType(String path) {
|
||||||
|
if (!StringUtil.hasText(path)) {
|
||||||
|
return "application/octet-stream";
|
||||||
|
}
|
||||||
|
String mimeType = URLConnection.guessContentTypeFromName(path);
|
||||||
|
if (StringUtil.hasText(mimeType)) {
|
||||||
|
return mimeType;
|
||||||
|
}
|
||||||
|
String lowerCasePath = path.toLowerCase(Locale.ROOT);
|
||||||
|
if (lowerCasePath.endsWith(".jpg") || lowerCasePath.endsWith(".jpeg")) {
|
||||||
|
return "image/jpeg";
|
||||||
|
}
|
||||||
|
if (lowerCasePath.endsWith(".png")) {
|
||||||
|
return "image/png";
|
||||||
|
}
|
||||||
|
if (lowerCasePath.endsWith(".gif")) {
|
||||||
|
return "image/gif";
|
||||||
|
}
|
||||||
|
if (lowerCasePath.endsWith(".bmp")) {
|
||||||
|
return "image/bmp";
|
||||||
|
}
|
||||||
|
if (lowerCasePath.endsWith(".webp")) {
|
||||||
|
return "image/webp";
|
||||||
|
}
|
||||||
|
return "application/octet-stream";
|
||||||
|
}
|
||||||
|
|
||||||
|
private void updateProgress(DocumentAsyncTaskUpdater updater,
|
||||||
|
String stage,
|
||||||
|
int processedItems,
|
||||||
|
int totalItems,
|
||||||
|
String message) {
|
||||||
|
if (updater == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int safeTotal = totalItems <= 0 ? 1 : totalItems;
|
||||||
|
int percent = (int) Math.min(99, Math.round(processedItems * 100.0d / safeTotal));
|
||||||
|
updater.update(stage, percent, processedItems, totalItems, message);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static DocumentAsyncTaskManager defaultTaskManager() {
|
||||||
|
DocumentAsyncTaskRepository repository = new InMemoryDocumentAsyncTaskRepository();
|
||||||
|
ExecutorService executorService = Executors.newFixedThreadPool(2);
|
||||||
|
return new DocumentAsyncTaskManager(repository, executorService);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class SheetExtraction {
|
||||||
|
|
||||||
|
private final StringBuilder markdown = new StringBuilder();
|
||||||
|
private final List<XlsxCellImageArtifact> imageArtifacts = new ArrayList<XlsxCellImageArtifact>();
|
||||||
|
private final List<DocumentImage> documentImages = new ArrayList<DocumentImage>();
|
||||||
|
private final List<String> mergedRanges = new ArrayList<String>();
|
||||||
|
private XlsxSheetArtifact sheetArtifact;
|
||||||
|
private XlsxSheetImagesArtifact sheetImagesArtifact;
|
||||||
|
private String ocrBackend;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class SheetImageExtraction {
|
||||||
|
|
||||||
|
private XlsxCellImageArtifact imageArtifact;
|
||||||
|
private DocumentImage documentImage;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
package com.easyagents.document.xlsx.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Cell artifact: the position, reference, text and image keys of one cell.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxCellArtifact {

    private Integer rowIndex;
    private Integer columnIndex;
    private String cellRef;
    private String text;
    private List<String> imageKeys = new ArrayList<String>();

    /** @return the row index, or {@code null} when unset */
    public Integer getRowIndex() {
        return this.rowIndex;
    }

    /** @param rowIndex the row index to store */
    public void setRowIndex(Integer rowIndex) {
        this.rowIndex = rowIndex;
    }

    /** @return the column index, or {@code null} when unset */
    public Integer getColumnIndex() {
        return this.columnIndex;
    }

    /** @param columnIndex the column index to store */
    public void setColumnIndex(Integer columnIndex) {
        this.columnIndex = columnIndex;
    }

    /** @return the cell reference string, or {@code null} when unset */
    public String getCellRef() {
        return this.cellRef;
    }

    /** @param cellRef the cell reference string to store */
    public void setCellRef(String cellRef) {
        this.cellRef = cellRef;
    }

    /** @return the cell text, or {@code null} when unset */
    public String getText() {
        return this.text;
    }

    /** @param text the cell text to store */
    public void setText(String text) {
        this.text = text;
    }

    /** @return the image keys; never {@code null} */
    public List<String> getImageKeys() {
        return this.imageKeys;
    }

    /**
     * Stores the given list itself (no copy is made).
     *
     * @param imageKeys the image keys; {@code null} is replaced by a new empty list
     */
    public void setImageKeys(List<String> imageKeys) {
        if (imageKeys == null) {
            this.imageKeys = new ArrayList<String>();
            return;
        }
        this.imageKeys = imageKeys;
    }
}
|
||||||
@@ -0,0 +1,101 @@
|
|||||||
|
package com.easyagents.document.xlsx.model;
|
||||||
|
|
||||||
|
/**
 * Cell image artifact: where an image sits in a sheet and how it is referenced.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxCellImageArtifact {

    // Location of the image.
    private String sheetName;
    private String anchorCell;
    private Integer fromRow;
    private Integer fromCol;
    private Integer toRow;
    private Integer toCol;

    // Identity and content of the image.
    private String name;
    private String referenceKey;
    private String sourcePath;
    private String ocrText;

    /** @return the sheet name, or {@code null} when unset */
    public String getSheetName() {
        return this.sheetName;
    }

    /** @param sheetName the sheet name to store */
    public void setSheetName(String sheetName) {
        this.sheetName = sheetName;
    }

    /** @return the anchor cell, or {@code null} when unset */
    public String getAnchorCell() {
        return this.anchorCell;
    }

    /** @param anchorCell the anchor cell to store */
    public void setAnchorCell(String anchorCell) {
        this.anchorCell = anchorCell;
    }

    /** @return the starting row, or {@code null} when unset */
    public Integer getFromRow() {
        return this.fromRow;
    }

    /** @param fromRow the starting row to store */
    public void setFromRow(Integer fromRow) {
        this.fromRow = fromRow;
    }

    /** @return the starting column, or {@code null} when unset */
    public Integer getFromCol() {
        return this.fromCol;
    }

    /** @param fromCol the starting column to store */
    public void setFromCol(Integer fromCol) {
        this.fromCol = fromCol;
    }

    /** @return the ending row, or {@code null} when unset */
    public Integer getToRow() {
        return this.toRow;
    }

    /** @param toRow the ending row to store */
    public void setToRow(Integer toRow) {
        this.toRow = toRow;
    }

    /** @return the ending column, or {@code null} when unset */
    public Integer getToCol() {
        return this.toCol;
    }

    /** @param toCol the ending column to store */
    public void setToCol(Integer toCol) {
        this.toCol = toCol;
    }

    /** @return the image name, or {@code null} when unset */
    public String getName() {
        return this.name;
    }

    /** @param name the image name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the reference key, or {@code null} when unset */
    public String getReferenceKey() {
        return this.referenceKey;
    }

    /** @param referenceKey the reference key to store */
    public void setReferenceKey(String referenceKey) {
        this.referenceKey = referenceKey;
    }

    /** @return the source path, or {@code null} when unset */
    public String getSourcePath() {
        return this.sourcePath;
    }

    /** @param sourcePath the source path to store */
    public void setSourcePath(String sourcePath) {
        this.sourcePath = sourcePath;
    }

    /** @return the OCR text, or {@code null} when unset */
    public String getOcrText() {
        return this.ocrText;
    }

    /** @param ocrText the OCR text to store */
    public void setOcrText(String ocrText) {
        this.ocrText = ocrText;
    }
}
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
package com.easyagents.document.xlsx.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XLSX 结构化工件。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class XlsxParseArtifact {
|
||||||
|
|
||||||
|
private String workbookName;
|
||||||
|
private List<XlsxSheetArtifact> sheets = new ArrayList<XlsxSheetArtifact>();
|
||||||
|
private List<XlsxSheetImagesArtifact> sheetImages = new ArrayList<XlsxSheetImagesArtifact>();
|
||||||
|
private List<String> mergedRanges = new ArrayList<String>();
|
||||||
|
private List<XlsxCellImageArtifact> cellImages = new ArrayList<XlsxCellImageArtifact>();
|
||||||
|
|
||||||
|
public String getWorkbookName() {
|
||||||
|
return workbookName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setWorkbookName(String workbookName) {
|
||||||
|
this.workbookName = workbookName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<XlsxSheetArtifact> getSheets() {
|
||||||
|
return sheets;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSheets(List<XlsxSheetArtifact> sheets) {
|
||||||
|
this.sheets = sheets == null ? new ArrayList<XlsxSheetArtifact>() : sheets;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<XlsxSheetImagesArtifact> getSheetImages() {
|
||||||
|
return sheetImages;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSheetImages(List<XlsxSheetImagesArtifact> sheetImages) {
|
||||||
|
this.sheetImages = sheetImages == null ? new ArrayList<XlsxSheetImagesArtifact>() : sheetImages;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getMergedRanges() {
|
||||||
|
return mergedRanges;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMergedRanges(List<String> mergedRanges) {
|
||||||
|
this.mergedRanges = mergedRanges == null ? new ArrayList<String>() : mergedRanges;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<XlsxCellImageArtifact> getCellImages() {
|
||||||
|
return cellImages;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCellImages(List<XlsxCellImageArtifact> cellImages) {
|
||||||
|
this.cellImages = cellImages == null ? new ArrayList<XlsxCellImageArtifact>() : cellImages;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
package com.easyagents.document.xlsx.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 行工件。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class XlsxRowArtifact {
|
||||||
|
|
||||||
|
private Integer rowIndex;
|
||||||
|
private List<XlsxCellArtifact> cells = new ArrayList<XlsxCellArtifact>();
|
||||||
|
|
||||||
|
public Integer getRowIndex() {
|
||||||
|
return rowIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRowIndex(Integer rowIndex) {
|
||||||
|
this.rowIndex = rowIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<XlsxCellArtifact> getCells() {
|
||||||
|
return cells;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCells(List<XlsxCellArtifact> cells) {
|
||||||
|
this.cells = cells == null ? new ArrayList<XlsxCellArtifact>() : cells;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,68 @@
|
|||||||
|
package com.easyagents.document.xlsx.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sheet 工件。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class XlsxSheetArtifact {
|
||||||
|
|
||||||
|
private String sheetName;
|
||||||
|
private Integer sheetIndex;
|
||||||
|
private Boolean hidden;
|
||||||
|
private Integer rowCount;
|
||||||
|
private Integer columnCount;
|
||||||
|
private List<XlsxRowArtifact> rows = new ArrayList<XlsxRowArtifact>();
|
||||||
|
|
||||||
|
public String getSheetName() {
|
||||||
|
return sheetName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSheetName(String sheetName) {
|
||||||
|
this.sheetName = sheetName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getSheetIndex() {
|
||||||
|
return sheetIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSheetIndex(Integer sheetIndex) {
|
||||||
|
this.sheetIndex = sheetIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean getHidden() {
|
||||||
|
return hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHidden(Boolean hidden) {
|
||||||
|
this.hidden = hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getRowCount() {
|
||||||
|
return rowCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRowCount(Integer rowCount) {
|
||||||
|
this.rowCount = rowCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getColumnCount() {
|
||||||
|
return columnCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setColumnCount(Integer columnCount) {
|
||||||
|
this.columnCount = columnCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<XlsxRowArtifact> getRows() {
|
||||||
|
return rows;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRows(List<XlsxRowArtifact> rows) {
|
||||||
|
this.rows = rows == null ? new ArrayList<XlsxRowArtifact>() : rows;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
package com.easyagents.document.xlsx.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Per-sheet image index artifact: the reference keys and source paths of a sheet's images.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class XlsxSheetImagesArtifact {

    private String sheetName;
    private Integer sheetIndex;
    private List<String> referenceKeys = new ArrayList<String>();
    private List<String> sourcePaths = new ArrayList<String>();

    /** @return the sheet name, or {@code null} when unset */
    public String getSheetName() {
        return this.sheetName;
    }

    /** @param sheetName the sheet name to store */
    public void setSheetName(String sheetName) {
        this.sheetName = sheetName;
    }

    /** @return the sheet index, or {@code null} when unset */
    public Integer getSheetIndex() {
        return this.sheetIndex;
    }

    /** @param sheetIndex the sheet index to store */
    public void setSheetIndex(Integer sheetIndex) {
        this.sheetIndex = sheetIndex;
    }

    /** @return the image reference keys; never {@code null} */
    public List<String> getReferenceKeys() {
        return this.referenceKeys;
    }

    /** @param referenceKeys the reference keys; {@code null} is replaced by a new empty list */
    public void setReferenceKeys(List<String> referenceKeys) {
        if (referenceKeys == null) {
            this.referenceKeys = new ArrayList<String>();
            return;
        }
        this.referenceKeys = referenceKeys;
    }

    /** @return the image source paths; never {@code null} */
    public List<String> getSourcePaths() {
        return this.sourcePaths;
    }

    /** @param sourcePaths the source paths; {@code null} is replaced by a new empty list */
    public void setSourcePaths(List<String> sourcePaths) {
        if (sourcePaths == null) {
            this.sourcePaths = new ArrayList<String>();
            return;
        }
        this.sourcePaths = sourcePaths;
    }
}
|
||||||
@@ -0,0 +1,333 @@
|
|||||||
|
package com.easyagents.document.xlsx.mineru;
|
||||||
|
|
||||||
|
import com.alibaba.fastjson2.JSONObject;
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||||
|
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||||
|
import com.easyagents.document.core.mineru.MineruClient;
|
||||||
|
import com.easyagents.document.core.mineru.MineruMapper;
|
||||||
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
|
import com.easyagents.document.core.mineru.MineruResultPayload;
|
||||||
|
import com.easyagents.document.core.entity.ParseArtifacts;
|
||||||
|
import com.easyagents.document.core.entity.ParseFile;
|
||||||
|
import com.easyagents.document.core.entity.ParseResponse;
|
||||||
|
import com.easyagents.document.core.entity.ParseResult;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||||
|
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||||
|
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||||
|
import com.easyagents.document.core.exception.DocumentParseException;
|
||||||
|
import com.easyagents.document.xlsx.model.XlsxParseArtifact;
|
||||||
|
import org.apache.poi.ss.usermodel.ClientAnchor;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFDrawing;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFSheet;
|
||||||
|
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import javax.imageio.ImageIO;
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.util.ArrayDeque;
|
||||||
|
import java.util.Queue;
|
||||||
|
import java.util.concurrent.Executor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XLSX MinerU 服务测试。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
public class MineruXlsxDocumentParseServiceTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldBuildMarkdownAndImageArtifacts() throws Exception {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||||
|
defaultProperties(),
|
||||||
|
client,
|
||||||
|
mapper,
|
||||||
|
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||||
|
);
|
||||||
|
|
||||||
|
XlsxParseRequest request = new XlsxParseRequest();
|
||||||
|
request.addFile(ParseFile.of("demo.xlsx", buildWorkbookBytes()));
|
||||||
|
|
||||||
|
ParseResponse response = service.parse(request);
|
||||||
|
|
||||||
|
Assert.assertEquals(1, response.getResults().size());
|
||||||
|
ParseResult result = response.getResults().get(0);
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("images/sheet1/r2c2-001.png"));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("图片文字描述"));
|
||||||
|
Assert.assertEquals(1, result.getImages().size());
|
||||||
|
Assert.assertNotNull(result.getImages().get(0).getContent());
|
||||||
|
|
||||||
|
XlsxParseArtifact artifact = extractXlsxArtifact(result);
|
||||||
|
Assert.assertEquals("demo.xlsx", artifact.getWorkbookName());
|
||||||
|
Assert.assertEquals(1, artifact.getSheets().size());
|
||||||
|
Assert.assertEquals(1, artifact.getSheetImages().size());
|
||||||
|
Assert.assertEquals(1, artifact.getCellImages().size());
|
||||||
|
Assert.assertEquals("sheet1-r2c2-001", artifact.getCellImages().get(0).getReferenceKey());
|
||||||
|
Assert.assertEquals("images/sheet1/r2c2-001.png", artifact.getCellImages().get(0).getSourcePath());
|
||||||
|
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
|
||||||
|
Assert.assertEquals("images/sheet1/r2c2-001.png", artifact.getSheetImages().get(0).getSourcePaths().get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldKeepImageKeysUniqueForNonAsciiSheetNames() throws Exception {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||||
|
defaultProperties(),
|
||||||
|
client,
|
||||||
|
mapper,
|
||||||
|
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||||
|
);
|
||||||
|
|
||||||
|
XlsxParseRequest request = new XlsxParseRequest();
|
||||||
|
request.addFile(ParseFile.of("unicode-sheets.xlsx", buildWorkbookBytesWithUnicodeSheetNames()));
|
||||||
|
|
||||||
|
ParseResponse response = service.parse(request);
|
||||||
|
ParseResult result = response.getResults().get(0);
|
||||||
|
|
||||||
|
Assert.assertEquals(2, result.getImages().size());
|
||||||
|
Assert.assertNotEquals(result.getImages().get(0).getName(), result.getImages().get(1).getName());
|
||||||
|
Assert.assertNotEquals(result.getImages().get(0).getSourcePath(), result.getImages().get(1).getSourcePath());
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("[IMG:" + result.getImages().get(0).getName() + "]"));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("[IMG:" + result.getImages().get(1).getName() + "]"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldDetectJpegMimeType() throws Exception {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||||
|
defaultProperties(),
|
||||||
|
client,
|
||||||
|
mapper,
|
||||||
|
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||||
|
);
|
||||||
|
|
||||||
|
XlsxParseRequest request = new XlsxParseRequest();
|
||||||
|
request.addFile(ParseFile.of("jpeg.xlsx", buildWorkbookBytesWithJpegImage()));
|
||||||
|
|
||||||
|
ParseResponse response = service.parse(request);
|
||||||
|
ParseResult result = response.getResults().get(0);
|
||||||
|
|
||||||
|
Assert.assertEquals(1, result.getImages().size());
|
||||||
|
Assert.assertEquals("image/jpeg", result.getImages().get(0).getMimeType());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldAppendImageReferenceForImageOnlySheet() throws Exception {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||||
|
defaultProperties(),
|
||||||
|
client,
|
||||||
|
mapper,
|
||||||
|
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor())
|
||||||
|
);
|
||||||
|
|
||||||
|
XlsxParseRequest request = new XlsxParseRequest();
|
||||||
|
request.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet()));
|
||||||
|
|
||||||
|
ParseResponse response = service.parse(request);
|
||||||
|
ParseResult result = response.getResults().get(0);
|
||||||
|
XlsxParseArtifact artifact = extractXlsxArtifact(result);
|
||||||
|
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("# Sheet1"));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("_empty sheet_"));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明"));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains(""));
|
||||||
|
Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]"));
|
||||||
|
Assert.assertEquals(1, result.getImages().size());
|
||||||
|
Assert.assertEquals(1, artifact.getSheetImages().size());
|
||||||
|
Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception {
|
||||||
|
RecordingClient client = new RecordingClient(defaultProperties());
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
ManualExecutor executor = new ManualExecutor();
|
||||||
|
MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService(
|
||||||
|
defaultProperties(),
|
||||||
|
client,
|
||||||
|
mapper,
|
||||||
|
new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executor)
|
||||||
|
);
|
||||||
|
|
||||||
|
XlsxParseRequest request = new XlsxParseRequest();
|
||||||
|
request.addFile(ParseFile.of("async.xlsx", buildWorkbookBytes()));
|
||||||
|
|
||||||
|
ParseTaskStatus submitted = service.submit(request);
|
||||||
|
Assert.assertEquals("queued", submitted.getStatus());
|
||||||
|
Assert.assertEquals("queued", submitted.getCurrentStage());
|
||||||
|
Assert.assertEquals(Integer.valueOf(0), submitted.getProgressPercent());
|
||||||
|
|
||||||
|
ParseTaskInfo queuedInfo = service.queryTaskInfo(submitted.getTaskId());
|
||||||
|
Assert.assertNull(queuedInfo.getResult());
|
||||||
|
try {
|
||||||
|
service.queryResult(submitted.getTaskId());
|
||||||
|
Assert.fail("任务未完成时应抛出异常");
|
||||||
|
} catch (DocumentParseException expected) {
|
||||||
|
Assert.assertTrue(expected.getMessage().contains(submitted.getTaskId()));
|
||||||
|
}
|
||||||
|
|
||||||
|
executor.runNext();
|
||||||
|
|
||||||
|
ParseTaskStatus completed = service.queryTask(submitted.getTaskId());
|
||||||
|
Assert.assertEquals("completed", completed.getStatus());
|
||||||
|
Assert.assertEquals("completed", completed.getCurrentStage());
|
||||||
|
Assert.assertEquals(Integer.valueOf(100), completed.getProgressPercent());
|
||||||
|
Assert.assertEquals("任务执行完成", completed.getStatusMessage());
|
||||||
|
|
||||||
|
ParseTaskInfo completedInfo = service.queryTaskInfo(submitted.getTaskId());
|
||||||
|
Assert.assertNotNull(completedInfo.getResult());
|
||||||
|
Assert.assertTrue(completedInfo.getResult().getResults().get(0).getMarkdown().contains("[IMG:sheet1-r2c2-001]"));
|
||||||
|
Assert.assertEquals(completedInfo.getResult(), service.queryResult(submitted.getTaskId()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] buildWorkbookBytes() throws Exception {
|
||||||
|
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||||
|
XSSFSheet sheet = workbook.createSheet("Sheet1");
|
||||||
|
sheet.createRow(0).createCell(0).setCellValue("商品");
|
||||||
|
sheet.getRow(0).createCell(1).setCellValue("图片");
|
||||||
|
sheet.createRow(1).createCell(0).setCellValue("手机");
|
||||||
|
addPicture(workbook, sheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||||
|
return writeWorkbook(workbook);
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] buildWorkbookBytesWithUnicodeSheetNames() throws Exception {
|
||||||
|
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||||
|
|
||||||
|
XSSFSheet detailSheet = workbook.createSheet("明细");
|
||||||
|
detailSheet.createRow(0).createCell(0).setCellValue("图片");
|
||||||
|
addPicture(workbook, detailSheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||||
|
|
||||||
|
XSSFSheet summarySheet = workbook.createSheet("汇总");
|
||||||
|
summarySheet.createRow(0).createCell(0).setCellValue("图片");
|
||||||
|
addPicture(workbook, summarySheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||||
|
|
||||||
|
return writeWorkbook(workbook);
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] buildWorkbookBytesWithJpegImage() throws Exception {
|
||||||
|
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||||
|
XSSFSheet sheet = workbook.createSheet("Sheet1");
|
||||||
|
sheet.createRow(0).createCell(0).setCellValue("图片");
|
||||||
|
addPicture(workbook, sheet, 1, 1, createImageBytes("jpg"), XSSFWorkbook.PICTURE_TYPE_JPEG);
|
||||||
|
return writeWorkbook(workbook);
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] buildWorkbookBytesWithImageOnlySheet() throws Exception {
|
||||||
|
XSSFWorkbook workbook = new XSSFWorkbook();
|
||||||
|
XSSFSheet sheet = workbook.createSheet("Sheet1");
|
||||||
|
addPicture(workbook, sheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG);
|
||||||
|
return writeWorkbook(workbook);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addPicture(XSSFWorkbook workbook,
|
||||||
|
XSSFSheet sheet,
|
||||||
|
int rowIndex,
|
||||||
|
int colIndex,
|
||||||
|
byte[] imageBytes,
|
||||||
|
int pictureType) {
|
||||||
|
int pictureIndex = workbook.addPicture(imageBytes, pictureType);
|
||||||
|
XSSFDrawing drawing = sheet.createDrawingPatriarch();
|
||||||
|
ClientAnchor anchor = workbook.getCreationHelper().createClientAnchor();
|
||||||
|
anchor.setRow1(rowIndex);
|
||||||
|
anchor.setCol1(colIndex);
|
||||||
|
anchor.setRow2(rowIndex + 1);
|
||||||
|
anchor.setCol2(colIndex + 1);
|
||||||
|
drawing.createPicture(anchor, pictureIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] writeWorkbook(XSSFWorkbook workbook) throws Exception {
|
||||||
|
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
workbook.write(outputStream);
|
||||||
|
workbook.close();
|
||||||
|
return outputStream.toByteArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] createImageBytes(String format) throws Exception {
|
||||||
|
BufferedImage image = new BufferedImage(2, 2, BufferedImage.TYPE_INT_RGB);
|
||||||
|
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
ImageIO.write(image, format, outputStream);
|
||||||
|
return outputStream.toByteArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
private MineruProperties defaultProperties() {
|
||||||
|
MineruProperties properties = new MineruProperties();
|
||||||
|
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Executor directExecutor() {
|
||||||
|
return new Executor() {
|
||||||
|
@Override
|
||||||
|
public void execute(Runnable command) {
|
||||||
|
command.run();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private XlsxParseArtifact extractXlsxArtifact(ParseResult result) {
|
||||||
|
ParseArtifacts artifacts = result.getArtifacts();
|
||||||
|
Assert.assertNotNull(artifacts);
|
||||||
|
Object artifact = artifacts.getExtraJsonArtifacts().get("xlsx");
|
||||||
|
Assert.assertTrue(artifact instanceof XlsxParseArtifact);
|
||||||
|
return (XlsxParseArtifact) artifact;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 手动执行的测试执行器,用于验证异步任务状态流转。
|
||||||
|
*/
|
||||||
|
private static class ManualExecutor implements Executor {
|
||||||
|
|
||||||
|
private final Queue<Runnable> tasks = new ArrayDeque<Runnable>();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void execute(Runnable command) {
|
||||||
|
tasks.offer(command);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void runNext() {
|
||||||
|
Runnable task = tasks.poll();
|
||||||
|
Assert.assertNotNull("应当存在待执行任务", task);
|
||||||
|
task.run();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class RecordingClient extends MineruClient {
|
||||||
|
|
||||||
|
private RecordingClient(MineruProperties properties) {
|
||||||
|
super(properties, new MineruMapper(properties));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MineruResultPayload parse(com.easyagents.document.core.entity.ParseRequest request) {
|
||||||
|
return new MineruMapper(testProperties()).toResultPayload(syncPayload());
|
||||||
|
}
|
||||||
|
|
||||||
|
private JSONObject syncPayload() {
|
||||||
|
JSONObject payload = new JSONObject();
|
||||||
|
payload.put("backend", "vlm-http-client");
|
||||||
|
payload.put("version", "3.0.9");
|
||||||
|
JSONObject result = new JSONObject();
|
||||||
|
result.put("md_content", "图片文字描述");
|
||||||
|
JSONObject results = new JSONObject();
|
||||||
|
results.put("image", result);
|
||||||
|
payload.put("results", results);
|
||||||
|
return payload;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static MineruProperties testProperties() {
|
||||||
|
MineruProperties properties = new MineruProperties();
|
||||||
|
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||||
|
return properties;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -17,5 +17,7 @@
|
|||||||
<modules>
|
<modules>
|
||||||
<module>easy-agents-document-core</module>
|
<module>easy-agents-document-core</module>
|
||||||
<module>easy-agents-document-pdf</module>
|
<module>easy-agents-document-pdf</module>
|
||||||
|
<module>easy-agents-document-pptx</module>
|
||||||
|
<module>easy-agents-document-xlsx</module>
|
||||||
</modules>
|
</modules>
|
||||||
</project>
|
</project>
|
||||||
|
|||||||
@@ -61,6 +61,16 @@
|
|||||||
<artifactId>easy-agents-document-pdf</artifactId>
|
<artifactId>easy-agents-document-pdf</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document-pptx</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document-xlsx</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.easyagents</groupId>
|
<groupId>com.easyagents</groupId>
|
||||||
<artifactId>easy-agents-rag-ingestion</artifactId>
|
<artifactId>easy-agents-rag-ingestion</artifactId>
|
||||||
|
|||||||
@@ -0,0 +1,119 @@
|
|||||||
|
package com.easyagents.spring.boot.document.mineru;
|
||||||
|
|
||||||
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 通用 MinerU 文档配置。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
@ConfigurationProperties(prefix = "easy-agents.document.mineru")
|
||||||
|
public class CommonMineruDocumentProperties {
|
||||||
|
|
||||||
|
private String baseUrl;
|
||||||
|
private Integer connectTimeoutMs = 3000;
|
||||||
|
private Integer readTimeoutMs = 600000;
|
||||||
|
private Integer writeTimeoutMs = 600000;
|
||||||
|
private Integer pollIntervalMs = 1000;
|
||||||
|
private Integer resultTimeoutMs = 1800000;
|
||||||
|
private String defaultBackend = "vlm-http-client";
|
||||||
|
private String defaultParseMethod = "auto";
|
||||||
|
private List<String> defaultLangList = new ArrayList<String>(Arrays.asList("ch"));
|
||||||
|
private Boolean defaultFormulaEnable = true;
|
||||||
|
private Boolean defaultTableEnable = true;
|
||||||
|
|
||||||
|
public String getBaseUrl() {
|
||||||
|
return baseUrl;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBaseUrl(String baseUrl) {
|
||||||
|
this.baseUrl = baseUrl;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getConnectTimeoutMs() {
|
||||||
|
return connectTimeoutMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setConnectTimeoutMs(Integer connectTimeoutMs) {
|
||||||
|
this.connectTimeoutMs = connectTimeoutMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getReadTimeoutMs() {
|
||||||
|
return readTimeoutMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setReadTimeoutMs(Integer readTimeoutMs) {
|
||||||
|
this.readTimeoutMs = readTimeoutMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getWriteTimeoutMs() {
|
||||||
|
return writeTimeoutMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setWriteTimeoutMs(Integer writeTimeoutMs) {
|
||||||
|
this.writeTimeoutMs = writeTimeoutMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getPollIntervalMs() {
|
||||||
|
return pollIntervalMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPollIntervalMs(Integer pollIntervalMs) {
|
||||||
|
this.pollIntervalMs = pollIntervalMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getResultTimeoutMs() {
|
||||||
|
return resultTimeoutMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setResultTimeoutMs(Integer resultTimeoutMs) {
|
||||||
|
this.resultTimeoutMs = resultTimeoutMs;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getDefaultBackend() {
|
||||||
|
return defaultBackend;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDefaultBackend(String defaultBackend) {
|
||||||
|
this.defaultBackend = defaultBackend;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getDefaultParseMethod() {
|
||||||
|
return defaultParseMethod;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDefaultParseMethod(String defaultParseMethod) {
|
||||||
|
this.defaultParseMethod = defaultParseMethod;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getDefaultLangList() {
|
||||||
|
return defaultLangList;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDefaultLangList(List<String> defaultLangList) {
|
||||||
|
this.defaultLangList = defaultLangList == null
|
||||||
|
? new ArrayList<String>(Arrays.asList("ch"))
|
||||||
|
: defaultLangList;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean getDefaultFormulaEnable() {
|
||||||
|
return defaultFormulaEnable;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) {
|
||||||
|
this.defaultFormulaEnable = defaultFormulaEnable;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Boolean getDefaultTableEnable() {
|
||||||
|
return defaultTableEnable;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDefaultTableEnable(Boolean defaultTableEnable) {
|
||||||
|
this.defaultTableEnable = defaultTableEnable;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,9 +1,11 @@
|
|||||||
package com.easyagents.spring.boot.document.pdf.mineru;
|
package com.easyagents.spring.boot.document.pdf.mineru;
|
||||||
|
|
||||||
import com.easyagents.document.core.DocumentParseService;
|
import com.easyagents.document.core.DocumentParseService;
|
||||||
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
import com.easyagents.document.pdf.PdfDocumentParseService;
|
import com.easyagents.document.pdf.PdfDocumentParseService;
|
||||||
import com.easyagents.document.pdf.mineru.MineruPdfDocumentParseService;
|
import com.easyagents.document.pdf.mineru.MineruPdfDocumentParseService;
|
||||||
import com.easyagents.document.pdf.mineru.MineruProperties;
|
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
|
||||||
|
import com.easyagents.core.util.StringUtil;
|
||||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
||||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
||||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||||
@@ -20,7 +22,7 @@ import org.springframework.context.annotation.Configuration;
|
|||||||
@Configuration(proxyBeanMethods = false)
|
@Configuration(proxyBeanMethods = false)
|
||||||
@ConditionalOnClass(MineruPdfDocumentParseService.class)
|
@ConditionalOnClass(MineruPdfDocumentParseService.class)
|
||||||
@ConditionalOnProperty(prefix = "easy-agents.document.pdf", name = "provider", havingValue = "mineru")
|
@ConditionalOnProperty(prefix = "easy-agents.document.pdf", name = "provider", havingValue = "mineru")
|
||||||
@EnableConfigurationProperties(MineruDocumentProperties.class)
|
@EnableConfigurationProperties({MineruDocumentProperties.class, CommonMineruDocumentProperties.class})
|
||||||
public class MineruPdfAutoConfiguration {
|
public class MineruPdfAutoConfiguration {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -31,8 +33,9 @@ public class MineruPdfAutoConfiguration {
|
|||||||
*/
|
*/
|
||||||
@Bean
|
@Bean
|
||||||
@ConditionalOnMissingBean(PdfDocumentParseService.class)
|
@ConditionalOnMissingBean(PdfDocumentParseService.class)
|
||||||
public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties) {
|
public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties,
|
||||||
return new MineruPdfDocumentParseService(toMineruProperties(properties));
|
CommonMineruDocumentProperties commonProperties) {
|
||||||
|
return new MineruPdfDocumentParseService(toMineruProperties(properties, commonProperties));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -47,19 +50,21 @@ public class MineruPdfAutoConfiguration {
|
|||||||
return pdfDocumentParseService;
|
return pdfDocumentParseService;
|
||||||
}
|
}
|
||||||
|
|
||||||
private MineruProperties toMineruProperties(MineruDocumentProperties properties) {
|
private MineruProperties toMineruProperties(MineruDocumentProperties properties,
|
||||||
|
CommonMineruDocumentProperties commonProperties) {
|
||||||
MineruProperties mineruProperties = new MineruProperties();
|
MineruProperties mineruProperties = new MineruProperties();
|
||||||
mineruProperties.setBaseUrl(properties.getBaseUrl());
|
boolean useCommon = commonProperties != null && StringUtil.hasText(commonProperties.getBaseUrl());
|
||||||
mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs());
|
mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : properties.getBaseUrl());
|
||||||
mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs());
|
mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : properties.getConnectTimeoutMs());
|
||||||
mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs());
|
mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : properties.getReadTimeoutMs());
|
||||||
mineruProperties.setPollIntervalMs(properties.getPollIntervalMs());
|
mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : properties.getWriteTimeoutMs());
|
||||||
mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs());
|
mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : properties.getPollIntervalMs());
|
||||||
mineruProperties.setDefaultBackend(properties.getDefaultBackend());
|
mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : properties.getResultTimeoutMs());
|
||||||
mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod());
|
mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : properties.getDefaultBackend());
|
||||||
mineruProperties.setDefaultLangList(properties.getDefaultLangList());
|
mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : properties.getDefaultParseMethod());
|
||||||
mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable());
|
mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : properties.getDefaultLangList());
|
||||||
mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable());
|
mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : properties.getDefaultFormulaEnable());
|
||||||
|
mineruProperties.setDefaultTableEnable(useCommon ? commonProperties.getDefaultTableEnable() : properties.getDefaultTableEnable());
|
||||||
return mineruProperties;
|
return mineruProperties;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,61 @@
|
|||||||
|
package com.easyagents.spring.boot.document.pptx;
|
||||||
|
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||||
|
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||||
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
|
import com.easyagents.document.pptx.PptxDocumentParseService;
|
||||||
|
import com.easyagents.document.pptx.mineru.MineruPptxDocumentParseService;
|
||||||
|
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
|
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||||
|
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||||
|
import org.springframework.context.annotation.Bean;
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MinerU PPTX 自动装配。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
@Configuration(proxyBeanMethods = false)
|
||||||
|
@ConditionalOnClass(MineruPptxDocumentParseService.class)
|
||||||
|
@ConditionalOnProperty(prefix = "easy-agents.document.pptx", name = "enabled", havingValue = "true")
|
||||||
|
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, PptxDocumentProperties.class})
|
||||||
|
public class MineruPptxAutoConfiguration {
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
@ConditionalOnMissingBean(name = "pptxDocumentAsyncTaskManager")
|
||||||
|
public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager(PptxDocumentProperties properties) {
|
||||||
|
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads();
|
||||||
|
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
|
||||||
|
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
@ConditionalOnMissingBean(PptxDocumentParseService.class)
|
||||||
|
public PptxDocumentParseService pptxDocumentParseService(CommonMineruDocumentProperties commonProperties,
|
||||||
|
DocumentAsyncTaskManager pptxDocumentAsyncTaskManager) {
|
||||||
|
return new MineruPptxDocumentParseService(toMineruProperties(commonProperties), pptxDocumentAsyncTaskManager);
|
||||||
|
}
|
||||||
|
|
||||||
|
private MineruProperties toMineruProperties(CommonMineruDocumentProperties properties) {
|
||||||
|
MineruProperties mineruProperties = new MineruProperties();
|
||||||
|
mineruProperties.setBaseUrl(properties.getBaseUrl());
|
||||||
|
mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs());
|
||||||
|
mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs());
|
||||||
|
mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs());
|
||||||
|
mineruProperties.setPollIntervalMs(properties.getPollIntervalMs());
|
||||||
|
mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs());
|
||||||
|
mineruProperties.setDefaultBackend(properties.getDefaultBackend());
|
||||||
|
mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod());
|
||||||
|
mineruProperties.setDefaultLangList(properties.getDefaultLangList());
|
||||||
|
mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable());
|
||||||
|
mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable());
|
||||||
|
return mineruProperties;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
package com.easyagents.spring.boot.document.pptx;
|
||||||
|
|
||||||
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PPTX 文档配置。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
@ConfigurationProperties(prefix = "easy-agents.document.pptx")
|
||||||
|
public class PptxDocumentProperties {
|
||||||
|
|
||||||
|
private Boolean enabled = false;
|
||||||
|
private Integer asyncThreads = 2;
|
||||||
|
|
||||||
|
public Boolean getEnabled() {
|
||||||
|
return enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEnabled(Boolean enabled) {
|
||||||
|
this.enabled = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getAsyncThreads() {
|
||||||
|
return asyncThreads;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAsyncThreads(Integer asyncThreads) {
|
||||||
|
this.asyncThreads = asyncThreads;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,61 @@
|
|||||||
|
package com.easyagents.spring.boot.document.xlsx;
|
||||||
|
|
||||||
|
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||||
|
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
|
||||||
|
import com.easyagents.document.core.mineru.MineruProperties;
|
||||||
|
import com.easyagents.document.xlsx.XlsxDocumentParseService;
|
||||||
|
import com.easyagents.document.xlsx.mineru.MineruXlsxDocumentParseService;
|
||||||
|
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
|
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
|
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
|
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||||
|
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||||
|
import org.springframework.context.annotation.Bean;
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MinerU XLSX 自动装配。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
@Configuration(proxyBeanMethods = false)
|
||||||
|
@ConditionalOnClass(MineruXlsxDocumentParseService.class)
|
||||||
|
@ConditionalOnProperty(prefix = "easy-agents.document.xlsx", name = "enabled", havingValue = "true")
|
||||||
|
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, XlsxDocumentProperties.class})
|
||||||
|
public class MineruXlsxAutoConfiguration {
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
@ConditionalOnMissingBean(name = "xlsxDocumentAsyncTaskManager")
|
||||||
|
public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager(XlsxDocumentProperties properties) {
|
||||||
|
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads();
|
||||||
|
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
|
||||||
|
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
@ConditionalOnMissingBean(XlsxDocumentParseService.class)
|
||||||
|
public XlsxDocumentParseService xlsxDocumentParseService(CommonMineruDocumentProperties commonProperties,
|
||||||
|
DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager) {
|
||||||
|
return new MineruXlsxDocumentParseService(toMineruProperties(commonProperties), xlsxDocumentAsyncTaskManager);
|
||||||
|
}
|
||||||
|
|
||||||
|
private MineruProperties toMineruProperties(CommonMineruDocumentProperties properties) {
|
||||||
|
MineruProperties mineruProperties = new MineruProperties();
|
||||||
|
mineruProperties.setBaseUrl(properties.getBaseUrl());
|
||||||
|
mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs());
|
||||||
|
mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs());
|
||||||
|
mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs());
|
||||||
|
mineruProperties.setPollIntervalMs(properties.getPollIntervalMs());
|
||||||
|
mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs());
|
||||||
|
mineruProperties.setDefaultBackend(properties.getDefaultBackend());
|
||||||
|
mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod());
|
||||||
|
mineruProperties.setDefaultLangList(properties.getDefaultLangList());
|
||||||
|
mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable());
|
||||||
|
mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable());
|
||||||
|
return mineruProperties;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
package com.easyagents.spring.boot.document.xlsx;
|
||||||
|
|
||||||
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* XLSX 文档配置。
|
||||||
|
*
|
||||||
|
* @author Codex
|
||||||
|
* @since 2026-04-16
|
||||||
|
*/
|
||||||
|
@ConfigurationProperties(prefix = "easy-agents.document.xlsx")
|
||||||
|
public class XlsxDocumentProperties {
|
||||||
|
|
||||||
|
private Boolean enabled = false;
|
||||||
|
private Integer asyncThreads = 2;
|
||||||
|
|
||||||
|
public Boolean getEnabled() {
|
||||||
|
return enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEnabled(Boolean enabled) {
|
||||||
|
this.enabled = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer getAsyncThreads() {
|
||||||
|
return asyncThreads;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAsyncThreads(Integer asyncThreads) {
|
||||||
|
this.asyncThreads = asyncThreads;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -9,3 +9,5 @@ com.easyagents.spring.boot.store.elasticsearch.ElasticSearchAutoConfiguration
|
|||||||
com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration
|
com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration
|
||||||
com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration
|
com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration
|
||||||
com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration
|
com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration
|
||||||
|
com.easyagents.spring.boot.document.pptx.MineruPptxAutoConfiguration
|
||||||
|
com.easyagents.spring.boot.document.xlsx.MineruXlsxAutoConfiguration
|
||||||
|
|||||||
@@ -2,8 +2,12 @@ package com.easyagents.spring.boot.autoconfigure;
|
|||||||
|
|
||||||
import com.easyagents.document.core.DocumentParseService;
|
import com.easyagents.document.core.DocumentParseService;
|
||||||
import com.easyagents.document.pdf.PdfDocumentParseService;
|
import com.easyagents.document.pdf.PdfDocumentParseService;
|
||||||
|
import com.easyagents.document.pptx.PptxDocumentParseService;
|
||||||
|
import com.easyagents.document.xlsx.XlsxDocumentParseService;
|
||||||
import com.easyagents.llm.ollama.OllamaChatModel;
|
import com.easyagents.llm.ollama.OllamaChatModel;
|
||||||
|
import com.easyagents.spring.boot.document.pptx.MineruPptxAutoConfiguration;
|
||||||
import com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration;
|
import com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration;
|
||||||
|
import com.easyagents.spring.boot.document.xlsx.MineruXlsxAutoConfiguration;
|
||||||
import com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration;
|
import com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration;
|
||||||
import com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration;
|
import com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration;
|
||||||
import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration;
|
import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration;
|
||||||
@@ -18,7 +22,9 @@ public class StarterConditionalAutoConfigurationTest {
|
|||||||
RagIngestionAutoConfiguration.class,
|
RagIngestionAutoConfiguration.class,
|
||||||
OllamaAutoConfiguration.class,
|
OllamaAutoConfiguration.class,
|
||||||
OpenSearchAutoConfiguration.class,
|
OpenSearchAutoConfiguration.class,
|
||||||
MineruPdfAutoConfiguration.class
|
MineruPdfAutoConfiguration.class,
|
||||||
|
MineruPptxAutoConfiguration.class,
|
||||||
|
MineruXlsxAutoConfiguration.class
|
||||||
);
|
);
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -51,4 +57,19 @@ public class StarterConditionalAutoConfigurationTest {
|
|||||||
Assert.assertNotNull(context.getBean(DocumentParseService.class));
|
Assert.assertNotNull(context.getBean(DocumentParseService.class));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldCreatePptxAndXlsxBeansWhenEnabled() {
|
||||||
|
contextRunner
|
||||||
|
.withPropertyValues(
|
||||||
|
"easy-agents.document.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api",
|
||||||
|
"easy-agents.document.pptx.enabled=true",
|
||||||
|
"easy-agents.document.xlsx.enabled=true"
|
||||||
|
)
|
||||||
|
.run(context -> {
|
||||||
|
Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
|
||||||
|
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
|
||||||
|
Assert.assertFalse(context.containsBean("documentParseService"));
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
12
pom.xml
12
pom.xml
@@ -132,6 +132,18 @@
|
|||||||
<version>${revision}</version>
|
<version>${revision}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document-pptx</artifactId>
|
||||||
|
<version>${revision}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.easyagents</groupId>
|
||||||
|
<artifactId>easy-agents-document-xlsx</artifactId>
|
||||||
|
<version>${revision}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.easyagents</groupId>
|
<groupId>com.easyagents</groupId>
|
||||||
<artifactId>easy-agents-rag-core</artifactId>
|
<artifactId>easy-agents-rag-core</artifactId>
|
||||||
|
|||||||
Reference in New Issue
Block a user