feat: 扩展 Office 文档解析能力
- 重构 document-core 与 MinerU 公共层，补齐 Office 异步任务基础设施
- 新增 PPTX/XLSX 解析模块与 starter 自动装配
- 补充 README 与相关测试覆盖
This commit is contained in:
@@ -24,5 +24,26 @@
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.alibaba.fastjson2</groupId>
|
||||
<artifactId>fastjson2</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.squareup.okhttp3</groupId>
|
||||
<artifactId>okhttp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
package com.easyagents.document.core;
|
||||
|
||||
import com.easyagents.document.core.model.ParseRequest;
|
||||
import com.easyagents.document.core.model.ParseResponse;
|
||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
/**
|
||||
* 统一文档解析服务抽象。
|
||||
@@ -11,7 +11,7 @@ import com.easyagents.document.core.model.ParseTaskStatus;
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
*/
|
||||
public interface DocumentParseService {
|
||||
public interface DocumentParseService<R extends ParseRequest> {
|
||||
|
||||
/**
|
||||
* 同步解析文档并直接返回结果。
|
||||
|
||||
@@ -0,0 +1,173 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.Executor;
|
||||
|
||||
/**
|
||||
* 文档异步任务管理器。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class DocumentAsyncTaskManager {
|
||||
|
||||
private final DocumentAsyncTaskRepository repository;
|
||||
private final Executor executor;
|
||||
|
||||
/**
|
||||
* 创建任务管理器。
|
||||
*
|
||||
* @param repository 任务仓库
|
||||
* @param executor 执行器
|
||||
*/
|
||||
public DocumentAsyncTaskManager(DocumentAsyncTaskRepository repository, Executor executor) {
|
||||
if (repository == null) {
|
||||
throw new IllegalArgumentException("DocumentAsyncTaskRepository must not be null");
|
||||
}
|
||||
if (executor == null) {
|
||||
throw new IllegalArgumentException("Executor must not be null");
|
||||
}
|
||||
this.repository = repository;
|
||||
this.executor = executor;
|
||||
}
|
||||
|
||||
/**
|
||||
* 提交异步任务。
|
||||
*
|
||||
* @param backend 后端标识
|
||||
* @param fileNames 文件名列表
|
||||
* @param runner 任务执行器
|
||||
* @return 初始任务状态
|
||||
*/
|
||||
public ParseTaskStatus submit(String backend, List<String> fileNames, final DocumentAsyncTaskRunner runner) {
|
||||
final String taskId = UUID.randomUUID().toString();
|
||||
final ParseTaskStatus status = new ParseTaskStatus();
|
||||
status.setTaskId(taskId);
|
||||
status.setStatus("queued");
|
||||
status.setBackend(backend);
|
||||
status.setFileNames(fileNames == null ? new ArrayList<String>() : new ArrayList<String>(fileNames));
|
||||
status.setCreatedAt(Instant.now().toString());
|
||||
status.setCurrentStage("queued");
|
||||
status.setProgressPercent(0);
|
||||
status.setProcessedItems(0);
|
||||
status.setTotalItems(fileNames == null ? 0 : fileNames.size());
|
||||
status.setStatusMessage("任务已进入队列");
|
||||
|
||||
final DocumentAsyncTaskRecord record = new DocumentAsyncTaskRecord(status);
|
||||
repository.save(record);
|
||||
|
||||
executor.execute(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
markRunning(record);
|
||||
try {
|
||||
ParseResponse response = runner.run(new RepositoryBackedTaskUpdater(record));
|
||||
ParseTaskStatus completed = record.getStatusSnapshot();
|
||||
completed.setStatus("completed");
|
||||
completed.setCompletedAt(Instant.now().toString());
|
||||
completed.setProgressPercent(100);
|
||||
completed.setCurrentStage("completed");
|
||||
completed.setStatusMessage("任务执行完成");
|
||||
record.setResult(response);
|
||||
record.updateStatus(completed);
|
||||
} catch (Exception exception) {
|
||||
ParseTaskStatus failed = record.getStatusSnapshot();
|
||||
failed.setStatus("failed");
|
||||
failed.setCompletedAt(Instant.now().toString());
|
||||
failed.setCurrentStage("failed");
|
||||
failed.setStatusMessage(exception.getMessage());
|
||||
failed.setError(exception.getMessage());
|
||||
record.updateStatus(failed);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return record.getStatusSnapshot();
|
||||
}
|
||||
|
||||
/**
|
||||
* 查询任务状态。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 任务状态
|
||||
*/
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
return requireRecord(taskId).getStatusSnapshot();
|
||||
}
|
||||
|
||||
/**
|
||||
* 查询任务聚合信息。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 聚合信息
|
||||
*/
|
||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||
return requireRecord(taskId).getTaskInfoSnapshot();
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取任务结果。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 任务结果
|
||||
*/
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
DocumentAsyncTaskRecord record = requireRecord(taskId);
|
||||
ParseTaskStatus status = record.getStatusSnapshot();
|
||||
if (!"completed".equalsIgnoreCase(status.getStatus())) {
|
||||
throw new DocumentParseException("Document async task is not completed: " + taskId);
|
||||
}
|
||||
return record.getResult();
|
||||
}
|
||||
|
||||
private DocumentAsyncTaskRecord requireRecord(String taskId) {
|
||||
if (!StringUtil.hasText(taskId)) {
|
||||
throw new IllegalArgumentException("taskId must not be empty");
|
||||
}
|
||||
DocumentAsyncTaskRecord record = repository.find(taskId);
|
||||
if (record == null) {
|
||||
throw new DocumentParseException("Document async task not found: " + taskId);
|
||||
}
|
||||
return record;
|
||||
}
|
||||
|
||||
private void markRunning(DocumentAsyncTaskRecord record) {
|
||||
ParseTaskStatus status = record.getStatusSnapshot();
|
||||
status.setStatus("preparing");
|
||||
status.setStartedAt(Instant.now().toString());
|
||||
status.setCurrentStage("preparing");
|
||||
status.setProgressPercent(0);
|
||||
status.setStatusMessage("任务开始执行");
|
||||
record.updateStatus(status);
|
||||
}
|
||||
|
||||
private static class RepositoryBackedTaskUpdater implements DocumentAsyncTaskUpdater {
|
||||
|
||||
private final DocumentAsyncTaskRecord record;
|
||||
|
||||
private RepositoryBackedTaskUpdater(DocumentAsyncTaskRecord record) {
|
||||
this.record = record;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void update(String stage, Integer progressPercent, Integer processedItems, Integer totalItems, String statusMessage) {
|
||||
ParseTaskStatus status = record.getStatusSnapshot();
|
||||
status.setStatus("completed".equalsIgnoreCase(stage) ? "completed" : "running");
|
||||
status.setCurrentStage(stage);
|
||||
status.setProgressPercent(progressPercent);
|
||||
status.setProcessedItems(processedItems);
|
||||
status.setTotalItems(totalItems);
|
||||
status.setStatusMessage(statusMessage);
|
||||
record.updateStatus(status);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,101 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
/**
|
||||
* 文档异步任务记录。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class DocumentAsyncTaskRecord {
|
||||
|
||||
private final ParseTaskStatus status;
|
||||
private ParseResponse result;
|
||||
|
||||
/**
|
||||
* 创建任务记录。
|
||||
*
|
||||
* @param status 初始状态
|
||||
*/
|
||||
public DocumentAsyncTaskRecord(ParseTaskStatus status) {
|
||||
this.status = status;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取状态快照。
|
||||
*
|
||||
* @return 状态快照
|
||||
*/
|
||||
public synchronized ParseTaskStatus getStatusSnapshot() {
|
||||
return copyStatus(status);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取聚合信息快照。
|
||||
*
|
||||
* @return 聚合信息
|
||||
*/
|
||||
public synchronized ParseTaskInfo getTaskInfoSnapshot() {
|
||||
ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(status);
|
||||
taskInfo.setResult(result);
|
||||
return taskInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取结果。
|
||||
*
|
||||
* @return 最终结果
|
||||
*/
|
||||
public synchronized ParseResponse getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新任务状态。
|
||||
*
|
||||
* @param newStatus 新状态
|
||||
*/
|
||||
public synchronized void updateStatus(ParseTaskStatus newStatus) {
|
||||
if (newStatus == null) {
|
||||
return;
|
||||
}
|
||||
copyInto(newStatus, status);
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新任务结果。
|
||||
*
|
||||
* @param result 最终结果
|
||||
*/
|
||||
public synchronized void setResult(ParseResponse result) {
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
private ParseTaskStatus copyStatus(ParseTaskStatus source) {
|
||||
ParseTaskStatus copy = new ParseTaskStatus();
|
||||
copyInto(source, copy);
|
||||
return copy;
|
||||
}
|
||||
|
||||
private void copyInto(ParseTaskStatus source, ParseTaskStatus target) {
|
||||
target.setTaskId(source.getTaskId());
|
||||
target.setStatus(source.getStatus());
|
||||
target.setBackend(source.getBackend());
|
||||
target.setFileNames(source.getFileNames());
|
||||
target.setCreatedAt(source.getCreatedAt());
|
||||
target.setStartedAt(source.getStartedAt());
|
||||
target.setCompletedAt(source.getCompletedAt());
|
||||
target.setError(source.getError());
|
||||
target.setStatusUrl(source.getStatusUrl());
|
||||
target.setResultUrl(source.getResultUrl());
|
||||
target.setQueuedAhead(source.getQueuedAhead());
|
||||
target.setProgressPercent(source.getProgressPercent());
|
||||
target.setCurrentStage(source.getCurrentStage());
|
||||
target.setProcessedItems(source.getProcessedItems());
|
||||
target.setTotalItems(source.getTotalItems());
|
||||
target.setStatusMessage(source.getStatusMessage());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
/**
|
||||
* 文档异步任务仓库。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public interface DocumentAsyncTaskRepository {
|
||||
|
||||
/**
|
||||
* 保存任务记录。
|
||||
*
|
||||
* @param record 任务记录
|
||||
*/
|
||||
void save(DocumentAsyncTaskRecord record);
|
||||
|
||||
/**
|
||||
* 获取任务记录。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 任务记录,不存在时返回 {@code null}
|
||||
*/
|
||||
DocumentAsyncTaskRecord find(String taskId);
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
|
||||
/**
|
||||
* 文档异步任务执行器。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public interface DocumentAsyncTaskRunner {
|
||||
|
||||
/**
|
||||
* 执行任务。
|
||||
*
|
||||
* @param updater 状态更新器
|
||||
* @return 解析结果
|
||||
* @throws Exception 执行异常
|
||||
*/
|
||||
ParseResponse run(DocumentAsyncTaskUpdater updater) throws Exception;
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
/**
 * Progress updater for an asynchronous document task.
 *
 * @author Codex
 * @since 2026-04-16
 */
@FunctionalInterface
public interface DocumentAsyncTaskUpdater {

    /**
     * Reports the current state of the task.
     *
     * @param stage           current stage
     * @param progressPercent progress percentage
     * @param processedItems  number of items processed so far
     * @param totalItems      total number of items
     * @param statusMessage   human-readable status description
     */
    void update(String stage, Integer progressPercent, Integer processedItems, Integer totalItems, String statusMessage);
}
|
||||
@@ -0,0 +1,28 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* 基于内存的异步任务仓库。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class InMemoryDocumentAsyncTaskRepository implements DocumentAsyncTaskRepository {
|
||||
|
||||
private final Map<String, DocumentAsyncTaskRecord> records = new ConcurrentHashMap<String, DocumentAsyncTaskRecord>();
|
||||
|
||||
@Override
|
||||
public void save(DocumentAsyncTaskRecord record) {
|
||||
if (record == null || record.getStatusSnapshot() == null || record.getStatusSnapshot().getTaskId() == null) {
|
||||
return;
|
||||
}
|
||||
records.put(record.getStatusSnapshot().getTaskId(), record);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocumentAsyncTaskRecord find(String taskId) {
|
||||
return records.get(taskId);
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -16,6 +16,7 @@ public class DocumentImage {
|
||||
private String mimeType;
|
||||
private String sourcePath;
|
||||
private String dataUrl;
|
||||
private byte[] content;
|
||||
private List<Double> boundingBox = new ArrayList<Double>();
|
||||
private List<String> captions = new ArrayList<String>();
|
||||
private List<String> footnotes = new ArrayList<String>();
|
||||
@@ -60,6 +61,14 @@ public class DocumentImage {
|
||||
this.dataUrl = dataUrl;
|
||||
}
|
||||
|
||||
public byte[] getContent() {
|
||||
return content;
|
||||
}
|
||||
|
||||
public void setContent(byte[] content) {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
public List<Double> getBoundingBox() {
|
||||
return boundingBox;
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -13,12 +13,7 @@ public class ParseRequest {
|
||||
|
||||
private List<ParseFile> files = new ArrayList<ParseFile>();
|
||||
private String backend;
|
||||
private String parseMethod = "auto";
|
||||
private List<String> languages = new ArrayList<String>();
|
||||
private Boolean formulaEnabled = true;
|
||||
private Boolean tableEnabled = true;
|
||||
private Integer startPageIndex = 0;
|
||||
private Integer endPageIndex = 99999;
|
||||
private Boolean returnMarkdown = true;
|
||||
private Boolean returnMiddleJson = true;
|
||||
private Boolean returnContentList = true;
|
||||
@@ -38,6 +33,25 @@ public class ParseRequest {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 复制通用字段到目标请求。
|
||||
*
|
||||
* @param target 目标请求
|
||||
*/
|
||||
public void copyCommonFieldsTo(ParseRequest target) {
|
||||
if (target == null) {
|
||||
return;
|
||||
}
|
||||
target.setFiles(new ArrayList<ParseFile>(getFiles()));
|
||||
target.setBackend(getBackend());
|
||||
target.setLanguages(new ArrayList<String>(getLanguages()));
|
||||
target.setReturnMarkdown(getReturnMarkdown());
|
||||
target.setReturnMiddleJson(getReturnMiddleJson());
|
||||
target.setReturnContentList(getReturnContentList());
|
||||
target.setReturnModelOutput(getReturnModelOutput());
|
||||
target.setReturnImages(getReturnImages());
|
||||
}
|
||||
|
||||
public List<ParseFile> getFiles() {
|
||||
return files;
|
||||
}
|
||||
@@ -54,14 +68,6 @@ public class ParseRequest {
|
||||
this.backend = backend;
|
||||
}
|
||||
|
||||
public String getParseMethod() {
|
||||
return parseMethod;
|
||||
}
|
||||
|
||||
public void setParseMethod(String parseMethod) {
|
||||
this.parseMethod = parseMethod;
|
||||
}
|
||||
|
||||
public List<String> getLanguages() {
|
||||
return languages;
|
||||
}
|
||||
@@ -70,38 +76,6 @@ public class ParseRequest {
|
||||
this.languages = languages == null ? new ArrayList<String>() : languages;
|
||||
}
|
||||
|
||||
public Boolean getFormulaEnabled() {
|
||||
return formulaEnabled;
|
||||
}
|
||||
|
||||
public void setFormulaEnabled(Boolean formulaEnabled) {
|
||||
this.formulaEnabled = formulaEnabled;
|
||||
}
|
||||
|
||||
public Boolean getTableEnabled() {
|
||||
return tableEnabled;
|
||||
}
|
||||
|
||||
public void setTableEnabled(Boolean tableEnabled) {
|
||||
this.tableEnabled = tableEnabled;
|
||||
}
|
||||
|
||||
public Integer getStartPageIndex() {
|
||||
return startPageIndex;
|
||||
}
|
||||
|
||||
public void setStartPageIndex(Integer startPageIndex) {
|
||||
this.startPageIndex = startPageIndex;
|
||||
}
|
||||
|
||||
public Integer getEndPageIndex() {
|
||||
return endPageIndex;
|
||||
}
|
||||
|
||||
public void setEndPageIndex(Integer endPageIndex) {
|
||||
this.endPageIndex = endPageIndex;
|
||||
}
|
||||
|
||||
public Boolean getReturnMarkdown() {
|
||||
return returnMarkdown;
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
/**
|
||||
* 异步任务聚合查询结果。
|
||||
@@ -35,6 +35,11 @@ public class ParseTaskInfo extends ParseTaskStatus {
|
||||
taskInfo.setStatusUrl(status.getStatusUrl());
|
||||
taskInfo.setResultUrl(status.getResultUrl());
|
||||
taskInfo.setQueuedAhead(status.getQueuedAhead());
|
||||
taskInfo.setProgressPercent(status.getProgressPercent());
|
||||
taskInfo.setCurrentStage(status.getCurrentStage());
|
||||
taskInfo.setProcessedItems(status.getProcessedItems());
|
||||
taskInfo.setTotalItems(status.getTotalItems());
|
||||
taskInfo.setStatusMessage(status.getStatusMessage());
|
||||
return taskInfo;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,158 @@
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Status of an asynchronous parse task.
 * <p>
 * Plain mutable bean: every property has a getter and a setter, and the only
 * normalisation is that the file name list is never {@code null}.
 *
 * @author Codex
 * @since 2026-04-14
 */
public class ParseTaskStatus {

    /** Task identifier. */
    private String taskId;
    /** Status name of the task. */
    private String status;
    /** Backend identifier. */
    private String backend;
    /** Names of the files handled by the task; never {@code null}. */
    private List<String> fileNames = new ArrayList<String>();
    /** Creation timestamp, as text. */
    private String createdAt;
    /** Start timestamp, as text. */
    private String startedAt;
    /** Completion timestamp, as text. */
    private String completedAt;
    /** Error description, if any. */
    private String error;
    /** URL for querying the status, if any. */
    private String statusUrl;
    /** URL for fetching the result, if any. */
    private String resultUrl;
    /** Number of tasks queued ahead of this one, if known. */
    private Integer queuedAhead;
    /** Progress percentage. */
    private Integer progressPercent;
    /** Name of the current stage. */
    private String currentStage;
    /** Number of items processed so far. */
    private Integer processedItems;
    /** Total number of items. */
    private Integer totalItems;
    /** Human-readable status description. */
    private String statusMessage;

    public String getTaskId() {
        return taskId;
    }

    public void setTaskId(String taskId) {
        this.taskId = taskId;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBackend() {
        return backend;
    }

    public void setBackend(String backend) {
        this.backend = backend;
    }

    /**
     * Returns the file name list.
     *
     * @return the list held by this object (not a copy); never {@code null}
     */
    public List<String> getFileNames() {
        return fileNames;
    }

    /**
     * Sets the file name list.
     *
     * @param fileNames new list, stored by reference; {@code null} is replaced by a new empty list
     */
    public void setFileNames(List<String> fileNames) {
        this.fileNames = fileNames == null ? new ArrayList<String>() : fileNames;
    }

    public String getCreatedAt() {
        return createdAt;
    }

    public void setCreatedAt(String createdAt) {
        this.createdAt = createdAt;
    }

    public String getStartedAt() {
        return startedAt;
    }

    public void setStartedAt(String startedAt) {
        this.startedAt = startedAt;
    }

    public String getCompletedAt() {
        return completedAt;
    }

    public void setCompletedAt(String completedAt) {
        this.completedAt = completedAt;
    }

    public String getError() {
        return error;
    }

    public void setError(String error) {
        this.error = error;
    }

    public String getStatusUrl() {
        return statusUrl;
    }

    public void setStatusUrl(String statusUrl) {
        this.statusUrl = statusUrl;
    }

    public String getResultUrl() {
        return resultUrl;
    }

    public void setResultUrl(String resultUrl) {
        this.resultUrl = resultUrl;
    }

    public Integer getQueuedAhead() {
        return queuedAhead;
    }

    public void setQueuedAhead(Integer queuedAhead) {
        this.queuedAhead = queuedAhead;
    }

    public Integer getProgressPercent() {
        return progressPercent;
    }

    public void setProgressPercent(Integer progressPercent) {
        this.progressPercent = progressPercent;
    }

    public String getCurrentStage() {
        return currentStage;
    }

    public void setCurrentStage(String currentStage) {
        this.currentStage = currentStage;
    }

    public Integer getProcessedItems() {
        return processedItems;
    }

    public void setProcessedItems(Integer processedItems) {
        this.processedItems = processedItems;
    }

    public Integer getTotalItems() {
        return totalItems;
    }

    public void setTotalItems(Integer totalItems) {
        this.totalItems = totalItems;
    }

    public String getStatusMessage() {
        return statusMessage;
    }

    public void setStatusMessage(String statusMessage) {
        this.statusMessage = statusMessage;
    }
}
|
||||
@@ -0,0 +1,79 @@
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
/**
|
||||
* PDF 解析请求。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class PdfParseRequest extends ParseRequest {
|
||||
|
||||
private String parseMethod = "auto";
|
||||
private Boolean formulaEnabled = true;
|
||||
private Boolean tableEnabled = true;
|
||||
private Integer startPageIndex = 0;
|
||||
private Integer endPageIndex = 99999;
|
||||
|
||||
/**
|
||||
* 将通用请求转换为 PDF 请求。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return PDF 请求
|
||||
*/
|
||||
public static PdfParseRequest from(ParseRequest request) {
|
||||
PdfParseRequest pdfParseRequest = new PdfParseRequest();
|
||||
if (request == null) {
|
||||
return pdfParseRequest;
|
||||
}
|
||||
request.copyCommonFieldsTo(pdfParseRequest);
|
||||
if (request instanceof PdfParseRequest) {
|
||||
PdfParseRequest source = (PdfParseRequest) request;
|
||||
pdfParseRequest.setParseMethod(source.getParseMethod());
|
||||
pdfParseRequest.setFormulaEnabled(source.getFormulaEnabled());
|
||||
pdfParseRequest.setTableEnabled(source.getTableEnabled());
|
||||
pdfParseRequest.setStartPageIndex(source.getStartPageIndex());
|
||||
pdfParseRequest.setEndPageIndex(source.getEndPageIndex());
|
||||
}
|
||||
return pdfParseRequest;
|
||||
}
|
||||
|
||||
public String getParseMethod() {
|
||||
return parseMethod;
|
||||
}
|
||||
|
||||
public void setParseMethod(String parseMethod) {
|
||||
this.parseMethod = parseMethod;
|
||||
}
|
||||
|
||||
public Boolean getFormulaEnabled() {
|
||||
return formulaEnabled;
|
||||
}
|
||||
|
||||
public void setFormulaEnabled(Boolean formulaEnabled) {
|
||||
this.formulaEnabled = formulaEnabled;
|
||||
}
|
||||
|
||||
public Boolean getTableEnabled() {
|
||||
return tableEnabled;
|
||||
}
|
||||
|
||||
public void setTableEnabled(Boolean tableEnabled) {
|
||||
this.tableEnabled = tableEnabled;
|
||||
}
|
||||
|
||||
public Integer getStartPageIndex() {
|
||||
return startPageIndex;
|
||||
}
|
||||
|
||||
public void setStartPageIndex(Integer startPageIndex) {
|
||||
this.startPageIndex = startPageIndex;
|
||||
}
|
||||
|
||||
public Integer getEndPageIndex() {
|
||||
return endPageIndex;
|
||||
}
|
||||
|
||||
public void setEndPageIndex(Integer endPageIndex) {
|
||||
this.endPageIndex = endPageIndex;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
/**
|
||||
* PPTX 解析请求。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class PptxParseRequest extends ParseRequest {
|
||||
|
||||
private Integer startSlideIndex = 0;
|
||||
private Integer endSlideIndex;
|
||||
private Double renderScale = 2.0d;
|
||||
private String imageFormat = "png";
|
||||
private Boolean includeSlideImageReference = true;
|
||||
|
||||
/**
|
||||
* 将通用请求转换为 PPTX 请求。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return PPTX 请求
|
||||
*/
|
||||
public static PptxParseRequest from(ParseRequest request) {
|
||||
PptxParseRequest pptxParseRequest = new PptxParseRequest();
|
||||
if (request == null) {
|
||||
return pptxParseRequest;
|
||||
}
|
||||
request.copyCommonFieldsTo(pptxParseRequest);
|
||||
if (request instanceof PptxParseRequest) {
|
||||
PptxParseRequest source = (PptxParseRequest) request;
|
||||
pptxParseRequest.setStartSlideIndex(source.getStartSlideIndex());
|
||||
pptxParseRequest.setEndSlideIndex(source.getEndSlideIndex());
|
||||
pptxParseRequest.setRenderScale(source.getRenderScale());
|
||||
pptxParseRequest.setImageFormat(source.getImageFormat());
|
||||
pptxParseRequest.setIncludeSlideImageReference(source.getIncludeSlideImageReference());
|
||||
}
|
||||
return pptxParseRequest;
|
||||
}
|
||||
|
||||
public Integer getStartSlideIndex() {
|
||||
return startSlideIndex;
|
||||
}
|
||||
|
||||
public void setStartSlideIndex(Integer startSlideIndex) {
|
||||
this.startSlideIndex = startSlideIndex;
|
||||
}
|
||||
|
||||
public Integer getEndSlideIndex() {
|
||||
return endSlideIndex;
|
||||
}
|
||||
|
||||
public void setEndSlideIndex(Integer endSlideIndex) {
|
||||
this.endSlideIndex = endSlideIndex;
|
||||
}
|
||||
|
||||
public Double getRenderScale() {
|
||||
return renderScale;
|
||||
}
|
||||
|
||||
public void setRenderScale(Double renderScale) {
|
||||
this.renderScale = renderScale;
|
||||
}
|
||||
|
||||
public String getImageFormat() {
|
||||
return imageFormat;
|
||||
}
|
||||
|
||||
public void setImageFormat(String imageFormat) {
|
||||
this.imageFormat = imageFormat;
|
||||
}
|
||||
|
||||
public Boolean getIncludeSlideImageReference() {
|
||||
return includeSlideImageReference;
|
||||
}
|
||||
|
||||
public void setIncludeSlideImageReference(Boolean includeSlideImageReference) {
|
||||
this.includeSlideImageReference = includeSlideImageReference;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
package com.easyagents.document.core.entity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* XLSX 解析请求。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class XlsxParseRequest extends ParseRequest {
|
||||
|
||||
private List<String> sheetNames = new ArrayList<String>();
|
||||
private Boolean includeHiddenSheets = false;
|
||||
private Boolean ocrEmbeddedImages = true;
|
||||
private Integer maxRowsPerSheet;
|
||||
private Boolean includeImageAppendix = true;
|
||||
|
||||
/**
|
||||
* 将通用请求转换为 XLSX 请求。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return XLSX 请求
|
||||
*/
|
||||
public static XlsxParseRequest from(ParseRequest request) {
|
||||
XlsxParseRequest xlsxParseRequest = new XlsxParseRequest();
|
||||
if (request == null) {
|
||||
return xlsxParseRequest;
|
||||
}
|
||||
request.copyCommonFieldsTo(xlsxParseRequest);
|
||||
if (request instanceof XlsxParseRequest) {
|
||||
XlsxParseRequest source = (XlsxParseRequest) request;
|
||||
xlsxParseRequest.setSheetNames(new ArrayList<String>(source.getSheetNames()));
|
||||
xlsxParseRequest.setIncludeHiddenSheets(source.getIncludeHiddenSheets());
|
||||
xlsxParseRequest.setOcrEmbeddedImages(source.getOcrEmbeddedImages());
|
||||
xlsxParseRequest.setMaxRowsPerSheet(source.getMaxRowsPerSheet());
|
||||
xlsxParseRequest.setIncludeImageAppendix(source.getIncludeImageAppendix());
|
||||
}
|
||||
return xlsxParseRequest;
|
||||
}
|
||||
|
||||
public List<String> getSheetNames() {
|
||||
return sheetNames;
|
||||
}
|
||||
|
||||
public void setSheetNames(List<String> sheetNames) {
|
||||
this.sheetNames = sheetNames == null ? new ArrayList<String>() : sheetNames;
|
||||
}
|
||||
|
||||
public Boolean getIncludeHiddenSheets() {
|
||||
return includeHiddenSheets;
|
||||
}
|
||||
|
||||
public void setIncludeHiddenSheets(Boolean includeHiddenSheets) {
|
||||
this.includeHiddenSheets = includeHiddenSheets;
|
||||
}
|
||||
|
||||
public Boolean getOcrEmbeddedImages() {
|
||||
return ocrEmbeddedImages;
|
||||
}
|
||||
|
||||
public void setOcrEmbeddedImages(Boolean ocrEmbeddedImages) {
|
||||
this.ocrEmbeddedImages = ocrEmbeddedImages;
|
||||
}
|
||||
|
||||
public Integer getMaxRowsPerSheet() {
|
||||
return maxRowsPerSheet;
|
||||
}
|
||||
|
||||
public void setMaxRowsPerSheet(Integer maxRowsPerSheet) {
|
||||
this.maxRowsPerSheet = maxRowsPerSheet;
|
||||
}
|
||||
|
||||
public Boolean getIncludeImageAppendix() {
|
||||
return includeImageAppendix;
|
||||
}
|
||||
|
||||
public void setIncludeImageAppendix(Boolean includeImageAppendix) {
|
||||
this.includeImageAppendix = includeImageAppendix;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,217 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import okhttp3.MediaType;
|
||||
import okhttp3.MultipartBody;
|
||||
import okhttp3.OkHttpClient;
|
||||
import okhttp3.Request;
|
||||
import okhttp3.RequestBody;
|
||||
import okhttp3.Response;
|
||||
import okhttp3.ResponseBody;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URLConnection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* MinerU HTTP 客户端。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruClient {
|
||||
|
||||
private static final MediaType DEFAULT_MEDIA_TYPE = MediaType.parse("application/octet-stream");
|
||||
|
||||
private final String baseUrl;
|
||||
private final OkHttpClient okHttpClient;
|
||||
private final MineruMapper mineruMapper;
|
||||
|
||||
/**
|
||||
* 创建客户端。
|
||||
*
|
||||
* @param properties MinerU 配置
|
||||
* @param mineruMapper DTO 映射器
|
||||
*/
|
||||
public MineruClient(MineruProperties properties, MineruMapper mineruMapper) {
|
||||
this(
|
||||
properties,
|
||||
new OkHttpClient.Builder()
|
||||
.connectTimeout(properties.getConnectTimeoutMs(), TimeUnit.MILLISECONDS)
|
||||
.readTimeout(properties.getReadTimeoutMs(), TimeUnit.MILLISECONDS)
|
||||
.writeTimeout(properties.getWriteTimeoutMs(), TimeUnit.MILLISECONDS)
|
||||
.build(),
|
||||
mineruMapper
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建客户端。
|
||||
*
|
||||
* @param properties MinerU 配置
|
||||
* @param okHttpClient HTTP 客户端
|
||||
* @param mineruMapper DTO 映射器
|
||||
*/
|
||||
public MineruClient(MineruProperties properties, OkHttpClient okHttpClient, MineruMapper mineruMapper) {
|
||||
if (properties == null || !StringUtil.hasText(properties.getBaseUrl())) {
|
||||
throw new IllegalArgumentException("MinerU baseUrl must not be empty");
|
||||
}
|
||||
this.baseUrl = normalizeBaseUrl(properties.getBaseUrl());
|
||||
this.okHttpClient = okHttpClient;
|
||||
this.mineruMapper = mineruMapper;
|
||||
}
|
||||
|
||||
/**
|
||||
* 调用同步解析接口。
|
||||
*
|
||||
* @param request 解析请求
|
||||
* @return 原始结果
|
||||
*/
|
||||
public MineruResultPayload parse(ParseRequest request) {
|
||||
return mineruMapper.toResultPayload(executeJsonMultipart("/file_parse", request, buildSyncFormFields(request)));
|
||||
}
|
||||
|
||||
/**
|
||||
* 提交异步解析任务。
|
||||
*
|
||||
* @param request 解析请求
|
||||
* @return 原始任务状态
|
||||
*/
|
||||
public MineruTaskStatus submit(ParseRequest request) {
|
||||
return mineruMapper.toTaskStatus(executeJsonMultipart("/tasks", request, buildAsyncFormFields(request)));
|
||||
}
|
||||
|
||||
/**
|
||||
* 查询任务状态。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 原始任务状态
|
||||
*/
|
||||
public MineruTaskStatus queryTask(String taskId) {
|
||||
return mineruMapper.toTaskStatus(executeJsonGet("/tasks/" + taskId));
|
||||
}
|
||||
|
||||
/**
|
||||
* 下载异步结果 ZIP。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return ZIP 二进制
|
||||
*/
|
||||
public byte[] queryResultZip(String taskId) {
|
||||
String path = "/tasks/" + taskId + "/result";
|
||||
Request request = new Request.Builder().url(baseUrl + path).get().build();
|
||||
try (Response response = okHttpClient.newCall(request).execute()) {
|
||||
ResponseBody body = response.body();
|
||||
byte[] responseBytes = body == null ? new byte[0] : body.bytes();
|
||||
if (!response.isSuccessful()) {
|
||||
throw buildHttpException(path, response.code(), responseBytes);
|
||||
}
|
||||
String contentType = response.header("Content-Type");
|
||||
if (contentType != null && contentType.contains("application/json")) {
|
||||
JSONObject jsonObject = JSON.parseObject(new String(responseBytes));
|
||||
throw new DocumentParseException("MinerU async result is not ready: " + jsonObject.toJSONString());
|
||||
}
|
||||
if (responseBytes.length < 2 || responseBytes[0] != 'P' || responseBytes[1] != 'K') {
|
||||
throw new DocumentParseException("MinerU async result is not a valid ZIP payload");
|
||||
}
|
||||
return responseBytes;
|
||||
} catch (IOException exception) {
|
||||
throw new DocumentParseException("Failed to query MinerU result ZIP", exception);
|
||||
}
|
||||
}
|
||||
|
||||
protected JSONObject executeJsonMultipart(String path, ParseRequest request, Map<String, List<String>> fields) {
|
||||
MultipartBody.Builder formBuilder = new MultipartBody.Builder().setType(MultipartBody.FORM);
|
||||
appendFiles(formBuilder, request.getFiles());
|
||||
appendStringFields(formBuilder, fields);
|
||||
Request httpRequest = new Request.Builder()
|
||||
.url(baseUrl + path)
|
||||
.post(formBuilder.build())
|
||||
.build();
|
||||
return executeJsonRequest(path, httpRequest);
|
||||
}
|
||||
|
||||
protected JSONObject executeJsonGet(String path) {
|
||||
Request request = new Request.Builder().url(baseUrl + path).get().build();
|
||||
return executeJsonRequest(path, request);
|
||||
}
|
||||
|
||||
protected JSONObject executeJsonRequest(String path, Request request) {
|
||||
try (Response response = okHttpClient.newCall(request).execute()) {
|
||||
ResponseBody body = response.body();
|
||||
String bodyText = body == null ? "" : body.string();
|
||||
if (!response.isSuccessful()) {
|
||||
throw buildHttpException(path, response.code(), bodyText == null ? new byte[0] : bodyText.getBytes());
|
||||
}
|
||||
return JSON.parseObject(bodyText);
|
||||
} catch (IOException exception) {
|
||||
throw new DocumentParseException("Failed to call MinerU endpoint: " + path, exception);
|
||||
}
|
||||
}
|
||||
|
||||
private void appendFiles(MultipartBody.Builder formBuilder, List<ParseFile> files) {
|
||||
if (files == null || files.isEmpty()) {
|
||||
throw new IllegalArgumentException("Parse request must contain at least one file");
|
||||
}
|
||||
for (ParseFile file : files) {
|
||||
if (file == null || !StringUtil.hasText(file.getFileName()) || file.getContent() == null) {
|
||||
throw new IllegalArgumentException("Parse request contains an invalid file");
|
||||
}
|
||||
MediaType mediaType = StringUtil.hasText(file.getContentType())
|
||||
? MediaType.parse(file.getContentType())
|
||||
: detectMediaType(file.getFileName());
|
||||
formBuilder.addFormDataPart(
|
||||
"files",
|
||||
file.getFileName(),
|
||||
RequestBody.create(file.getContent(), mediaType)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
private void appendStringFields(MultipartBody.Builder formBuilder, Map<String, List<String>> fields) {
|
||||
for (Map.Entry<String, List<String>> entry : fields.entrySet()) {
|
||||
if (entry.getValue() == null) {
|
||||
continue;
|
||||
}
|
||||
for (String value : entry.getValue()) {
|
||||
if (value != null) {
|
||||
formBuilder.addFormDataPart(entry.getKey(), value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, List<String>> buildSyncFormFields(ParseRequest request) {
|
||||
return mineruMapper.buildSyncFormFields(request);
|
||||
}
|
||||
|
||||
private Map<String, List<String>> buildAsyncFormFields(ParseRequest request) {
|
||||
return mineruMapper.buildAsyncFormFields(request);
|
||||
}
|
||||
|
||||
private DocumentParseException buildHttpException(String path, int statusCode, byte[] bodyBytes) {
|
||||
String bodyText = bodyBytes == null ? "" : new String(bodyBytes);
|
||||
return new DocumentParseException(
|
||||
"MinerU request failed: path=" + path + ", status=" + statusCode + ", body=" + bodyText
|
||||
);
|
||||
}
|
||||
|
||||
private String normalizeBaseUrl(String baseUrl) {
|
||||
if (baseUrl.endsWith("/")) {
|
||||
return baseUrl.substring(0, baseUrl.length() - 1);
|
||||
}
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
private MediaType detectMediaType(String fileName) {
|
||||
String mimeType = URLConnection.guessContentTypeFromName(fileName);
|
||||
return StringUtil.hasText(mimeType) ? MediaType.parse(mimeType) : DEFAULT_MEDIA_TYPE;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,218 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* 基于 MinerU API 的文档解析服务,支持 docx 文档和 pdf 文档。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruDocumentParseService<R extends ParseRequest> implements DocumentParseService<R> {
|
||||
|
||||
public static final String PROVIDER_NAME = "mineru";
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(MineruDocumentParseService.class);
|
||||
|
||||
private final MineruProperties properties;
|
||||
private final MineruClient client;
|
||||
private final MineruMapper mapper;
|
||||
|
||||
/**
|
||||
* 创建默认服务实例。
|
||||
*
|
||||
* @param properties MinerU 配置
|
||||
*/
|
||||
public MineruDocumentParseService(MineruProperties properties) {
|
||||
this(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建默认服务实例。
|
||||
*
|
||||
* @param properties MinerU 配置
|
||||
* @param mapper 结果映射器
|
||||
*/
|
||||
public MineruDocumentParseService(MineruProperties properties, MineruMapper mapper) {
|
||||
this(properties, new MineruClient(properties, mapper), mapper);
|
||||
}
|
||||
|
||||
/**
|
||||
* 创建服务实例。
|
||||
*
|
||||
* @param properties MinerU 配置
|
||||
* @param client HTTP 客户端
|
||||
* @param mapper 结果映射器
|
||||
*/
|
||||
public MineruDocumentParseService(MineruProperties properties, MineruClient client, MineruMapper mapper) {
|
||||
this.properties = properties;
|
||||
this.client = client;
|
||||
this.mapper = mapper;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||
LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend());
|
||||
ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
|
||||
LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus submit(ParseRequest request) {
|
||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||
normalizedRequest.setReturnMarkdown(true);
|
||||
normalizedRequest.setReturnMiddleJson(true);
|
||||
normalizedRequest.setReturnContentList(true);
|
||||
normalizedRequest.setReturnModelOutput(true);
|
||||
normalizedRequest.setReturnImages(true);
|
||||
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend());
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
||||
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskStatus == null ? null : taskStatus.getTaskId(),
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
|
||||
LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);
|
||||
MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
|
||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
||||
ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(mapper.toParseTaskStatus(taskStatus));
|
||||
if ("completed".equalsIgnoreCase(taskStatus.getStatus())) {
|
||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
taskInfo.setResult(response);
|
||||
}
|
||||
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskInfo.getStatus(),
|
||||
taskInfo.getResult() != null);
|
||||
return taskInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取 MinerU 配置。
|
||||
*
|
||||
* @return MinerU 配置
|
||||
*/
|
||||
protected MineruProperties getProperties() {
|
||||
return properties;
|
||||
}
|
||||
|
||||
/**
|
||||
* 归一化解析请求,补齐默认参数。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return 归一化后的请求
|
||||
*/
|
||||
protected ParseRequest normalizeRequest(ParseRequest request) {
|
||||
if (request == null) {
|
||||
throw new IllegalArgumentException("ParseRequest must not be null");
|
||||
}
|
||||
if (request.getFiles() == null || request.getFiles().isEmpty()) {
|
||||
throw new IllegalArgumentException("ParseRequest files must not be empty");
|
||||
}
|
||||
ParseRequest normalizedRequest = new ParseRequest();
|
||||
normalizedRequest.setFiles(new ArrayList<ParseFile>(request.getFiles()));
|
||||
normalizedRequest.setBackend(StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
|
||||
normalizedRequest.setLanguages(
|
||||
request.getLanguages() == null || request.getLanguages().isEmpty()
|
||||
? new ArrayList<String>(properties.getDefaultLangList())
|
||||
: new ArrayList<String>(request.getLanguages())
|
||||
);
|
||||
normalizedRequest.setReturnMarkdown(request.getReturnMarkdown() == null ? Boolean.TRUE : request.getReturnMarkdown());
|
||||
normalizedRequest.setReturnMiddleJson(request.getReturnMiddleJson() == null ? Boolean.TRUE : request.getReturnMiddleJson());
|
||||
normalizedRequest.setReturnContentList(request.getReturnContentList() == null ? Boolean.TRUE : request.getReturnContentList());
|
||||
normalizedRequest.setReturnModelOutput(request.getReturnModelOutput() == null ? Boolean.FALSE : request.getReturnModelOutput());
|
||||
normalizedRequest.setReturnImages(request.getReturnImages() == null ? Boolean.TRUE : request.getReturnImages());
|
||||
return normalizedRequest;
|
||||
}
|
||||
|
||||
/**
|
||||
* 校验任务 ID。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
*/
|
||||
protected void validateTaskId(String taskId) {
|
||||
if (!StringUtil.hasText(taskId)) {
|
||||
throw new IllegalArgumentException("taskId must not be empty");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 轮询任务状态直到完成或失败。
|
||||
*
|
||||
* @param taskId 任务 ID
|
||||
* @return 已完成的任务状态
|
||||
*/
|
||||
protected MineruTaskStatus waitForTaskCompleted(String taskId) {
|
||||
long deadline = System.currentTimeMillis() + properties.getResultTimeoutMs();
|
||||
while (true) {
|
||||
MineruTaskStatus taskStatus = client.queryTask(taskId);
|
||||
if ("completed".equals(taskStatus.getStatus())) {
|
||||
return taskStatus;
|
||||
}
|
||||
if ("failed".equals(taskStatus.getStatus())) {
|
||||
throw new DocumentParseException("MinerU task failed: " + taskStatus.getError());
|
||||
}
|
||||
if (System.currentTimeMillis() >= deadline) {
|
||||
throw new DocumentParseException("MinerU task result timeout: " + taskId);
|
||||
}
|
||||
try {
|
||||
Thread.sleep(properties.getPollIntervalMs());
|
||||
} catch (InterruptedException exception) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new DocumentParseException("Interrupted while waiting for MinerU task: " + taskId, exception);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,923 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSON;
|
||||
import com.alibaba.fastjson2.JSONArray;
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.core.util.StringUtil;
|
||||
import com.easyagents.document.core.exception.DocumentParseException;
|
||||
import com.easyagents.document.core.entity.DocumentBlock;
|
||||
import com.easyagents.document.core.entity.DocumentImage;
|
||||
import com.easyagents.document.core.entity.DocumentPage;
|
||||
import com.easyagents.document.core.entity.DocumentTable;
|
||||
import com.easyagents.document.core.entity.PdfParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseArtifacts;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URLConnection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Base64;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
/**
|
||||
* MinerU 原始协议与统一模型之间的映射器。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruMapper {
|
||||
|
||||
private final MineruProperties properties;
|
||||
|
||||
/**
 * Creates a mapper bound to the given configuration.
 *
 * @param properties MinerU configuration, kept as a reference for default values
 */
public MineruMapper(MineruProperties properties) {
    this.properties = properties;
}
|
||||
|
||||
/**
|
||||
* 构建同步请求表单字段。
|
||||
*
|
||||
* @param request 解析请求
|
||||
* @return 表单字段
|
||||
*/
|
||||
public Map<String, List<String>> buildSyncFormFields(ParseRequest request) {
|
||||
Map<String, List<String>> fields = buildBaseFormFields(request);
|
||||
putSingleValue(fields, "return_md", String.valueOf(isTrue(request.getReturnMarkdown())));
|
||||
putSingleValue(fields, "return_middle_json", String.valueOf(isTrue(request.getReturnMiddleJson())));
|
||||
putSingleValue(fields, "return_content_list", String.valueOf(isTrue(request.getReturnContentList())));
|
||||
putSingleValue(fields, "return_model_output", String.valueOf(isTrue(request.getReturnModelOutput())));
|
||||
putSingleValue(fields, "return_images", String.valueOf(isTrue(request.getReturnImages())));
|
||||
putSingleValue(fields, "response_format_zip", "false");
|
||||
return fields;
|
||||
}
|
||||
|
||||
/**
|
||||
* 构建异步请求表单字段。
|
||||
*
|
||||
* @param request 解析请求
|
||||
* @return 表单字段
|
||||
*/
|
||||
public Map<String, List<String>> buildAsyncFormFields(ParseRequest request) {
|
||||
Map<String, List<String>> fields = buildBaseFormFields(request);
|
||||
putSingleValue(fields, "return_md", "true");
|
||||
putSingleValue(fields, "return_middle_json", "true");
|
||||
putSingleValue(fields, "return_content_list", "true");
|
||||
putSingleValue(fields, "return_model_output", "true");
|
||||
putSingleValue(fields, "return_images", "true");
|
||||
putSingleValue(fields, "response_format_zip", "true");
|
||||
return fields;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将原始 JSON 转为 MinerU 任务状态 DTO。
|
||||
*
|
||||
* @param jsonObject 原始 JSON
|
||||
* @return 任务状态 DTO
|
||||
*/
|
||||
public MineruTaskStatus toTaskStatus(JSONObject jsonObject) {
|
||||
MineruTaskStatus taskStatus = new MineruTaskStatus();
|
||||
taskStatus.setTaskId(jsonObject.getString("task_id"));
|
||||
taskStatus.setStatus(jsonObject.getString("status"));
|
||||
taskStatus.setBackend(jsonObject.getString("backend"));
|
||||
taskStatus.setFileNames(toStringList(jsonObject.getJSONArray("file_names")));
|
||||
taskStatus.setCreatedAt(jsonObject.getString("created_at"));
|
||||
taskStatus.setStartedAt(jsonObject.getString("started_at"));
|
||||
taskStatus.setCompletedAt(jsonObject.getString("completed_at"));
|
||||
taskStatus.setError(jsonObject.getString("error"));
|
||||
taskStatus.setStatusUrl(jsonObject.getString("status_url"));
|
||||
taskStatus.setResultUrl(jsonObject.getString("result_url"));
|
||||
taskStatus.setQueuedAhead(jsonObject.getInteger("queued_ahead"));
|
||||
taskStatus.setVersion(jsonObject.getString("version"));
|
||||
taskStatus.setMessage(jsonObject.getString("message"));
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将原始 JSON 转为 MinerU 结果 DTO。
|
||||
*
|
||||
* @param jsonObject 原始 JSON
|
||||
* @return 结果 DTO
|
||||
*/
|
||||
public MineruResultPayload toResultPayload(JSONObject jsonObject) {
|
||||
MineruResultPayload payload = new MineruResultPayload();
|
||||
payload.setBackend(jsonObject.getString("backend"));
|
||||
payload.setVersion(jsonObject.getString("version"));
|
||||
Map<String, JSONObject> results = new LinkedHashMap<String, JSONObject>();
|
||||
JSONObject resultJson = jsonObject.getJSONObject("results");
|
||||
if (resultJson != null) {
|
||||
for (String key : resultJson.keySet()) {
|
||||
results.put(key, resultJson.getJSONObject(key));
|
||||
}
|
||||
}
|
||||
payload.setResults(results);
|
||||
return payload;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将 MinerU 任务状态转为统一模型。
|
||||
*
|
||||
* @param taskStatus 原始任务状态
|
||||
* @return 统一任务状态
|
||||
*/
|
||||
public ParseTaskStatus toParseTaskStatus(MineruTaskStatus taskStatus) {
|
||||
ParseTaskStatus status = new ParseTaskStatus();
|
||||
status.setTaskId(taskStatus.getTaskId());
|
||||
status.setStatus(taskStatus.getStatus());
|
||||
status.setBackend(taskStatus.getBackend());
|
||||
status.setFileNames(taskStatus.getFileNames());
|
||||
status.setCreatedAt(taskStatus.getCreatedAt());
|
||||
status.setStartedAt(taskStatus.getStartedAt());
|
||||
status.setCompletedAt(taskStatus.getCompletedAt());
|
||||
status.setError(taskStatus.getError());
|
||||
status.setStatusUrl(taskStatus.getStatusUrl());
|
||||
status.setResultUrl(taskStatus.getResultUrl());
|
||||
status.setQueuedAhead(taskStatus.getQueuedAhead());
|
||||
return status;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将同步 JSON 结果转为统一响应。
|
||||
*
|
||||
* @param payload MinerU 结果 DTO
|
||||
* @return 统一响应
|
||||
*/
|
||||
public ParseResponse toParseResponse(MineruResultPayload payload) {
|
||||
ParseResponse response = new ParseResponse();
|
||||
response.setBackend(payload.getBackend());
|
||||
response.setVersion(payload.getVersion());
|
||||
List<ParseResult> parseResults = new ArrayList<ParseResult>();
|
||||
for (Map.Entry<String, JSONObject> entry : payload.getResults().entrySet()) {
|
||||
parseResults.add(mapSingleResult(entry.getKey(), entry.getValue()));
|
||||
}
|
||||
response.setResults(parseResults);
|
||||
return response;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将 ZIP 结果转为统一响应。
|
||||
*
|
||||
* @param zipBytes ZIP 二进制
|
||||
* @return 统一响应
|
||||
*/
|
||||
public ParseResponse fromZip(byte[] zipBytes) {
|
||||
Map<String, ZipArtifactBundle> bundles = unzip(zipBytes);
|
||||
if (bundles.isEmpty()) {
|
||||
throw new DocumentParseException("MinerU ZIP result does not contain any parse artifacts");
|
||||
}
|
||||
ParseResponse response = new ParseResponse();
|
||||
List<ParseResult> parseResults = new ArrayList<ParseResult>();
|
||||
for (Map.Entry<String, ZipArtifactBundle> entry : bundles.entrySet()) {
|
||||
parseResults.add(mapZipBundle(entry.getKey(), entry.getValue()));
|
||||
}
|
||||
response.setResults(parseResults);
|
||||
return response;
|
||||
}
|
||||
|
||||
/**
|
||||
* 使用异步任务状态和 ZIP 内部工件回填响应元数据。
|
||||
*
|
||||
* @param response 统一响应
|
||||
* @param backend 任务状态中的 backend
|
||||
* @param version 任务状态中的 version
|
||||
*/
|
||||
public void enrichAsyncResponse(ParseResponse response, String backend, String version) {
|
||||
if (response == null) {
|
||||
return;
|
||||
}
|
||||
response.setBackend(StringUtil.hasText(backend) ? backend : resolveBackendFromResults(response));
|
||||
String resolvedVersion = StringUtil.hasText(version) ? version : resolveVersionFromResults(response);
|
||||
response.setVersion(resolvedVersion);
|
||||
}
|
||||
|
||||
private Map<String, List<String>> buildBaseFormFields(ParseRequest request) {
|
||||
Map<String, List<String>> fields = new LinkedHashMap<String, List<String>>();
|
||||
putSingleValue(fields, "backend", StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
|
||||
List<String> languages = request.getLanguages();
|
||||
if (languages == null || languages.isEmpty()) {
|
||||
languages = properties.getDefaultLangList();
|
||||
}
|
||||
if (languages != null && !languages.isEmpty()) {
|
||||
fields.put("lang_list", new ArrayList<String>(languages));
|
||||
}
|
||||
if (request instanceof PdfParseRequest) {
|
||||
PdfParseRequest pdfParseRequest = (PdfParseRequest) request;
|
||||
putSingleValue(fields, "parse_method",
|
||||
StringUtil.hasText(pdfParseRequest.getParseMethod()) ? pdfParseRequest.getParseMethod() : properties.getDefaultParseMethod());
|
||||
putSingleValue(fields, "formula_enable",
|
||||
String.valueOf(boolOrDefault(pdfParseRequest.getFormulaEnabled(), properties.getDefaultFormulaEnable())));
|
||||
putSingleValue(fields, "table_enable",
|
||||
String.valueOf(boolOrDefault(pdfParseRequest.getTableEnabled(), properties.getDefaultTableEnable())));
|
||||
putSingleValue(fields, "start_page_id", String.valueOf(intOrDefault(pdfParseRequest.getStartPageIndex(), 0)));
|
||||
putSingleValue(fields, "end_page_id", String.valueOf(intOrDefault(pdfParseRequest.getEndPageIndex(), 99999)));
|
||||
}
|
||||
return fields;
|
||||
}
|
||||
|
||||
private void putSingleValue(Map<String, List<String>> fields, String key, String value) {
|
||||
List<String> values = new ArrayList<String>(1);
|
||||
values.add(value);
|
||||
fields.put(key, values);
|
||||
}
|
||||
|
||||
private ParseResult mapSingleResult(String fileName, JSONObject fileResult) {
|
||||
ParseResult result = new ParseResult();
|
||||
result.setFileName(fileName);
|
||||
result.setMarkdown(fileResult.getString("md_content"));
|
||||
result.setPlainText(result.getMarkdown());
|
||||
|
||||
ParseArtifacts artifacts = new ParseArtifacts();
|
||||
artifacts.setMiddleJson(fileResult.get("middle_json"));
|
||||
artifacts.setContentList(fileResult.get("content_list"));
|
||||
artifacts.setModelOutput(fileResult.get("model_output"));
|
||||
result.setArtifacts(artifacts);
|
||||
|
||||
Map<String, String> imageDataUrls = toStringMap(fileResult.getJSONObject("images"));
|
||||
Map<String, byte[]> imageContents = toBinaryMap(imageDataUrls);
|
||||
applyStructuredArtifacts(result, imageDataUrls, imageContents);
|
||||
if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) {
|
||||
result.getWarnings().add("MinerU did not return markdown, middle_json or content_list");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private ParseResult mapZipBundle(String fileName, ZipArtifactBundle bundle) {
|
||||
ParseResult result = new ParseResult();
|
||||
result.setFileName(fileName);
|
||||
|
||||
String markdown = firstText(bundle.entriesBySuffix, ".md");
|
||||
result.setMarkdown(markdown);
|
||||
result.setPlainText(markdown);
|
||||
|
||||
ParseArtifacts artifacts = new ParseArtifacts();
|
||||
Object middleArtifact = firstJsonValue(bundle.entriesBySuffix, "_middle.json");
|
||||
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
|
||||
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
|
||||
|
||||
JSONObject middleJson = asObject(middleArtifact);
|
||||
JSONArray contentList = asArray(contentListArtifact);
|
||||
Object modelOutput = modelOutputArtifact;
|
||||
|
||||
if (contentList == null && middleArtifact instanceof JSONArray) {
|
||||
contentList = (JSONArray) middleArtifact;
|
||||
middleJson = null;
|
||||
middleArtifact = null;
|
||||
}
|
||||
if (contentList == null && modelOutputArtifact instanceof JSONArray) {
|
||||
contentList = (JSONArray) modelOutputArtifact;
|
||||
}
|
||||
|
||||
artifacts.setMiddleJson(middleArtifact);
|
||||
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
|
||||
artifacts.setModelOutput(modelOutput);
|
||||
|
||||
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
|
||||
if (contentListV2 != null) {
|
||||
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
|
||||
}
|
||||
|
||||
for (Map.Entry<String, byte[]> entry : bundle.otherBinaryEntries.entrySet()) {
|
||||
artifacts.getExtraBinaryArtifacts().put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
result.setArtifacts(artifacts);
|
||||
|
||||
Map<String, String> imageDataUrls = new LinkedHashMap<String, String>();
|
||||
Map<String, byte[]> imageContents = new LinkedHashMap<String, byte[]>();
|
||||
for (Map.Entry<String, byte[]> imageEntry : bundle.images.entrySet()) {
|
||||
imageDataUrls.put(imageEntry.getKey(), toDataUrl(imageEntry.getKey(), imageEntry.getValue()));
|
||||
imageContents.put(imageEntry.getKey(), imageEntry.getValue());
|
||||
}
|
||||
applyStructuredArtifacts(result, imageDataUrls, imageContents);
|
||||
|
||||
if (markdown == null && middleJson == null && contentList == null) {
|
||||
throw new DocumentParseException("MinerU ZIP result missing critical artifacts for file: " + fileName);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls, Map<String, byte[]> imageContents) {
|
||||
JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson());
|
||||
JSONArray contentList = asArray(result.getArtifacts().getContentList());
|
||||
|
||||
if (middleJson != null) {
|
||||
fillPages(result, middleJson);
|
||||
result.getMetadata().put("middleBackend", middleJson.getString("_backend"));
|
||||
result.getMetadata().put("middleVersion", middleJson.getString("_version_name"));
|
||||
}
|
||||
|
||||
if (contentList != null) {
|
||||
fillFromContentList(result, contentList, imageDataUrls, imageContents);
|
||||
} else if (middleJson != null) {
|
||||
fillFromMiddleJson(result, middleJson, imageDataUrls, imageContents);
|
||||
}
|
||||
|
||||
if ((result.getImages() == null || result.getImages().isEmpty()) && imageDataUrls != null && !imageDataUrls.isEmpty()) {
|
||||
for (Map.Entry<String, String> entry : imageDataUrls.entrySet()) {
|
||||
DocumentImage image = new DocumentImage();
|
||||
image.setName(baseName(entry.getKey()));
|
||||
image.setSourcePath(entry.getKey());
|
||||
image.setDataUrl(entry.getValue());
|
||||
image.setContent(matchBinaryContent(entry.getKey(), imageContents));
|
||||
image.setMimeType(detectMimeType(entry.getKey()));
|
||||
result.getImages().add(image);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void fillPages(ParseResult result, JSONObject middleJson) {
|
||||
JSONArray pdfInfo = middleJson.getJSONArray("pdf_info");
|
||||
if (pdfInfo == null) {
|
||||
return;
|
||||
}
|
||||
List<DocumentPage> pages = new ArrayList<DocumentPage>();
|
||||
for (int index = 0; index < pdfInfo.size(); index++) {
|
||||
JSONObject pageJson = pdfInfo.getJSONObject(index);
|
||||
DocumentPage page = new DocumentPage();
|
||||
page.setPageIndex(pageJson.getInteger("page_idx"));
|
||||
JSONArray pageSize = pageJson.getJSONArray("page_size");
|
||||
if (pageSize != null && pageSize.size() >= 2) {
|
||||
page.setWidth(pageSize.getDouble(0));
|
||||
page.setHeight(pageSize.getDouble(1));
|
||||
}
|
||||
page.getMetadata().put("raw", pageJson);
|
||||
pages.add(page);
|
||||
}
|
||||
result.setPages(pages);
|
||||
}
|
||||
|
||||
private void fillFromContentList(ParseResult result,
|
||||
JSONArray contentList,
|
||||
Map<String, String> imageDataUrls,
|
||||
Map<String, byte[]> imageContents) {
|
||||
for (int index = 0; index < contentList.size(); index++) {
|
||||
JSONObject item = contentList.getJSONObject(index);
|
||||
if (item == null) {
|
||||
continue;
|
||||
}
|
||||
DocumentBlock block = new DocumentBlock();
|
||||
block.setType(item.getString("type"));
|
||||
block.setPageIndex(item.getInteger("page_idx"));
|
||||
block.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
|
||||
Integer blockLevel = item.getInteger("text_level");
|
||||
if (blockLevel == null) {
|
||||
blockLevel = item.getInteger("level");
|
||||
}
|
||||
block.setLevel(blockLevel);
|
||||
block.setText(extractBlockText(item));
|
||||
block.setHtml(item.getString("table_body"));
|
||||
block.setImagePath(item.getString("img_path"));
|
||||
block.getMetadata().put("raw", item);
|
||||
result.getBlocks().add(block);
|
||||
|
||||
if ("table".equals(item.getString("type"))) {
|
||||
DocumentTable table = new DocumentTable();
|
||||
table.setPageIndex(item.getInteger("page_idx"));
|
||||
table.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
|
||||
table.setHtml(item.getString("table_body"));
|
||||
table.setImagePath(item.getString("img_path"));
|
||||
table.setCaptions(toStringList(item.getJSONArray("table_caption")));
|
||||
table.setFootnotes(toStringList(item.getJSONArray("table_footnote")));
|
||||
result.getTables().add(table);
|
||||
}
|
||||
|
||||
if (isVisualType(item.getString("type"))) {
|
||||
DocumentImage image = new DocumentImage();
|
||||
image.setPageIndex(item.getInteger("page_idx"));
|
||||
image.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
|
||||
image.setSourcePath(item.getString("img_path"));
|
||||
image.setName(baseName(item.getString("img_path")));
|
||||
image.setMimeType(detectMimeType(item.getString("img_path")));
|
||||
image.setCaptions(extractCaptions(item));
|
||||
image.setFootnotes(extractFootnotes(item));
|
||||
image.setDataUrl(matchDataUrl(item.getString("img_path"), imageDataUrls));
|
||||
image.setContent(matchBinaryContent(item.getString("img_path"), imageContents));
|
||||
result.getImages().add(image);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void fillFromMiddleJson(ParseResult result,
|
||||
JSONObject middleJson,
|
||||
Map<String, String> imageDataUrls,
|
||||
Map<String, byte[]> imageContents) {
|
||||
JSONArray pages = middleJson.getJSONArray("pdf_info");
|
||||
if (pages == null) {
|
||||
return;
|
||||
}
|
||||
for (int pageIndex = 0; pageIndex < pages.size(); pageIndex++) {
|
||||
JSONObject page = pages.getJSONObject(pageIndex);
|
||||
fillBlocksFromMiddlePage(result, page.getJSONArray("para_blocks"), page.getInteger("page_idx"));
|
||||
fillVisualsFromMiddlePage(result, page.getJSONArray("tables"), page.getInteger("page_idx"), true, imageDataUrls, imageContents);
|
||||
fillVisualsFromMiddlePage(result, page.getJSONArray("images"), page.getInteger("page_idx"), false, imageDataUrls, imageContents);
|
||||
}
|
||||
}
|
||||
|
||||
private void fillBlocksFromMiddlePage(ParseResult result, JSONArray blocks, Integer pageIndex) {
|
||||
if (blocks == null) {
|
||||
return;
|
||||
}
|
||||
for (int index = 0; index < blocks.size(); index++) {
|
||||
JSONObject blockJson = blocks.getJSONObject(index);
|
||||
if (blockJson == null) {
|
||||
continue;
|
||||
}
|
||||
DocumentBlock block = new DocumentBlock();
|
||||
block.setType(blockJson.getString("type"));
|
||||
block.setPageIndex(pageIndex);
|
||||
block.setBoundingBox(toDoubleList(blockJson.getJSONArray("bbox")));
|
||||
block.setText(extractTextFromMiddleBlock(blockJson));
|
||||
block.setImagePath(extractImagePathFromMiddleBlock(blockJson));
|
||||
block.getMetadata().put("raw", blockJson);
|
||||
result.getBlocks().add(block);
|
||||
}
|
||||
}
|
||||
|
||||
private void fillVisualsFromMiddlePage(ParseResult result,
|
||||
JSONArray blocks,
|
||||
Integer pageIndex,
|
||||
boolean table,
|
||||
Map<String, String> imageDataUrls,
|
||||
Map<String, byte[]> imageContents) {
|
||||
if (blocks == null) {
|
||||
return;
|
||||
}
|
||||
for (int index = 0; index < blocks.size(); index++) {
|
||||
JSONObject blockJson = blocks.getJSONObject(index);
|
||||
if (blockJson == null) {
|
||||
continue;
|
||||
}
|
||||
if (table) {
|
||||
DocumentTable documentTable = new DocumentTable();
|
||||
documentTable.setPageIndex(pageIndex);
|
||||
documentTable.setBoundingBox(toDoubleList(blockJson.getJSONArray("bbox")));
|
||||
documentTable.setCaptions(extractTextsByType(blockJson, "table_caption"));
|
||||
documentTable.setFootnotes(extractTextsByType(blockJson, "table_footnote"));
|
||||
documentTable.setImagePath(extractImagePathByType(blockJson, "table_body"));
|
||||
result.getTables().add(documentTable);
|
||||
} else {
|
||||
DocumentImage documentImage = new DocumentImage();
|
||||
documentImage.setPageIndex(pageIndex);
|
||||
documentImage.setBoundingBox(toDoubleList(blockJson.getJSONArray("bbox")));
|
||||
documentImage.setCaptions(extractTextsByType(blockJson, "image_caption"));
|
||||
documentImage.setFootnotes(extractTextsByType(blockJson, "image_footnote"));
|
||||
documentImage.setSourcePath(extractImagePathByType(blockJson, "image_body"));
|
||||
documentImage.setName(baseName(documentImage.getSourcePath()));
|
||||
documentImage.setMimeType(detectMimeType(documentImage.getSourcePath()));
|
||||
documentImage.setDataUrl(matchDataUrl(documentImage.getSourcePath(), imageDataUrls));
|
||||
documentImage.setContent(matchBinaryContent(documentImage.getSourcePath(), imageContents));
|
||||
result.getImages().add(documentImage);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String resolveBackendFromResults(ParseResponse response) {
|
||||
if (response.getResults() == null || response.getResults().isEmpty()) {
|
||||
return properties.getDefaultBackend();
|
||||
}
|
||||
for (ParseResult result : response.getResults()) {
|
||||
Object middleBackend = result.getMetadata().get("middleBackend");
|
||||
if (middleBackend instanceof String && StringUtil.hasText((String) middleBackend)) {
|
||||
return (String) middleBackend;
|
||||
}
|
||||
}
|
||||
return properties.getDefaultBackend();
|
||||
}
|
||||
|
||||
private String resolveVersionFromResults(ParseResponse response) {
|
||||
if (response.getResults() == null || response.getResults().isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
for (ParseResult result : response.getResults()) {
|
||||
Object middleVersion = result.getMetadata().get("middleVersion");
|
||||
if (middleVersion instanceof String && StringUtil.hasText((String) middleVersion)) {
|
||||
return (String) middleVersion;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private Map<String, ZipArtifactBundle> unzip(byte[] zipBytes) {
|
||||
Map<String, ZipArtifactBundle> bundles = new LinkedHashMap<String, ZipArtifactBundle>();
|
||||
try (ZipInputStream zipInputStream = new ZipInputStream(new ByteArrayInputStream(zipBytes))) {
|
||||
ZipEntry entry;
|
||||
while ((entry = zipInputStream.getNextEntry()) != null) {
|
||||
if (entry.isDirectory()) {
|
||||
continue;
|
||||
}
|
||||
byte[] entryBytes = readBytes(zipInputStream);
|
||||
String entryName = entry.getName();
|
||||
String fileName = resolveFileName(entryName);
|
||||
ZipArtifactBundle bundle = bundles.get(fileName);
|
||||
if (bundle == null) {
|
||||
bundle = new ZipArtifactBundle();
|
||||
bundles.put(fileName, bundle);
|
||||
}
|
||||
if (entryName.contains("/images/")) {
|
||||
bundle.images.put(entryName, entryBytes);
|
||||
} else if (entryName.endsWith(".md")
|
||||
|| entryName.endsWith("_middle.json")
|
||||
|| entryName.endsWith("_content_list.json")
|
||||
|| entryName.endsWith("_content_list_v2.json")
|
||||
|| entryName.endsWith("_model.json")) {
|
||||
bundle.entriesBySuffix.put(entryName, entryBytes);
|
||||
} else {
|
||||
bundle.otherBinaryEntries.put(entryName, entryBytes);
|
||||
}
|
||||
}
|
||||
} catch (IOException exception) {
|
||||
throw new DocumentParseException("Failed to unzip MinerU result", exception);
|
||||
}
|
||||
return bundles;
|
||||
}
|
||||
|
||||
private byte[] readBytes(ZipInputStream zipInputStream) throws IOException {
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
byte[] buffer = new byte[8192];
|
||||
int length;
|
||||
while ((length = zipInputStream.read(buffer)) >= 0) {
|
||||
outputStream.write(buffer, 0, length);
|
||||
}
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private String resolveFileName(String entryName) {
|
||||
String[] segments = entryName.split("/");
|
||||
if (segments.length > 0 && StringUtil.hasText(segments[0])) {
|
||||
return segments[0];
|
||||
}
|
||||
String fileName = baseName(entryName);
|
||||
int dotIndex = fileName.indexOf('.');
|
||||
return dotIndex > 0 ? fileName.substring(0, dotIndex) : fileName;
|
||||
}
|
||||
|
||||
private String firstText(Map<String, byte[]> entries, String suffix) {
|
||||
for (Map.Entry<String, byte[]> entry : entries.entrySet()) {
|
||||
if (entry.getKey().endsWith(suffix)) {
|
||||
return new String(entry.getValue());
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private Object firstJsonValue(Map<String, byte[]> entries, String suffix) {
|
||||
String text = firstText(entries, suffix);
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return JSON.parse(text);
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
|
||||
}
|
||||
}
|
||||
|
||||
private JSONObject asObject(Object value) {
|
||||
if (value instanceof JSONObject) {
|
||||
return (JSONObject) value;
|
||||
}
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
if (value instanceof JSONArray) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseObject(JSON.toJSONString(value));
|
||||
}
|
||||
|
||||
private JSONArray asArray(Object value) {
|
||||
if (value instanceof JSONArray) {
|
||||
return (JSONArray) value;
|
||||
}
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseArray(JSON.toJSONString(value));
|
||||
}
|
||||
|
||||
private List<String> toStringList(JSONArray jsonArray) {
|
||||
if (jsonArray == null || jsonArray.isEmpty()) {
|
||||
return new ArrayList<String>();
|
||||
}
|
||||
List<String> values = new ArrayList<String>();
|
||||
for (int index = 0; index < jsonArray.size(); index++) {
|
||||
values.add(jsonArray.getString(index));
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
private Map<String, String> toStringMap(JSONObject jsonObject) {
|
||||
if (jsonObject == null || jsonObject.isEmpty()) {
|
||||
return new LinkedHashMap<String, String>();
|
||||
}
|
||||
Map<String, String> values = new LinkedHashMap<String, String>();
|
||||
for (String key : jsonObject.keySet()) {
|
||||
values.put(key, jsonObject.getString(key));
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
private Map<String, byte[]> toBinaryMap(Map<String, String> dataUrls) {
|
||||
Map<String, byte[]> values = new LinkedHashMap<String, byte[]>();
|
||||
if (dataUrls == null || dataUrls.isEmpty()) {
|
||||
return values;
|
||||
}
|
||||
for (Map.Entry<String, String> entry : dataUrls.entrySet()) {
|
||||
byte[] content = decodeDataUrl(entry.getValue());
|
||||
if (content != null) {
|
||||
values.put(entry.getKey(), content);
|
||||
}
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
private List<Double> toDoubleList(JSONArray jsonArray) {
|
||||
if (jsonArray == null || jsonArray.isEmpty()) {
|
||||
return new ArrayList<Double>();
|
||||
}
|
||||
List<Double> values = new ArrayList<Double>();
|
||||
for (int index = 0; index < jsonArray.size(); index++) {
|
||||
values.add(jsonArray.getDouble(index));
|
||||
}
|
||||
return values;
|
||||
}
|
||||
|
||||
private List<String> extractCaptions(JSONObject item) {
|
||||
List<String> texts = new ArrayList<String>();
|
||||
texts.addAll(toStringList(item.getJSONArray("image_caption")));
|
||||
texts.addAll(toStringList(item.getJSONArray("table_caption")));
|
||||
return texts;
|
||||
}
|
||||
|
||||
private List<String> extractFootnotes(JSONObject item) {
|
||||
List<String> texts = new ArrayList<String>();
|
||||
texts.addAll(toStringList(item.getJSONArray("image_footnote")));
|
||||
texts.addAll(toStringList(item.getJSONArray("table_footnote")));
|
||||
return texts;
|
||||
}
|
||||
|
||||
private boolean isVisualType(String type) {
|
||||
return "image".equals(type) || "table".equals(type) || "chart".equals(type) || "seal".equals(type);
|
||||
}
|
||||
|
||||
private String extractBlockText(JSONObject item) {
|
||||
String type = item.getString("type");
|
||||
if ("text".equals(type) || "header".equals(type) || "footer".equals(type)
|
||||
|| "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type)
|
||||
|| "equation".equals(type) || "title".equals(type)) {
|
||||
String text = item.getString("text");
|
||||
return StringUtil.hasText(text) ? text : item.getString("content");
|
||||
}
|
||||
if ("list".equals(type)) {
|
||||
return joinList(toStringList(item.getJSONArray("list_items")));
|
||||
}
|
||||
if ("code".equals(type)) {
|
||||
return item.getString("code_body");
|
||||
}
|
||||
if ("image".equals(type)) {
|
||||
return joinList(toStringList(item.getJSONArray("image_caption")));
|
||||
}
|
||||
if ("table".equals(type)) {
|
||||
String tableCaption = joinList(toStringList(item.getJSONArray("table_caption")));
|
||||
return StringUtil.hasText(tableCaption) ? tableCaption : item.getString("content");
|
||||
}
|
||||
String text = item.getString("text");
|
||||
return StringUtil.hasText(text) ? text : item.getString("content");
|
||||
}
|
||||
|
||||
private String extractTextFromMiddleBlock(JSONObject blockJson) {
|
||||
List<String> texts = new ArrayList<String>();
|
||||
JSONArray blocks = blockJson.getJSONArray("blocks");
|
||||
if (blocks == null) {
|
||||
return null;
|
||||
}
|
||||
for (int blockIndex = 0; blockIndex < blocks.size(); blockIndex++) {
|
||||
JSONObject childBlock = blocks.getJSONObject(blockIndex);
|
||||
JSONArray lines = childBlock.getJSONArray("lines");
|
||||
if (lines == null) {
|
||||
continue;
|
||||
}
|
||||
for (int lineIndex = 0; lineIndex < lines.size(); lineIndex++) {
|
||||
JSONObject line = lines.getJSONObject(lineIndex);
|
||||
JSONArray spans = line.getJSONArray("spans");
|
||||
if (spans == null) {
|
||||
continue;
|
||||
}
|
||||
for (int spanIndex = 0; spanIndex < spans.size(); spanIndex++) {
|
||||
JSONObject span = spans.getJSONObject(spanIndex);
|
||||
if (span.containsKey("content")) {
|
||||
texts.add(span.getString("content"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return joinList(texts);
|
||||
}
|
||||
|
||||
private String extractImagePathFromMiddleBlock(JSONObject blockJson) {
|
||||
JSONArray blocks = blockJson.getJSONArray("blocks");
|
||||
if (blocks == null) {
|
||||
return null;
|
||||
}
|
||||
for (int blockIndex = 0; blockIndex < blocks.size(); blockIndex++) {
|
||||
JSONObject childBlock = blocks.getJSONObject(blockIndex);
|
||||
JSONArray lines = childBlock.getJSONArray("lines");
|
||||
if (lines == null) {
|
||||
continue;
|
||||
}
|
||||
for (int lineIndex = 0; lineIndex < lines.size(); lineIndex++) {
|
||||
JSONObject line = lines.getJSONObject(lineIndex);
|
||||
JSONArray spans = line.getJSONArray("spans");
|
||||
if (spans == null) {
|
||||
continue;
|
||||
}
|
||||
for (int spanIndex = 0; spanIndex < spans.size(); spanIndex++) {
|
||||
JSONObject span = spans.getJSONObject(spanIndex);
|
||||
if (span.containsKey("img_path")) {
|
||||
return span.getString("img_path");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<String> extractTextsByType(JSONObject visualBlock, String expectedType) {
|
||||
List<String> texts = new ArrayList<String>();
|
||||
JSONArray blocks = visualBlock.getJSONArray("blocks");
|
||||
if (blocks == null) {
|
||||
return texts;
|
||||
}
|
||||
for (int blockIndex = 0; blockIndex < blocks.size(); blockIndex++) {
|
||||
JSONObject childBlock = blocks.getJSONObject(blockIndex);
|
||||
if (!expectedType.equals(childBlock.getString("type"))) {
|
||||
continue;
|
||||
}
|
||||
JSONArray lines = childBlock.getJSONArray("lines");
|
||||
if (lines == null) {
|
||||
continue;
|
||||
}
|
||||
for (int lineIndex = 0; lineIndex < lines.size(); lineIndex++) {
|
||||
JSONObject line = lines.getJSONObject(lineIndex);
|
||||
JSONArray spans = line.getJSONArray("spans");
|
||||
if (spans == null) {
|
||||
continue;
|
||||
}
|
||||
for (int spanIndex = 0; spanIndex < spans.size(); spanIndex++) {
|
||||
JSONObject span = spans.getJSONObject(spanIndex);
|
||||
if (span.containsKey("content")) {
|
||||
texts.add(span.getString("content"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return texts;
|
||||
}
|
||||
|
||||
private String extractImagePathByType(JSONObject visualBlock, String expectedType) {
|
||||
JSONArray blocks = visualBlock.getJSONArray("blocks");
|
||||
if (blocks == null) {
|
||||
return null;
|
||||
}
|
||||
for (int blockIndex = 0; blockIndex < blocks.size(); blockIndex++) {
|
||||
JSONObject childBlock = blocks.getJSONObject(blockIndex);
|
||||
if (!expectedType.equals(childBlock.getString("type"))) {
|
||||
continue;
|
||||
}
|
||||
JSONArray lines = childBlock.getJSONArray("lines");
|
||||
if (lines == null) {
|
||||
continue;
|
||||
}
|
||||
for (int lineIndex = 0; lineIndex < lines.size(); lineIndex++) {
|
||||
JSONObject line = lines.getJSONObject(lineIndex);
|
||||
JSONArray spans = line.getJSONArray("spans");
|
||||
if (spans == null) {
|
||||
continue;
|
||||
}
|
||||
for (int spanIndex = 0; spanIndex < spans.size(); spanIndex++) {
|
||||
JSONObject span = spans.getJSONObject(spanIndex);
|
||||
if (span.containsKey("img_path")) {
|
||||
return span.getString("img_path");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String matchDataUrl(String imagePath, Map<String, String> imageDataUrls) {
|
||||
if (imageDataUrls == null || imageDataUrls.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
if (StringUtil.hasText(imagePath) && imageDataUrls.containsKey(imagePath)) {
|
||||
return imageDataUrls.get(imagePath);
|
||||
}
|
||||
String baseName = baseName(imagePath);
|
||||
if (!StringUtil.hasText(baseName)) {
|
||||
return null;
|
||||
}
|
||||
for (Map.Entry<String, String> entry : imageDataUrls.entrySet()) {
|
||||
if (baseName.equals(baseName(entry.getKey()))) {
|
||||
return entry.getValue();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private byte[] matchBinaryContent(String imagePath, Map<String, byte[]> imageContents) {
|
||||
if (imageContents == null || imageContents.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
if (StringUtil.hasText(imagePath) && imageContents.containsKey(imagePath)) {
|
||||
return imageContents.get(imagePath);
|
||||
}
|
||||
String currentBaseName = baseName(imagePath);
|
||||
if (!StringUtil.hasText(currentBaseName)) {
|
||||
return null;
|
||||
}
|
||||
for (Map.Entry<String, byte[]> entry : imageContents.entrySet()) {
|
||||
if (currentBaseName.equals(baseName(entry.getKey()))) {
|
||||
return entry.getValue();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String baseName(String path) {
|
||||
if (!StringUtil.hasText(path)) {
|
||||
return null;
|
||||
}
|
||||
int slashIndex = path.lastIndexOf('/');
|
||||
return slashIndex >= 0 ? path.substring(slashIndex + 1) : path;
|
||||
}
|
||||
|
||||
private String detectMimeType(String path) {
|
||||
if (!StringUtil.hasText(path)) {
|
||||
return null;
|
||||
}
|
||||
String mimeType = URLConnection.guessContentTypeFromName(path);
|
||||
return StringUtil.hasText(mimeType) ? mimeType : "application/octet-stream";
|
||||
}
|
||||
|
||||
private String toDataUrl(String path, byte[] content) {
|
||||
return "data:" + detectMimeType(path) + ";base64," + Base64.getEncoder().encodeToString(content);
|
||||
}
|
||||
|
||||
private byte[] decodeDataUrl(String dataUrl) {
|
||||
if (!StringUtil.hasText(dataUrl)) {
|
||||
return null;
|
||||
}
|
||||
int commaIndex = dataUrl.indexOf(',');
|
||||
if (commaIndex < 0 || commaIndex == dataUrl.length() - 1) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return Base64.getDecoder().decode(dataUrl.substring(commaIndex + 1));
|
||||
} catch (IllegalArgumentException exception) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private String joinList(List<String> values) {
|
||||
if (values == null || values.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int index = 0; index < values.size(); index++) {
|
||||
if (index > 0) {
|
||||
builder.append('\n');
|
||||
}
|
||||
builder.append(values.get(index));
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private boolean boolOrDefault(Boolean value, Boolean defaultValue) {
|
||||
return value == null ? isTrue(defaultValue) : value;
|
||||
}
|
||||
|
||||
private boolean isTrue(Boolean value) {
|
||||
return value != null && value;
|
||||
}
|
||||
|
||||
private int intOrDefault(Integer value, int defaultValue) {
|
||||
return value == null ? defaultValue : value;
|
||||
}
|
||||
|
||||
private static class ZipArtifactBundle {
|
||||
private final Map<String, byte[]> entriesBySuffix = new LinkedHashMap<String, byte[]>();
|
||||
private final Map<String, byte[]> images = new LinkedHashMap<String, byte[]>();
|
||||
private final Map<String, byte[]> otherBinaryEntries = new LinkedHashMap<String, byte[]>();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,116 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Configuration for MinerU document parsing.
 * <p>
 * Holds the service endpoint, the timeout and polling settings (all in milliseconds, as the
 * {@code Ms} suffix of the property names indicates) and the defaults applied to parse requests.
 *
 * @author Codex
 * @since 2026-04-16
 */
public class MineruProperties {

    // ---- endpoint and timing ----

    /** Base URL of the MinerU service; unset by default. */
    private String baseUrl;

    /** Connect timeout in milliseconds. */
    private Integer connectTimeoutMs = 3000;

    /** Read timeout in milliseconds. */
    private Integer readTimeoutMs = 600000;

    /** Write timeout in milliseconds. */
    private Integer writeTimeoutMs = 600000;

    /** Poll interval in milliseconds. */
    private Integer pollIntervalMs = 1000;

    /** Result timeout in milliseconds. */
    private Integer resultTimeoutMs = 1800000;

    // ---- request defaults ----

    /** Default backend name. */
    private String defaultBackend = "vlm-http-client";

    /** Default parse method. */
    private String defaultParseMethod = "auto";

    /** Default language list; never {@code null}. */
    private List<String> defaultLangList = newDefaultLangList();

    /** Default value of the formula switch. */
    private Boolean defaultFormulaEnable = true;

    /** Default value of the table switch. */
    private Boolean defaultTableEnable = true;

    /**
     * Creates a fresh, mutable copy of the built-in language list.
     *
     * @return a new list containing only {@code "ch"}
     */
    private static List<String> newDefaultLangList() {
        return new ArrayList<>(Arrays.asList("ch"));
    }

    /** @return base URL of the MinerU service */
    public String getBaseUrl() {
        return this.baseUrl;
    }

    /** @param baseUrl base URL of the MinerU service */
    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /** @return connect timeout in milliseconds */
    public Integer getConnectTimeoutMs() {
        return this.connectTimeoutMs;
    }

    /** @param connectTimeoutMs connect timeout in milliseconds */
    public void setConnectTimeoutMs(Integer connectTimeoutMs) {
        this.connectTimeoutMs = connectTimeoutMs;
    }

    /** @return read timeout in milliseconds */
    public Integer getReadTimeoutMs() {
        return this.readTimeoutMs;
    }

    /** @param readTimeoutMs read timeout in milliseconds */
    public void setReadTimeoutMs(Integer readTimeoutMs) {
        this.readTimeoutMs = readTimeoutMs;
    }

    /** @return write timeout in milliseconds */
    public Integer getWriteTimeoutMs() {
        return this.writeTimeoutMs;
    }

    /** @param writeTimeoutMs write timeout in milliseconds */
    public void setWriteTimeoutMs(Integer writeTimeoutMs) {
        this.writeTimeoutMs = writeTimeoutMs;
    }

    /** @return poll interval in milliseconds */
    public Integer getPollIntervalMs() {
        return this.pollIntervalMs;
    }

    /** @param pollIntervalMs poll interval in milliseconds */
    public void setPollIntervalMs(Integer pollIntervalMs) {
        this.pollIntervalMs = pollIntervalMs;
    }

    /** @return result timeout in milliseconds */
    public Integer getResultTimeoutMs() {
        return this.resultTimeoutMs;
    }

    /** @param resultTimeoutMs result timeout in milliseconds */
    public void setResultTimeoutMs(Integer resultTimeoutMs) {
        this.resultTimeoutMs = resultTimeoutMs;
    }

    /** @return default backend name */
    public String getDefaultBackend() {
        return this.defaultBackend;
    }

    /** @param defaultBackend default backend name */
    public void setDefaultBackend(String defaultBackend) {
        this.defaultBackend = defaultBackend;
    }

    /** @return default parse method */
    public String getDefaultParseMethod() {
        return this.defaultParseMethod;
    }

    /** @param defaultParseMethod default parse method */
    public void setDefaultParseMethod(String defaultParseMethod) {
        this.defaultParseMethod = defaultParseMethod;
    }

    /** @return default language list; never {@code null} */
    public List<String> getDefaultLangList() {
        return this.defaultLangList;
    }

    /**
     * Sets the default language list.
     *
     * @param defaultLangList new list, stored as given; {@code null} restores the built-in
     *                        list containing only {@code "ch"}
     */
    public void setDefaultLangList(List<String> defaultLangList) {
        if (defaultLangList == null) {
            this.defaultLangList = newDefaultLangList();
        } else {
            this.defaultLangList = defaultLangList;
        }
    }

    /** @return default value of the formula switch */
    public Boolean getDefaultFormulaEnable() {
        return this.defaultFormulaEnable;
    }

    /** @param defaultFormulaEnable default value of the formula switch */
    public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) {
        this.defaultFormulaEnable = defaultFormulaEnable;
    }

    /** @return default value of the table switch */
    public Boolean getDefaultTableEnable() {
        return this.defaultTableEnable;
    }

    /** @param defaultTableEnable default value of the table switch */
    public void setDefaultTableEnable(Boolean defaultTableEnable) {
        this.defaultTableEnable = defaultTableEnable;
    }
}
|
||||
@@ -0,0 +1,43 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* MinerU 结果载荷。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruResultPayload {
|
||||
|
||||
private String backend;
|
||||
private String version;
|
||||
private Map<String, JSONObject> results = new LinkedHashMap<String, JSONObject>();
|
||||
|
||||
public String getBackend() {
|
||||
return backend;
|
||||
}
|
||||
|
||||
public void setBackend(String backend) {
|
||||
this.backend = backend;
|
||||
}
|
||||
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public void setVersion(String version) {
|
||||
this.version = version;
|
||||
}
|
||||
|
||||
public Map<String, JSONObject> getResults() {
|
||||
return results;
|
||||
}
|
||||
|
||||
public void setResults(Map<String, JSONObject> results) {
|
||||
this.results = results == null ? new LinkedHashMap<String, JSONObject>() : results;
|
||||
}
|
||||
}
|
||||
@@ -1,15 +1,15 @@
|
||||
package com.easyagents.document.core.model;
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 异步任务状态。
|
||||
* MinerU 原始任务状态。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class ParseTaskStatus {
|
||||
public class MineruTaskStatus {
|
||||
|
||||
private String taskId;
|
||||
private String status;
|
||||
@@ -22,6 +22,8 @@ public class ParseTaskStatus {
|
||||
private String statusUrl;
|
||||
private String resultUrl;
|
||||
private Integer queuedAhead;
|
||||
private String version;
|
||||
private String message;
|
||||
|
||||
public String getTaskId() {
|
||||
return taskId;
|
||||
@@ -110,4 +112,20 @@ public class ParseTaskStatus {
|
||||
public void setQueuedAhead(Integer queuedAhead) {
|
||||
this.queuedAhead = queuedAhead;
|
||||
}
|
||||
|
||||
public String getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public void setVersion(String version) {
|
||||
this.version = version;
|
||||
}
|
||||
|
||||
public String getMessage() {
|
||||
return message;
|
||||
}
|
||||
|
||||
public void setMessage(String message) {
|
||||
this.message = message;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
package com.easyagents.document.core.support;
|
||||
|
||||
import com.easyagents.document.core.DocumentParseService;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
|
||||
import com.easyagents.document.core.async.DocumentAsyncTaskUpdater;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 带统一异步任务能力的文档解析服务基类。
|
||||
* 支持 ppt 和 excel,pdf 和 word 文档使用 mineru 自带异步能力
|
||||
*
|
||||
* @param <R> 请求类型
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public abstract class AbstractAsyncDocumentParseService<R extends ParseRequest> implements DocumentParseService<R> {
|
||||
|
||||
private final DocumentAsyncTaskManager taskManager;
|
||||
|
||||
/**
|
||||
* 创建服务基类。
|
||||
*
|
||||
* @param taskManager 异步任务管理器
|
||||
*/
|
||||
protected AbstractAsyncDocumentParseService(DocumentAsyncTaskManager taskManager) {
|
||||
if (taskManager == null) {
|
||||
throw new IllegalArgumentException("DocumentAsyncTaskManager must not be null");
|
||||
}
|
||||
this.taskManager = taskManager;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
return doParse(normalizeRequest(request), null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus submit(ParseRequest request) {
|
||||
final R normalizedRequest = normalizeRequest(request);
|
||||
return taskManager.submit(
|
||||
normalizedRequest.getBackend(),
|
||||
collectFileNames(normalizedRequest),
|
||||
updater -> doParse(normalizedRequest, updater)
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
return taskManager.queryTask(taskId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
return taskManager.queryResult(taskId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||
return taskManager.queryTaskInfo(taskId);
|
||||
}
|
||||
|
||||
/**
|
||||
* 归一化请求。
|
||||
*
|
||||
* @param request 原始请求
|
||||
* @return 归一化结果
|
||||
*/
|
||||
protected abstract R normalizeRequest(ParseRequest request);
|
||||
|
||||
/**
|
||||
* 执行解析。
|
||||
*
|
||||
* @param request 归一化请求
|
||||
* @param updater 进度更新器;同步解析时可能为 {@code null}
|
||||
* @return 解析结果
|
||||
*/
|
||||
protected abstract ParseResponse doParse(R request, DocumentAsyncTaskUpdater updater);
|
||||
|
||||
private List<String> collectFileNames(ParseRequest request) {
|
||||
List<String> fileNames = new ArrayList<String>();
|
||||
if (request == null || request.getFiles() == null) {
|
||||
return fileNames;
|
||||
}
|
||||
for (ParseFile file : request.getFiles()) {
|
||||
if (file != null && file.getFileName() != null) {
|
||||
fileNames.add(file.getFileName());
|
||||
}
|
||||
}
|
||||
return fileNames;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
package com.easyagents.document.core.async;
|
||||
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.concurrent.Executor;
|
||||
|
||||
/**
|
||||
* 异步任务管理器测试。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class DocumentAsyncTaskManagerTest {
|
||||
|
||||
@Test
|
||||
public void shouldTrackTaskLifecycleAndResult() {
|
||||
Executor directExecutor = new Executor() {
|
||||
@Override
|
||||
public void execute(Runnable command) {
|
||||
command.run();
|
||||
}
|
||||
};
|
||||
DocumentAsyncTaskManager manager = new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor);
|
||||
|
||||
ParseTaskStatus status = manager.submit("mineru", Collections.singletonList("demo.pptx"), updater -> {
|
||||
updater.update("ocr", 50, 1, 2, "处理中");
|
||||
ParseResponse response = new ParseResponse();
|
||||
ParseResult result = new ParseResult();
|
||||
result.setFileName("demo.pptx");
|
||||
result.setMarkdown("# Slide 1");
|
||||
response.setResults(Collections.singletonList(result));
|
||||
return response;
|
||||
});
|
||||
|
||||
ParseTaskInfo taskInfo = manager.queryTaskInfo(status.getTaskId());
|
||||
|
||||
Assert.assertEquals("completed", taskInfo.getStatus());
|
||||
Assert.assertEquals(Integer.valueOf(100), taskInfo.getProgressPercent());
|
||||
Assert.assertEquals("completed", taskInfo.getCurrentStage());
|
||||
Assert.assertNotNull(taskInfo.getResult());
|
||||
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,210 @@
|
||||
package com.easyagents.document.core.mineru;
|
||||
|
||||
import com.alibaba.fastjson2.JSONObject;
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import okhttp3.Request;
|
||||
import okio.Buffer;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
/**
|
||||
* MinerU 通用文档解析服务测试。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-16
|
||||
*/
|
||||
public class MineruDocumentParseServiceTest {
|
||||
|
||||
@Test
|
||||
public void shouldForceAsyncResultArtifacts() {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper);
|
||||
|
||||
ParseRequest request = buildRequest();
|
||||
request.setReturnMarkdown(false);
|
||||
request.setReturnMiddleJson(false);
|
||||
request.setReturnContentList(false);
|
||||
request.setReturnModelOutput(false);
|
||||
request.setReturnImages(false);
|
||||
|
||||
ParseTaskStatus status = service.submit(request);
|
||||
|
||||
Assert.assertEquals("task-1", status.getTaskId());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnMarkdown());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnMiddleJson());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnContentList());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnModelOutput());
|
||||
Assert.assertTrue(client.lastSubmitRequest.getReturnImages());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldUseTaskMetadataWhenQueryingAsyncZipResult() {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper);
|
||||
|
||||
ParseResponse response = service.queryResult("task-1");
|
||||
|
||||
Assert.assertEquals("vlm-http-client", response.getBackend());
|
||||
Assert.assertEquals("3.0.9", response.getVersion());
|
||||
Assert.assertEquals(1, response.getResults().size());
|
||||
Assert.assertEquals("demo", response.getResults().get(0).getFileName());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldReturnCompletedResultInTaskInfo() {
|
||||
RecordingClient client = new RecordingClient(defaultProperties());
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper);
|
||||
|
||||
ParseTaskInfo taskInfo = service.queryTaskInfo("task-1");
|
||||
|
||||
Assert.assertEquals("completed", taskInfo.getStatus());
|
||||
Assert.assertNotNull(taskInfo.getResult());
|
||||
Assert.assertEquals(1, taskInfo.getResult().getResults().size());
|
||||
Assert.assertEquals(1, client.queryResultZipCount);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldSendRepeatedLangListFields() {
|
||||
InspectingMultipartClient client = new InspectingMultipartClient(defaultProperties());
|
||||
ParseRequest request = buildRequest();
|
||||
request.setLanguages(java.util.Arrays.asList("zh", "en"));
|
||||
|
||||
client.parse(request);
|
||||
|
||||
Assert.assertEquals(2, countOccurrences(client.lastMultipartBody, "name=\"lang_list\""));
|
||||
Assert.assertTrue(client.lastMultipartBody.contains("\r\nzh\r\n"));
|
||||
Assert.assertTrue(client.lastMultipartBody.contains("\r\nen\r\n"));
|
||||
}
|
||||
|
||||
private ParseRequest buildRequest() {
|
||||
ParseRequest request = new ParseRequest();
|
||||
request.addFile(ParseFile.of("demo.pptx", "ppt".getBytes(StandardCharsets.UTF_8)));
|
||||
return request;
|
||||
}
|
||||
|
||||
private MineruProperties defaultProperties() {
|
||||
MineruProperties properties = new MineruProperties();
|
||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||
properties.setResultTimeoutMs(50);
|
||||
properties.setPollIntervalMs(1);
|
||||
return properties;
|
||||
}
|
||||
|
||||
private int countOccurrences(String source, String token) {
|
||||
int count = 0;
|
||||
int index = 0;
|
||||
while (source != null && token != null && !token.isEmpty() && (index = source.indexOf(token, index)) >= 0) {
|
||||
count++;
|
||||
index += token.length();
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private static class RecordingClient extends MineruClient {
|
||||
|
||||
private ParseRequest lastSubmitRequest;
|
||||
private int queryResultZipCount;
|
||||
|
||||
private RecordingClient(MineruProperties properties) {
|
||||
super(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
@Override
|
||||
public MineruTaskStatus submit(ParseRequest request) {
|
||||
this.lastSubmitRequest = request;
|
||||
MineruTaskStatus taskStatus = new MineruTaskStatus();
|
||||
taskStatus.setTaskId("task-1");
|
||||
taskStatus.setStatus("pending");
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MineruTaskStatus queryTask(String taskId) {
|
||||
MineruTaskStatus taskStatus = new MineruTaskStatus();
|
||||
taskStatus.setTaskId(taskId);
|
||||
taskStatus.setStatus("completed");
|
||||
taskStatus.setBackend("vlm-http-client");
|
||||
taskStatus.setVersion("3.0.9");
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] queryResultZip(String taskId) {
|
||||
queryResultZipCount++;
|
||||
try {
|
||||
return buildZipResult();
|
||||
} catch (IOException exception) {
|
||||
throw new IllegalStateException("Failed to build test ZIP", exception);
|
||||
}
|
||||
}
|
||||
|
||||
private static byte[] buildZipResult() throws IOException {
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
try (ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream)) {
|
||||
addEntry(zipOutputStream, "demo/vlm/demo.md", "# title");
|
||||
addEntry(zipOutputStream, "demo/vlm/demo_middle.json", middleJson().toJSONString());
|
||||
addEntry(zipOutputStream, "demo/vlm/demo_content_list.json", contentList().toJSONString());
|
||||
}
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private static void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException {
|
||||
zipOutputStream.putNextEntry(new ZipEntry(name));
|
||||
zipOutputStream.write(content.getBytes(StandardCharsets.UTF_8));
|
||||
zipOutputStream.closeEntry();
|
||||
}
|
||||
|
||||
private static JSONObject middleJson() {
|
||||
JSONObject middleJson = new JSONObject();
|
||||
middleJson.put("_backend", "vlm");
|
||||
middleJson.put("_version_name", "3.0.9");
|
||||
middleJson.put("pdf_info", new com.alibaba.fastjson2.JSONArray());
|
||||
return middleJson;
|
||||
}
|
||||
|
||||
private static com.alibaba.fastjson2.JSONArray contentList() {
|
||||
com.alibaba.fastjson2.JSONArray contentList = new com.alibaba.fastjson2.JSONArray();
|
||||
JSONObject text = new JSONObject();
|
||||
text.put("type", "text");
|
||||
text.put("text", "title");
|
||||
text.put("page_idx", 0);
|
||||
text.put("bbox", new com.alibaba.fastjson2.JSONArray());
|
||||
contentList.add(text);
|
||||
return contentList;
|
||||
}
|
||||
}
|
||||
|
||||
private static class InspectingMultipartClient extends MineruClient {
|
||||
|
||||
private String lastMultipartBody;
|
||||
|
||||
private InspectingMultipartClient(MineruProperties properties) {
|
||||
super(properties, new MineruMapper(properties));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected JSONObject executeJsonRequest(String path, Request request) {
|
||||
try {
|
||||
Buffer buffer = new Buffer();
|
||||
request.body().writeTo(buffer);
|
||||
this.lastMultipartBody = buffer.readUtf8();
|
||||
} catch (IOException exception) {
|
||||
throw new IllegalStateException("Failed to inspect multipart body", exception);
|
||||
}
|
||||
return new JSONObject();
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user