From b66876d0fdb4baa6dd6e19a2a3eae620a737fd82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=AD=90=E9=BB=98?= <925456043@qq.com> Date: Thu, 16 Apr 2026 21:51:16 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=89=A9=E5=B1=95=20Office=20=E6=96=87?= =?UTF-8?q?=E6=A1=A3=E8=A7=A3=E6=9E=90=E8=83=BD=E5=8A=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 重构 document-core 与 MinerU 公共层,补齐 Office 异步任务基础设施 - 新增 PPTX/XLSX 解析模块与 starter 自动装配 - 补充 README 与相关测试覆盖 --- README.md | 2 +- easy-agents-bom/pom.xml | 10 + .../easy-agents-document-core/pom.xml | 21 + .../document/core/DocumentParseService.java | 10 +- .../core/async/DocumentAsyncTaskManager.java | 173 +++++ .../core/async/DocumentAsyncTaskRecord.java | 101 +++ .../async/DocumentAsyncTaskRepository.java | 25 + .../core/async/DocumentAsyncTaskRunner.java | 21 + .../core/async/DocumentAsyncTaskUpdater.java | 21 + .../InMemoryDocumentAsyncTaskRepository.java | 28 + .../core/{model => entity}/DocumentBlock.java | 2 +- .../core/{model => entity}/DocumentImage.java | 11 +- .../core/{model => entity}/DocumentPage.java | 2 +- .../core/{model => entity}/DocumentTable.java | 2 +- .../{model => entity}/ParseArtifacts.java | 2 +- .../core/{model => entity}/ParseFile.java | 2 +- .../core/{model => entity}/ParseRequest.java | 66 +- .../core/{model => entity}/ParseResponse.java | 2 +- .../core/{model => entity}/ParseResult.java | 2 +- .../core/{model => entity}/ParseTaskInfo.java | 7 +- .../{model => entity}/ParseTaskStatus.java | 47 +- .../document/core/entity/PdfParseRequest.java | 79 +++ .../core/entity/PptxParseRequest.java | 79 +++ .../core/entity/XlsxParseRequest.java | 82 +++ .../document/core/mineru/MineruClient.java} | 24 +- .../mineru/MineruDocumentParseService.java | 218 ++++++ .../document/core}/mineru/MineruMapper.java | 127 +++- .../core}/mineru/MineruProperties.java | 6 +- .../core}/mineru/MineruResultPayload.java | 4 +- .../core}/mineru/MineruTaskStatus.java | 4 +- 
.../AbstractAsyncDocumentParseService.java | 98 +++ .../async/DocumentAsyncTaskManagerTest.java | 49 ++ .../MineruDocumentParseServiceTest.java | 210 ++++++ .../document/pdf/PdfDocumentParseService.java | 3 +- .../mineru/MineruPdfDocumentParseService.java | 188 +----- .../document/pdf/mineru/MineruMapperTest.java | 11 +- .../MineruPdfDocumentParseServiceTest.java | 19 +- .../easy-agents-document-pptx/pom.xml | 44 ++ .../pptx/PptxDocumentParseService.java | 13 + .../document/pptx/PptxDocumentProvider.java | 17 + .../MineruPptxDocumentParseService.java | 408 ++++++++++++ .../pptx/model/PptxParseArtifact.java | 23 + .../pptx/model/PptxSlideArtifact.java | 86 +++ .../MineruPptxDocumentParseServiceTest.java | 170 +++++ .../easy-agents-document-xlsx/pom.xml | 44 ++ .../xlsx/XlsxDocumentParseService.java | 13 + .../document/xlsx/XlsxDocumentProvider.java | 17 + .../MineruXlsxDocumentParseService.java | 625 ++++++++++++++++++ .../document/xlsx/model/XlsxCellArtifact.java | 59 ++ .../xlsx/model/XlsxCellImageArtifact.java | 101 +++ .../xlsx/model/XlsxParseArtifact.java | 59 ++ .../document/xlsx/model/XlsxRowArtifact.java | 32 + .../xlsx/model/XlsxSheetArtifact.java | 68 ++ .../xlsx/model/XlsxSheetImagesArtifact.java | 50 ++ .../MineruXlsxDocumentParseServiceTest.java | 333 ++++++++++ easy-agents-document/pom.xml | 2 + easy-agents-spring-boot-starter/pom.xml | 10 + .../CommonMineruDocumentProperties.java | 119 ++++ .../mineru/MineruPdfAutoConfiguration.java | 37 +- .../pptx/MineruPptxAutoConfiguration.java | 61 ++ .../document/pptx/PptxDocumentProperties.java | 32 + .../xlsx/MineruXlsxAutoConfiguration.java | 61 ++ .../document/xlsx/XlsxDocumentProperties.java | 32 + ...ot.autoconfigure.AutoConfiguration.imports | 2 + ...arterConditionalAutoConfigurationTest.java | 23 +- pom.xml | 12 + 66 files changed, 4015 insertions(+), 296 deletions(-) create mode 100644 
easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskManager.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRecord.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRepository.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRunner.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskUpdater.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/InMemoryDocumentAsyncTaskRepository.java rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/DocumentBlock.java (97%) rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/DocumentImage.java (89%) rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/DocumentPage.java (95%) rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/DocumentTable.java (97%) rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/ParseArtifacts.java (97%) rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/ParseFile.java (97%) rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/ParseRequest.java (67%) rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/ParseResponse.java (94%) rename 
easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/ParseResult.java (98%) rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/ParseTaskInfo.java (82%) rename easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/{model => entity}/ParseTaskStatus.java (66%) create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/PdfParseRequest.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/PptxParseRequest.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/XlsxParseRequest.java rename easy-agents-document/{easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfClient.java => easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruClient.java} (90%) create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruDocumentParseService.java rename easy-agents-document/{easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf => easy-agents-document-core/src/main/java/com/easyagents/document/core}/mineru/MineruMapper.java (87%) rename easy-agents-document/{easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf => easy-agents-document-core/src/main/java/com/easyagents/document/core}/mineru/MineruProperties.java (96%) rename easy-agents-document/{easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf => easy-agents-document-core/src/main/java/com/easyagents/document/core}/mineru/MineruResultPayload.java (92%) rename easy-agents-document/{easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf => 
easy-agents-document-core/src/main/java/com/easyagents/document/core}/mineru/MineruTaskStatus.java (97%) create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/support/AbstractAsyncDocumentParseService.java create mode 100644 easy-agents-document/easy-agents-document-core/src/test/java/com/easyagents/document/core/async/DocumentAsyncTaskManagerTest.java create mode 100644 easy-agents-document/easy-agents-document-core/src/test/java/com/easyagents/document/core/mineru/MineruDocumentParseServiceTest.java create mode 100644 easy-agents-document/easy-agents-document-pptx/pom.xml create mode 100644 easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/PptxDocumentParseService.java create mode 100644 easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/PptxDocumentProvider.java create mode 100644 easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/mineru/MineruPptxDocumentParseService.java create mode 100644 easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/model/PptxParseArtifact.java create mode 100644 easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/model/PptxSlideArtifact.java create mode 100644 easy-agents-document/easy-agents-document-pptx/src/test/java/com/easyagents/document/pptx/mineru/MineruPptxDocumentParseServiceTest.java create mode 100644 easy-agents-document/easy-agents-document-xlsx/pom.xml create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/XlsxDocumentParseService.java create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/XlsxDocumentProvider.java create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/mineru/MineruXlsxDocumentParseService.java 
create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxCellArtifact.java create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxCellImageArtifact.java create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxParseArtifact.java create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxRowArtifact.java create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxSheetArtifact.java create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxSheetImagesArtifact.java create mode 100644 easy-agents-document/easy-agents-document-xlsx/src/test/java/com/easyagents/document/xlsx/mineru/MineruXlsxDocumentParseServiceTest.java create mode 100644 easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/mineru/CommonMineruDocumentProperties.java create mode 100644 easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pptx/MineruPptxAutoConfiguration.java create mode 100644 easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pptx/PptxDocumentProperties.java create mode 100644 easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/xlsx/MineruXlsxAutoConfiguration.java create mode 100644 easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/xlsx/XlsxDocumentProperties.java diff --git a/README.md b/README.md index 4605b14..020527c 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Easy-Agents 是一个轻量、可扩展的 Java AI 应用开发框架,覆盖 - `easy-agents-bom`:依赖版本管理(BOM)。 - `easy-agents-core`:核心抽象与基础能力。 -- `easy-agents-document`:统一文档解析能力域,当前提供 PDF 解析抽象与 MinerU provider。 +- 
`easy-agents-document`:统一文档解析能力域,当前提供 PDF、PPTX、XLSX 解析抽象与 MinerU 复用能力。 - `easy-agents-chat`:对话模型接入实现集合。 - `easy-agents-embedding`:向量化模型实现集合。 - `easy-agents-rerank`:重排模型实现集合。 diff --git a/easy-agents-bom/pom.xml b/easy-agents-bom/pom.xml index d02204d..b3dedaf 100644 --- a/easy-agents-bom/pom.xml +++ b/easy-agents-bom/pom.xml @@ -66,6 +66,16 @@ easy-agents-document-pdf + + com.easyagents + easy-agents-document-pptx + + + + com.easyagents + easy-agents-document-xlsx + + com.easyagents easy-agents-rag-core diff --git a/easy-agents-document/easy-agents-document-core/pom.xml b/easy-agents-document/easy-agents-document-core/pom.xml index d2dfd2c..19af638 100644 --- a/easy-agents-document/easy-agents-document-core/pom.xml +++ b/easy-agents-document/easy-agents-document-core/pom.xml @@ -24,5 +24,26 @@ com.easyagents easy-agents-core + + + com.alibaba.fastjson2 + fastjson2 + + + + com.squareup.okhttp3 + okhttp + + + + org.slf4j + slf4j-api + + + + junit + junit + test + diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/DocumentParseService.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/DocumentParseService.java index e5a736a..1810356 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/DocumentParseService.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/DocumentParseService.java @@ -1,9 +1,9 @@ package com.easyagents.document.core; -import com.easyagents.document.core.model.ParseRequest; -import com.easyagents.document.core.model.ParseResponse; -import com.easyagents.document.core.model.ParseTaskInfo; -import com.easyagents.document.core.model.ParseTaskStatus; +import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseTaskInfo; +import 
com.easyagents.document.core.entity.ParseTaskStatus; /** * 统一文档解析服务抽象。 @@ -11,7 +11,7 @@ import com.easyagents.document.core.model.ParseTaskStatus; * @author Codex * @since 2026-04-14 */ -public interface DocumentParseService { +public interface DocumentParseService { /** * 同步解析文档并直接返回结果。 diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskManager.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskManager.java new file mode 100644 index 0000000..5536b02 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskManager.java @@ -0,0 +1,173 @@ +package com.easyagents.document.core.async; + +import com.easyagents.core.util.StringUtil; +import com.easyagents.document.core.exception.DocumentParseException; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; + +import java.time.Instant; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.Executor; + +/** + * 文档异步任务管理器。 + * + * @author Codex + * @since 2026-04-16 + */ +public class DocumentAsyncTaskManager { + + private final DocumentAsyncTaskRepository repository; + private final Executor executor; + + /** + * 创建任务管理器。 + * + * @param repository 任务仓库 + * @param executor 执行器 + */ + public DocumentAsyncTaskManager(DocumentAsyncTaskRepository repository, Executor executor) { + if (repository == null) { + throw new IllegalArgumentException("DocumentAsyncTaskRepository must not be null"); + } + if (executor == null) { + throw new IllegalArgumentException("Executor must not be null"); + } + this.repository = repository; + this.executor = executor; + } + + /** + * 提交异步任务。 + * + * @param backend 后端标识 + * @param fileNames 文件名列表 + * @param 
runner 任务执行器 + * @return 初始任务状态 + */ + public ParseTaskStatus submit(String backend, List fileNames, final DocumentAsyncTaskRunner runner) { + final String taskId = UUID.randomUUID().toString(); + final ParseTaskStatus status = new ParseTaskStatus(); + status.setTaskId(taskId); + status.setStatus("queued"); + status.setBackend(backend); + status.setFileNames(fileNames == null ? new ArrayList() : new ArrayList(fileNames)); + status.setCreatedAt(Instant.now().toString()); + status.setCurrentStage("queued"); + status.setProgressPercent(0); + status.setProcessedItems(0); + status.setTotalItems(fileNames == null ? 0 : fileNames.size()); + status.setStatusMessage("任务已进入队列"); + + final DocumentAsyncTaskRecord record = new DocumentAsyncTaskRecord(status); + repository.save(record); + + executor.execute(new Runnable() { + @Override + public void run() { + markRunning(record); + try { + ParseResponse response = runner.run(new RepositoryBackedTaskUpdater(record)); + ParseTaskStatus completed = record.getStatusSnapshot(); + completed.setStatus("completed"); + completed.setCompletedAt(Instant.now().toString()); + completed.setProgressPercent(100); + completed.setCurrentStage("completed"); + completed.setStatusMessage("任务执行完成"); + record.setResult(response); + record.updateStatus(completed); + } catch (Exception exception) { + ParseTaskStatus failed = record.getStatusSnapshot(); + failed.setStatus("failed"); + failed.setCompletedAt(Instant.now().toString()); + failed.setCurrentStage("failed"); + failed.setStatusMessage(exception.getMessage()); + failed.setError(exception.getMessage()); + record.updateStatus(failed); + } + } + }); + + return record.getStatusSnapshot(); + } + + /** + * 查询任务状态。 + * + * @param taskId 任务 ID + * @return 任务状态 + */ + public ParseTaskStatus queryTask(String taskId) { + return requireRecord(taskId).getStatusSnapshot(); + } + + /** + * 查询任务聚合信息。 + * + * @param taskId 任务 ID + * @return 聚合信息 + */ + public ParseTaskInfo queryTaskInfo(String taskId) { + return 
requireRecord(taskId).getTaskInfoSnapshot(); + } + + /** + * 获取任务结果。 + * + * @param taskId 任务 ID + * @return 任务结果 + */ + public ParseResponse queryResult(String taskId) { + DocumentAsyncTaskRecord record = requireRecord(taskId); + ParseTaskStatus status = record.getStatusSnapshot(); + if (!"completed".equalsIgnoreCase(status.getStatus())) { + throw new DocumentParseException("Document async task is not completed: " + taskId); + } + return record.getResult(); + } + + private DocumentAsyncTaskRecord requireRecord(String taskId) { + if (!StringUtil.hasText(taskId)) { + throw new IllegalArgumentException("taskId must not be empty"); + } + DocumentAsyncTaskRecord record = repository.find(taskId); + if (record == null) { + throw new DocumentParseException("Document async task not found: " + taskId); + } + return record; + } + + private void markRunning(DocumentAsyncTaskRecord record) { + ParseTaskStatus status = record.getStatusSnapshot(); + status.setStatus("preparing"); + status.setStartedAt(Instant.now().toString()); + status.setCurrentStage("preparing"); + status.setProgressPercent(0); + status.setStatusMessage("任务开始执行"); + record.updateStatus(status); + } + + private static class RepositoryBackedTaskUpdater implements DocumentAsyncTaskUpdater { + + private final DocumentAsyncTaskRecord record; + + private RepositoryBackedTaskUpdater(DocumentAsyncTaskRecord record) { + this.record = record; + } + + @Override + public void update(String stage, Integer progressPercent, Integer processedItems, Integer totalItems, String statusMessage) { + ParseTaskStatus status = record.getStatusSnapshot(); + status.setStatus("completed".equalsIgnoreCase(stage) ? 
"completed" : "running"); + status.setCurrentStage(stage); + status.setProgressPercent(progressPercent); + status.setProcessedItems(processedItems); + status.setTotalItems(totalItems); + status.setStatusMessage(statusMessage); + record.updateStatus(status); + } + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRecord.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRecord.java new file mode 100644 index 0000000..9b79c59 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRecord.java @@ -0,0 +1,101 @@ +package com.easyagents.document.core.async; + +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; + +/** + * 文档异步任务记录。 + * + * @author Codex + * @since 2026-04-16 + */ +public class DocumentAsyncTaskRecord { + + private final ParseTaskStatus status; + private ParseResponse result; + + /** + * 创建任务记录。 + * + * @param status 初始状态 + */ + public DocumentAsyncTaskRecord(ParseTaskStatus status) { + this.status = status; + } + + /** + * 获取状态快照。 + * + * @return 状态快照 + */ + public synchronized ParseTaskStatus getStatusSnapshot() { + return copyStatus(status); + } + + /** + * 获取聚合信息快照。 + * + * @return 聚合信息 + */ + public synchronized ParseTaskInfo getTaskInfoSnapshot() { + ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(status); + taskInfo.setResult(result); + return taskInfo; + } + + /** + * 获取结果。 + * + * @return 最终结果 + */ + public synchronized ParseResponse getResult() { + return result; + } + + /** + * 更新任务状态。 + * + * @param newStatus 新状态 + */ + public synchronized void updateStatus(ParseTaskStatus newStatus) { + if (newStatus == null) { + return; + } + copyInto(newStatus, status); + } + + /** + * 更新任务结果。 + * + * @param 
result 最终结果 + */ + public synchronized void setResult(ParseResponse result) { + this.result = result; + } + + private ParseTaskStatus copyStatus(ParseTaskStatus source) { + ParseTaskStatus copy = new ParseTaskStatus(); + copyInto(source, copy); + return copy; + } + + private void copyInto(ParseTaskStatus source, ParseTaskStatus target) { + target.setTaskId(source.getTaskId()); + target.setStatus(source.getStatus()); + target.setBackend(source.getBackend()); + target.setFileNames(source.getFileNames()); + target.setCreatedAt(source.getCreatedAt()); + target.setStartedAt(source.getStartedAt()); + target.setCompletedAt(source.getCompletedAt()); + target.setError(source.getError()); + target.setStatusUrl(source.getStatusUrl()); + target.setResultUrl(source.getResultUrl()); + target.setQueuedAhead(source.getQueuedAhead()); + target.setProgressPercent(source.getProgressPercent()); + target.setCurrentStage(source.getCurrentStage()); + target.setProcessedItems(source.getProcessedItems()); + target.setTotalItems(source.getTotalItems()); + target.setStatusMessage(source.getStatusMessage()); + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRepository.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRepository.java new file mode 100644 index 0000000..1ed78b2 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRepository.java @@ -0,0 +1,25 @@ +package com.easyagents.document.core.async; + +/** + * 文档异步任务仓库。 + * + * @author Codex + * @since 2026-04-16 + */ +public interface DocumentAsyncTaskRepository { + + /** + * 保存任务记录。 + * + * @param record 任务记录 + */ + void save(DocumentAsyncTaskRecord record); + + /** + * 获取任务记录。 + * + * @param taskId 任务 ID + * @return 任务记录,不存在时返回 {@code null} + */ + DocumentAsyncTaskRecord find(String taskId); +} diff --git 
a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRunner.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRunner.java new file mode 100644 index 0000000..abc6912 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskRunner.java @@ -0,0 +1,21 @@ +package com.easyagents.document.core.async; + +import com.easyagents.document.core.entity.ParseResponse; + +/** + * 文档异步任务执行器。 + * + * @author Codex + * @since 2026-04-16 + */ +public interface DocumentAsyncTaskRunner { + + /** + * 执行任务。 + * + * @param updater 状态更新器 + * @return 解析结果 + * @throws Exception 执行异常 + */ + ParseResponse run(DocumentAsyncTaskUpdater updater) throws Exception; +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskUpdater.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskUpdater.java new file mode 100644 index 0000000..f939728 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/DocumentAsyncTaskUpdater.java @@ -0,0 +1,21 @@ +package com.easyagents.document.core.async; + +/** + * 文档异步任务进度更新器。 + * + * @author Codex + * @since 2026-04-16 + */ +public interface DocumentAsyncTaskUpdater { + + /** + * 更新任务状态。 + * + * @param stage 当前阶段 + * @param progressPercent 进度百分比 + * @param processedItems 已处理数量 + * @param totalItems 总数量 + * @param statusMessage 状态说明 + */ + void update(String stage, Integer progressPercent, Integer processedItems, Integer totalItems, String statusMessage); +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/InMemoryDocumentAsyncTaskRepository.java 
b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/InMemoryDocumentAsyncTaskRepository.java new file mode 100644 index 0000000..65fa831 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/async/InMemoryDocumentAsyncTaskRepository.java @@ -0,0 +1,28 @@ +package com.easyagents.document.core.async; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * 基于内存的异步任务仓库。 + * + * @author Codex + * @since 2026-04-16 + */ +public class InMemoryDocumentAsyncTaskRepository implements DocumentAsyncTaskRepository { + + private final Map records = new ConcurrentHashMap(); + + @Override + public void save(DocumentAsyncTaskRecord record) { + if (record == null || record.getStatusSnapshot() == null || record.getStatusSnapshot().getTaskId() == null) { + return; + } + records.put(record.getStatusSnapshot().getTaskId(), record); + } + + @Override + public DocumentAsyncTaskRecord find(String taskId) { + return records.get(taskId); + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentBlock.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentBlock.java similarity index 97% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentBlock.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentBlock.java index f525d4f..b20c784 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentBlock.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentBlock.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.ArrayList; import 
java.util.LinkedHashMap; diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentImage.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentImage.java similarity index 89% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentImage.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentImage.java index 1e36938..0627431 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentImage.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentImage.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.ArrayList; import java.util.List; @@ -16,6 +16,7 @@ public class DocumentImage { private String mimeType; private String sourcePath; private String dataUrl; + private byte[] content; private List boundingBox = new ArrayList(); private List captions = new ArrayList(); private List footnotes = new ArrayList(); @@ -60,6 +61,14 @@ public class DocumentImage { this.dataUrl = dataUrl; } + public byte[] getContent() { + return content; + } + + public void setContent(byte[] content) { + this.content = content; + } + public List getBoundingBox() { return boundingBox; } diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentPage.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentPage.java similarity index 95% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentPage.java rename to 
easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentPage.java index a6a2f18..fba6244 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentPage.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentPage.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.LinkedHashMap; import java.util.Map; diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentTable.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentTable.java similarity index 97% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentTable.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentTable.java index 4c688cf..82534d2 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentTable.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/DocumentTable.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.ArrayList; import java.util.List; diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseArtifacts.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseArtifacts.java similarity index 97% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseArtifacts.java rename to 
easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseArtifacts.java index 7e08d2b..01f343a 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseArtifacts.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseArtifacts.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.LinkedHashMap; import java.util.Map; diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseFile.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseFile.java similarity index 97% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseFile.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseFile.java index e303390..7d646cb 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseFile.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseFile.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.Arrays; diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseRequest.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseRequest.java similarity index 67% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseRequest.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseRequest.java index 1c815a0..51a3bdc 100644 
--- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseRequest.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseRequest.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.ArrayList; import java.util.List; @@ -13,12 +13,7 @@ public class ParseRequest { private List files = new ArrayList(); private String backend; - private String parseMethod = "auto"; private List languages = new ArrayList(); - private Boolean formulaEnabled = true; - private Boolean tableEnabled = true; - private Integer startPageIndex = 0; - private Integer endPageIndex = 99999; private Boolean returnMarkdown = true; private Boolean returnMiddleJson = true; private Boolean returnContentList = true; @@ -38,6 +33,25 @@ public class ParseRequest { return this; } + /** + * 复制通用字段到目标请求。 + * + * @param target 目标请求 + */ + public void copyCommonFieldsTo(ParseRequest target) { + if (target == null) { + return; + } + target.setFiles(new ArrayList(getFiles())); + target.setBackend(getBackend()); + target.setLanguages(new ArrayList(getLanguages())); + target.setReturnMarkdown(getReturnMarkdown()); + target.setReturnMiddleJson(getReturnMiddleJson()); + target.setReturnContentList(getReturnContentList()); + target.setReturnModelOutput(getReturnModelOutput()); + target.setReturnImages(getReturnImages()); + } + public List getFiles() { return files; } @@ -54,14 +68,6 @@ public class ParseRequest { this.backend = backend; } - public String getParseMethod() { - return parseMethod; - } - - public void setParseMethod(String parseMethod) { - this.parseMethod = parseMethod; - } - public List getLanguages() { return languages; } @@ -70,38 +76,6 @@ public class ParseRequest { this.languages = languages == null ? 
new ArrayList() : languages; } - public Boolean getFormulaEnabled() { - return formulaEnabled; - } - - public void setFormulaEnabled(Boolean formulaEnabled) { - this.formulaEnabled = formulaEnabled; - } - - public Boolean getTableEnabled() { - return tableEnabled; - } - - public void setTableEnabled(Boolean tableEnabled) { - this.tableEnabled = tableEnabled; - } - - public Integer getStartPageIndex() { - return startPageIndex; - } - - public void setStartPageIndex(Integer startPageIndex) { - this.startPageIndex = startPageIndex; - } - - public Integer getEndPageIndex() { - return endPageIndex; - } - - public void setEndPageIndex(Integer endPageIndex) { - this.endPageIndex = endPageIndex; - } - public Boolean getReturnMarkdown() { return returnMarkdown; } diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResponse.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseResponse.java similarity index 94% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResponse.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseResponse.java index 16c3cad..ac7e0c0 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResponse.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseResponse.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.ArrayList; import java.util.List; diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResult.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseResult.java similarity index 98% rename from 
easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResult.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseResult.java index b007f53..c42c648 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResult.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseResult.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.ArrayList; import java.util.LinkedHashMap; diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskInfo.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseTaskInfo.java similarity index 82% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskInfo.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseTaskInfo.java index 603d2bb..844bbda 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskInfo.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseTaskInfo.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; /** * 异步任务聚合查询结果。 @@ -35,6 +35,11 @@ public class ParseTaskInfo extends ParseTaskStatus { taskInfo.setStatusUrl(status.getStatusUrl()); taskInfo.setResultUrl(status.getResultUrl()); taskInfo.setQueuedAhead(status.getQueuedAhead()); + taskInfo.setProgressPercent(status.getProgressPercent()); + taskInfo.setCurrentStage(status.getCurrentStage()); + taskInfo.setProcessedItems(status.getProcessedItems()); + 
taskInfo.setTotalItems(status.getTotalItems()); + taskInfo.setStatusMessage(status.getStatusMessage()); return taskInfo; } diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskStatus.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseTaskStatus.java similarity index 66% rename from easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskStatus.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseTaskStatus.java index 4c742cb..9083b42 100644 --- a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskStatus.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/ParseTaskStatus.java @@ -1,4 +1,4 @@ -package com.easyagents.document.core.model; +package com.easyagents.document.core.entity; import java.util.ArrayList; import java.util.List; @@ -22,6 +22,11 @@ public class ParseTaskStatus { private String statusUrl; private String resultUrl; private Integer queuedAhead; + private Integer progressPercent; + private String currentStage; + private Integer processedItems; + private Integer totalItems; + private String statusMessage; public String getTaskId() { return taskId; @@ -110,4 +115,44 @@ public class ParseTaskStatus { public void setQueuedAhead(Integer queuedAhead) { this.queuedAhead = queuedAhead; } + + public Integer getProgressPercent() { + return progressPercent; + } + + public void setProgressPercent(Integer progressPercent) { + this.progressPercent = progressPercent; + } + + public String getCurrentStage() { + return currentStage; + } + + public void setCurrentStage(String currentStage) { + this.currentStage = currentStage; + } + + public Integer getProcessedItems() { + return processedItems; + } + + public void 
setProcessedItems(Integer processedItems) { + this.processedItems = processedItems; + } + + public Integer getTotalItems() { + return totalItems; + } + + public void setTotalItems(Integer totalItems) { + this.totalItems = totalItems; + } + + public String getStatusMessage() { + return statusMessage; + } + + public void setStatusMessage(String statusMessage) { + this.statusMessage = statusMessage; + } } diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/PdfParseRequest.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/PdfParseRequest.java new file mode 100644 index 0000000..30e1a0c --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/PdfParseRequest.java @@ -0,0 +1,79 @@ +package com.easyagents.document.core.entity; + +/** + * PDF 解析请求。 + * + * @author Codex + * @since 2026-04-16 + */ +public class PdfParseRequest extends ParseRequest { + + private String parseMethod = "auto"; + private Boolean formulaEnabled = true; + private Boolean tableEnabled = true; + private Integer startPageIndex = 0; + private Integer endPageIndex = 99999; + + /** + * 将通用请求转换为 PDF 请求。 + * + * @param request 原始请求 + * @return PDF 请求 + */ + public static PdfParseRequest from(ParseRequest request) { + PdfParseRequest pdfParseRequest = new PdfParseRequest(); + if (request == null) { + return pdfParseRequest; + } + request.copyCommonFieldsTo(pdfParseRequest); + if (request instanceof PdfParseRequest) { + PdfParseRequest source = (PdfParseRequest) request; + pdfParseRequest.setParseMethod(source.getParseMethod()); + pdfParseRequest.setFormulaEnabled(source.getFormulaEnabled()); + pdfParseRequest.setTableEnabled(source.getTableEnabled()); + pdfParseRequest.setStartPageIndex(source.getStartPageIndex()); + pdfParseRequest.setEndPageIndex(source.getEndPageIndex()); + } + return pdfParseRequest; + } + + public String 
getParseMethod() { + return parseMethod; + } + + public void setParseMethod(String parseMethod) { + this.parseMethod = parseMethod; + } + + public Boolean getFormulaEnabled() { + return formulaEnabled; + } + + public void setFormulaEnabled(Boolean formulaEnabled) { + this.formulaEnabled = formulaEnabled; + } + + public Boolean getTableEnabled() { + return tableEnabled; + } + + public void setTableEnabled(Boolean tableEnabled) { + this.tableEnabled = tableEnabled; + } + + public Integer getStartPageIndex() { + return startPageIndex; + } + + public void setStartPageIndex(Integer startPageIndex) { + this.startPageIndex = startPageIndex; + } + + public Integer getEndPageIndex() { + return endPageIndex; + } + + public void setEndPageIndex(Integer endPageIndex) { + this.endPageIndex = endPageIndex; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/PptxParseRequest.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/PptxParseRequest.java new file mode 100644 index 0000000..767fc84 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/PptxParseRequest.java @@ -0,0 +1,79 @@ +package com.easyagents.document.core.entity; + +/** + * PPTX 解析请求。 + * + * @author Codex + * @since 2026-04-16 + */ +public class PptxParseRequest extends ParseRequest { + + private Integer startSlideIndex = 0; + private Integer endSlideIndex; + private Double renderScale = 2.0d; + private String imageFormat = "png"; + private Boolean includeSlideImageReference = true; + + /** + * 将通用请求转换为 PPTX 请求。 + * + * @param request 原始请求 + * @return PPTX 请求 + */ + public static PptxParseRequest from(ParseRequest request) { + PptxParseRequest pptxParseRequest = new PptxParseRequest(); + if (request == null) { + return pptxParseRequest; + } + request.copyCommonFieldsTo(pptxParseRequest); + if (request instanceof PptxParseRequest) { + 
PptxParseRequest source = (PptxParseRequest) request; + pptxParseRequest.setStartSlideIndex(source.getStartSlideIndex()); + pptxParseRequest.setEndSlideIndex(source.getEndSlideIndex()); + pptxParseRequest.setRenderScale(source.getRenderScale()); + pptxParseRequest.setImageFormat(source.getImageFormat()); + pptxParseRequest.setIncludeSlideImageReference(source.getIncludeSlideImageReference()); + } + return pptxParseRequest; + } + + public Integer getStartSlideIndex() { + return startSlideIndex; + } + + public void setStartSlideIndex(Integer startSlideIndex) { + this.startSlideIndex = startSlideIndex; + } + + public Integer getEndSlideIndex() { + return endSlideIndex; + } + + public void setEndSlideIndex(Integer endSlideIndex) { + this.endSlideIndex = endSlideIndex; + } + + public Double getRenderScale() { + return renderScale; + } + + public void setRenderScale(Double renderScale) { + this.renderScale = renderScale; + } + + public String getImageFormat() { + return imageFormat; + } + + public void setImageFormat(String imageFormat) { + this.imageFormat = imageFormat; + } + + public Boolean getIncludeSlideImageReference() { + return includeSlideImageReference; + } + + public void setIncludeSlideImageReference(Boolean includeSlideImageReference) { + this.includeSlideImageReference = includeSlideImageReference; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/XlsxParseRequest.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/XlsxParseRequest.java new file mode 100644 index 0000000..947fb47 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/entity/XlsxParseRequest.java @@ -0,0 +1,82 @@ +package com.easyagents.document.core.entity; + +import java.util.ArrayList; +import java.util.List; + +/** + * XLSX 解析请求。 + * + * @author Codex + * @since 2026-04-16 + */ +public class XlsxParseRequest extends 
ParseRequest { + + private List sheetNames = new ArrayList(); + private Boolean includeHiddenSheets = false; + private Boolean ocrEmbeddedImages = true; + private Integer maxRowsPerSheet; + private Boolean includeImageAppendix = true; + + /** + * 将通用请求转换为 XLSX 请求。 + * + * @param request 原始请求 + * @return XLSX 请求 + */ + public static XlsxParseRequest from(ParseRequest request) { + XlsxParseRequest xlsxParseRequest = new XlsxParseRequest(); + if (request == null) { + return xlsxParseRequest; + } + request.copyCommonFieldsTo(xlsxParseRequest); + if (request instanceof XlsxParseRequest) { + XlsxParseRequest source = (XlsxParseRequest) request; + xlsxParseRequest.setSheetNames(new ArrayList(source.getSheetNames())); + xlsxParseRequest.setIncludeHiddenSheets(source.getIncludeHiddenSheets()); + xlsxParseRequest.setOcrEmbeddedImages(source.getOcrEmbeddedImages()); + xlsxParseRequest.setMaxRowsPerSheet(source.getMaxRowsPerSheet()); + xlsxParseRequest.setIncludeImageAppendix(source.getIncludeImageAppendix()); + } + return xlsxParseRequest; + } + + public List getSheetNames() { + return sheetNames; + } + + public void setSheetNames(List sheetNames) { + this.sheetNames = sheetNames == null ? 
new ArrayList() : sheetNames; + } + + public Boolean getIncludeHiddenSheets() { + return includeHiddenSheets; + } + + public void setIncludeHiddenSheets(Boolean includeHiddenSheets) { + this.includeHiddenSheets = includeHiddenSheets; + } + + public Boolean getOcrEmbeddedImages() { + return ocrEmbeddedImages; + } + + public void setOcrEmbeddedImages(Boolean ocrEmbeddedImages) { + this.ocrEmbeddedImages = ocrEmbeddedImages; + } + + public Integer getMaxRowsPerSheet() { + return maxRowsPerSheet; + } + + public void setMaxRowsPerSheet(Integer maxRowsPerSheet) { + this.maxRowsPerSheet = maxRowsPerSheet; + } + + public Boolean getIncludeImageAppendix() { + return includeImageAppendix; + } + + public void setIncludeImageAppendix(Boolean includeImageAppendix) { + this.includeImageAppendix = includeImageAppendix; + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfClient.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruClient.java similarity index 90% rename from easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfClient.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruClient.java index 53f6cf9..1cb756c 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfClient.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruClient.java @@ -1,11 +1,11 @@ -package com.easyagents.document.pdf.mineru; +package com.easyagents.document.core.mineru; import com.alibaba.fastjson2.JSON; import com.alibaba.fastjson2.JSONObject; import com.easyagents.core.util.StringUtil; import com.easyagents.document.core.exception.DocumentParseException; -import com.easyagents.document.core.model.ParseFile; -import 
com.easyagents.document.core.model.ParseRequest; +import com.easyagents.document.core.entity.ParseFile; +import com.easyagents.document.core.entity.ParseRequest; import okhttp3.MediaType; import okhttp3.MultipartBody; import okhttp3.OkHttpClient; @@ -15,6 +15,7 @@ import okhttp3.Response; import okhttp3.ResponseBody; import java.io.IOException; +import java.net.URLConnection; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; @@ -23,11 +24,11 @@ import java.util.concurrent.TimeUnit; * MinerU HTTP 客户端。 * * @author Codex - * @since 2026-04-14 + * @since 2026-04-16 */ -public class MineruPdfClient { +public class MineruClient { - private static final MediaType DEFAULT_PDF_MEDIA_TYPE = MediaType.parse("application/pdf"); + private static final MediaType DEFAULT_MEDIA_TYPE = MediaType.parse("application/octet-stream"); private final String baseUrl; private final OkHttpClient okHttpClient; @@ -39,7 +40,7 @@ public class MineruPdfClient { * @param properties MinerU 配置 * @param mineruMapper DTO 映射器 */ - public MineruPdfClient(MineruProperties properties, MineruMapper mineruMapper) { + public MineruClient(MineruProperties properties, MineruMapper mineruMapper) { this( properties, new OkHttpClient.Builder() @@ -58,7 +59,7 @@ public class MineruPdfClient { * @param okHttpClient HTTP 客户端 * @param mineruMapper DTO 映射器 */ - public MineruPdfClient(MineruProperties properties, OkHttpClient okHttpClient, MineruMapper mineruMapper) { + public MineruClient(MineruProperties properties, OkHttpClient okHttpClient, MineruMapper mineruMapper) { if (properties == null || !StringUtil.hasText(properties.getBaseUrl())) { throw new IllegalArgumentException("MinerU baseUrl must not be empty"); } @@ -165,7 +166,7 @@ public class MineruPdfClient { } MediaType mediaType = StringUtil.hasText(file.getContentType()) ? 
MediaType.parse(file.getContentType()) - : DEFAULT_PDF_MEDIA_TYPE; + : detectMediaType(file.getFileName()); formBuilder.addFormDataPart( "files", file.getFileName(), @@ -208,4 +209,9 @@ public class MineruPdfClient { } return baseUrl; } + + private MediaType detectMediaType(String fileName) { + String mimeType = URLConnection.guessContentTypeFromName(fileName); + return StringUtil.hasText(mimeType) ? MediaType.parse(mimeType) : DEFAULT_MEDIA_TYPE; + } } diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruDocumentParseService.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruDocumentParseService.java new file mode 100644 index 0000000..50cbce5 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruDocumentParseService.java @@ -0,0 +1,218 @@ +package com.easyagents.document.core.mineru; + +import com.easyagents.core.util.StringUtil; +import com.easyagents.document.core.DocumentParseService; +import com.easyagents.document.core.exception.DocumentParseException; +import com.easyagents.document.core.entity.ParseFile; +import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; + +/** + * 基于 MinerU API 的文档解析服务,支持 docx 文档和 pdf 文档。 + * + * @author Codex + * @since 2026-04-16 + */ +public class MineruDocumentParseService implements DocumentParseService { + + public static final String PROVIDER_NAME = "mineru"; + + private static final Logger LOG = LoggerFactory.getLogger(MineruDocumentParseService.class); + + private final MineruProperties properties; + private final MineruClient client; + private final MineruMapper mapper; 
+ + /** + * 创建默认服务实例。 + * + * @param properties MinerU 配置 + */ + public MineruDocumentParseService(MineruProperties properties) { + this(properties, new MineruMapper(properties)); + } + + /** + * 创建默认服务实例。 + * + * @param properties MinerU 配置 + * @param mapper 结果映射器 + */ + public MineruDocumentParseService(MineruProperties properties, MineruMapper mapper) { + this(properties, new MineruClient(properties, mapper), mapper); + } + + /** + * 创建服务实例。 + * + * @param properties MinerU 配置 + * @param client HTTP 客户端 + * @param mapper 结果映射器 + */ + public MineruDocumentParseService(MineruProperties properties, MineruClient client, MineruMapper mapper) { + this.properties = properties; + this.client = client; + this.mapper = mapper; + } + + @Override + public ParseResponse parse(ParseRequest request) { + ParseRequest normalizedRequest = normalizeRequest(request); + LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}", + PROVIDER_NAME, + normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(), + normalizedRequest.getBackend()); + ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest)); + LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}", + PROVIDER_NAME, + normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(), + response == null || response.getResults() == null ? 0 : response.getResults().size()); + return response; + } + + @Override + public ParseTaskStatus submit(ParseRequest request) { + ParseRequest normalizedRequest = normalizeRequest(request); + normalizedRequest.setReturnMarkdown(true); + normalizedRequest.setReturnMiddleJson(true); + normalizedRequest.setReturnContentList(true); + normalizedRequest.setReturnModelOutput(true); + normalizedRequest.setReturnImages(true); + LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}", + PROVIDER_NAME, + normalizedRequest.getFiles() == null ? 
0 : normalizedRequest.getFiles().size(), + normalizedRequest.getBackend()); + ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest)); + LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}", + PROVIDER_NAME, + taskStatus == null ? null : taskStatus.getTaskId(), + taskStatus == null ? null : taskStatus.getStatus()); + return taskStatus; + } + + @Override + public ParseTaskStatus queryTask(String taskId) { + validateTaskId(taskId); + ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId)); + LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}", + PROVIDER_NAME, + taskId, + taskStatus == null ? null : taskStatus.getStatus()); + return taskStatus; + } + + @Override + public ParseResponse queryResult(String taskId) { + validateTaskId(taskId); + LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId); + MineruTaskStatus taskStatus = waitForTaskCompleted(taskId); + ParseResponse response = mapper.fromZip(client.queryResultZip(taskId)); + mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion()); + LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}", + PROVIDER_NAME, + taskId, + response == null || response.getResults() == null ? 
0 : response.getResults().size()); + return response; + } + + @Override + public ParseTaskInfo queryTaskInfo(String taskId) { + validateTaskId(taskId); + MineruTaskStatus taskStatus = client.queryTask(taskId); + ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(mapper.toParseTaskStatus(taskStatus)); + if ("completed".equalsIgnoreCase(taskStatus.getStatus())) { + ParseResponse response = mapper.fromZip(client.queryResultZip(taskId)); + mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion()); + taskInfo.setResult(response); + } + LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}", + PROVIDER_NAME, + taskId, + taskInfo.getStatus(), + taskInfo.getResult() != null); + return taskInfo; + } + + /** + * 获取 MinerU 配置。 + * + * @return MinerU 配置 + */ + protected MineruProperties getProperties() { + return properties; + } + + /** + * 归一化解析请求,补齐默认参数。 + * + * @param request 原始请求 + * @return 归一化后的请求 + */ + protected ParseRequest normalizeRequest(ParseRequest request) { + if (request == null) { + throw new IllegalArgumentException("ParseRequest must not be null"); + } + if (request.getFiles() == null || request.getFiles().isEmpty()) { + throw new IllegalArgumentException("ParseRequest files must not be empty"); + } + ParseRequest normalizedRequest = new ParseRequest(); + normalizedRequest.setFiles(new ArrayList(request.getFiles())); + normalizedRequest.setBackend(StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend()); + normalizedRequest.setLanguages( + request.getLanguages() == null || request.getLanguages().isEmpty() + ? new ArrayList(properties.getDefaultLangList()) + : new ArrayList(request.getLanguages()) + ); + normalizedRequest.setReturnMarkdown(request.getReturnMarkdown() == null ? Boolean.TRUE : request.getReturnMarkdown()); + normalizedRequest.setReturnMiddleJson(request.getReturnMiddleJson() == null ? 
Boolean.TRUE : request.getReturnMiddleJson()); + normalizedRequest.setReturnContentList(request.getReturnContentList() == null ? Boolean.TRUE : request.getReturnContentList()); + normalizedRequest.setReturnModelOutput(request.getReturnModelOutput() == null ? Boolean.FALSE : request.getReturnModelOutput()); + normalizedRequest.setReturnImages(request.getReturnImages() == null ? Boolean.TRUE : request.getReturnImages()); + return normalizedRequest; + } + + /** + * 校验任务 ID。 + * + * @param taskId 任务 ID + */ + protected void validateTaskId(String taskId) { + if (!StringUtil.hasText(taskId)) { + throw new IllegalArgumentException("taskId must not be empty"); + } + } + + /** + * 轮询任务状态直到完成或失败。 + * + * @param taskId 任务 ID + * @return 已完成的任务状态 + */ + protected MineruTaskStatus waitForTaskCompleted(String taskId) { + long deadline = System.currentTimeMillis() + properties.getResultTimeoutMs(); + while (true) { + MineruTaskStatus taskStatus = client.queryTask(taskId); + if ("completed".equals(taskStatus.getStatus())) { + return taskStatus; + } + if ("failed".equals(taskStatus.getStatus())) { + throw new DocumentParseException("MinerU task failed: " + taskStatus.getError()); + } + if (System.currentTimeMillis() >= deadline) { + throw new DocumentParseException("MinerU task result timeout: " + taskId); + } + try { + Thread.sleep(properties.getPollIntervalMs()); + } catch (InterruptedException exception) { + Thread.currentThread().interrupt(); + throw new DocumentParseException("Interrupted while waiting for MinerU task: " + taskId, exception); + } + } + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruMapper.java similarity index 87% rename from easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java rename to 
easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruMapper.java index c48d3d1..26d1db6 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruMapper.java @@ -1,19 +1,20 @@ -package com.easyagents.document.pdf.mineru; +package com.easyagents.document.core.mineru; import com.alibaba.fastjson2.JSON; import com.alibaba.fastjson2.JSONArray; import com.alibaba.fastjson2.JSONObject; import com.easyagents.core.util.StringUtil; import com.easyagents.document.core.exception.DocumentParseException; -import com.easyagents.document.core.model.DocumentBlock; -import com.easyagents.document.core.model.DocumentImage; -import com.easyagents.document.core.model.DocumentPage; -import com.easyagents.document.core.model.DocumentTable; -import com.easyagents.document.core.model.ParseArtifacts; -import com.easyagents.document.core.model.ParseRequest; -import com.easyagents.document.core.model.ParseResponse; -import com.easyagents.document.core.model.ParseResult; -import com.easyagents.document.core.model.ParseTaskStatus; +import com.easyagents.document.core.entity.DocumentBlock; +import com.easyagents.document.core.entity.DocumentImage; +import com.easyagents.document.core.entity.DocumentPage; +import com.easyagents.document.core.entity.DocumentTable; +import com.easyagents.document.core.entity.PdfParseRequest; +import com.easyagents.document.core.entity.ParseArtifacts; +import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseResult; +import com.easyagents.document.core.entity.ParseTaskStatus; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -31,7 +32,7 @@ import java.util.zip.ZipInputStream; * MinerU 原始协议与统一模型之间的映射器。 * * 
@author Codex - * @since 2026-04-14 + * @since 2026-04-16 */ public class MineruMapper { @@ -71,7 +72,6 @@ public class MineruMapper { */ public Map> buildAsyncFormFields(ParseRequest request) { Map> fields = buildBaseFormFields(request); - // 异步结果固定按全量 ZIP 返回,避免超大结果通过 JSON 传输。 putSingleValue(fields, "return_md", "true"); putSingleValue(fields, "return_middle_json", "true"); putSingleValue(fields, "return_content_list", "true"); @@ -205,19 +205,24 @@ public class MineruMapper { private Map> buildBaseFormFields(ParseRequest request) { Map> fields = new LinkedHashMap>(); putSingleValue(fields, "backend", StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend()); - putSingleValue(fields, "parse_method", StringUtil.hasText(request.getParseMethod()) ? request.getParseMethod() : properties.getDefaultParseMethod()); - putSingleValue(fields, "formula_enable", String.valueOf(boolOrDefault(request.getFormulaEnabled(), properties.getDefaultFormulaEnable()))); - putSingleValue(fields, "table_enable", String.valueOf(boolOrDefault(request.getTableEnabled(), properties.getDefaultTableEnable()))); - putSingleValue(fields, "start_page_id", String.valueOf(intOrDefault(request.getStartPageIndex(), 0))); - putSingleValue(fields, "end_page_id", String.valueOf(intOrDefault(request.getEndPageIndex(), 99999))); List languages = request.getLanguages(); if (languages == null || languages.isEmpty()) { languages = properties.getDefaultLangList(); } if (languages != null && !languages.isEmpty()) { - // MinerU 通过重复的 lang_list 表单字段接收多语言参数。 fields.put("lang_list", new ArrayList(languages)); } + if (request instanceof PdfParseRequest) { + PdfParseRequest pdfParseRequest = (PdfParseRequest) request; + putSingleValue(fields, "parse_method", + StringUtil.hasText(pdfParseRequest.getParseMethod()) ? 
pdfParseRequest.getParseMethod() : properties.getDefaultParseMethod()); + putSingleValue(fields, "formula_enable", + String.valueOf(boolOrDefault(pdfParseRequest.getFormulaEnabled(), properties.getDefaultFormulaEnable()))); + putSingleValue(fields, "table_enable", + String.valueOf(boolOrDefault(pdfParseRequest.getTableEnabled(), properties.getDefaultTableEnable()))); + putSingleValue(fields, "start_page_id", String.valueOf(intOrDefault(pdfParseRequest.getStartPageIndex(), 0))); + putSingleValue(fields, "end_page_id", String.valueOf(intOrDefault(pdfParseRequest.getEndPageIndex(), 99999))); + } return fields; } @@ -240,7 +245,8 @@ public class MineruMapper { result.setArtifacts(artifacts); Map imageDataUrls = toStringMap(fileResult.getJSONObject("images")); - applyStructuredArtifacts(result, imageDataUrls); + Map imageContents = toBinaryMap(imageDataUrls); + applyStructuredArtifacts(result, imageDataUrls, imageContents); if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) { result.getWarnings().add("MinerU did not return markdown, middle_json or content_list"); } @@ -264,7 +270,6 @@ public class MineruMapper { JSONArray contentList = asArray(contentListArtifact); Object modelOutput = modelOutputArtifact; - // MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里,并且直接返回数组。 if (contentList == null && middleArtifact instanceof JSONArray) { contentList = (JSONArray) middleArtifact; middleJson = null; @@ -289,10 +294,12 @@ public class MineruMapper { result.setArtifacts(artifacts); Map imageDataUrls = new LinkedHashMap(); + Map imageContents = new LinkedHashMap(); for (Map.Entry imageEntry : bundle.images.entrySet()) { imageDataUrls.put(imageEntry.getKey(), toDataUrl(imageEntry.getKey(), imageEntry.getValue())); + imageContents.put(imageEntry.getKey(), imageEntry.getValue()); } - applyStructuredArtifacts(result, imageDataUrls); + applyStructuredArtifacts(result, imageDataUrls, imageContents); if 
(markdown == null && middleJson == null && contentList == null) { throw new DocumentParseException("MinerU ZIP result missing critical artifacts for file: " + fileName); @@ -300,7 +307,7 @@ public class MineruMapper { return result; } - private void applyStructuredArtifacts(ParseResult result, Map imageDataUrls) { + private void applyStructuredArtifacts(ParseResult result, Map imageDataUrls, Map imageContents) { JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson()); JSONArray contentList = asArray(result.getArtifacts().getContentList()); @@ -311,9 +318,9 @@ public class MineruMapper { } if (contentList != null) { - fillFromContentList(result, contentList, imageDataUrls); + fillFromContentList(result, contentList, imageDataUrls, imageContents); } else if (middleJson != null) { - fillFromMiddleJson(result, middleJson, imageDataUrls); + fillFromMiddleJson(result, middleJson, imageDataUrls, imageContents); } if ((result.getImages() == null || result.getImages().isEmpty()) && imageDataUrls != null && !imageDataUrls.isEmpty()) { @@ -322,6 +329,7 @@ public class MineruMapper { image.setName(baseName(entry.getKey())); image.setSourcePath(entry.getKey()); image.setDataUrl(entry.getValue()); + image.setContent(matchBinaryContent(entry.getKey(), imageContents)); image.setMimeType(detectMimeType(entry.getKey())); result.getImages().add(image); } @@ -349,7 +357,10 @@ public class MineruMapper { result.setPages(pages); } - private void fillFromContentList(ParseResult result, JSONArray contentList, Map imageDataUrls) { + private void fillFromContentList(ParseResult result, + JSONArray contentList, + Map imageDataUrls, + Map imageContents) { for (int index = 0; index < contentList.size(); index++) { JSONObject item = contentList.getJSONObject(index); if (item == null) { @@ -391,12 +402,16 @@ public class MineruMapper { image.setCaptions(extractCaptions(item)); image.setFootnotes(extractFootnotes(item)); image.setDataUrl(matchDataUrl(item.getString("img_path"), 
imageDataUrls)); + image.setContent(matchBinaryContent(item.getString("img_path"), imageContents)); result.getImages().add(image); } } } - private void fillFromMiddleJson(ParseResult result, JSONObject middleJson, Map imageDataUrls) { + private void fillFromMiddleJson(ParseResult result, + JSONObject middleJson, + Map imageDataUrls, + Map imageContents) { JSONArray pages = middleJson.getJSONArray("pdf_info"); if (pages == null) { return; @@ -404,8 +419,8 @@ public class MineruMapper { for (int pageIndex = 0; pageIndex < pages.size(); pageIndex++) { JSONObject page = pages.getJSONObject(pageIndex); fillBlocksFromMiddlePage(result, page.getJSONArray("para_blocks"), page.getInteger("page_idx")); - fillVisualsFromMiddlePage(result, page.getJSONArray("tables"), page.getInteger("page_idx"), true, imageDataUrls); - fillVisualsFromMiddlePage(result, page.getJSONArray("images"), page.getInteger("page_idx"), false, imageDataUrls); + fillVisualsFromMiddlePage(result, page.getJSONArray("tables"), page.getInteger("page_idx"), true, imageDataUrls, imageContents); + fillVisualsFromMiddlePage(result, page.getJSONArray("images"), page.getInteger("page_idx"), false, imageDataUrls, imageContents); } } @@ -429,7 +444,12 @@ public class MineruMapper { } } - private void fillVisualsFromMiddlePage(ParseResult result, JSONArray blocks, Integer pageIndex, boolean table, Map imageDataUrls) { + private void fillVisualsFromMiddlePage(ParseResult result, + JSONArray blocks, + Integer pageIndex, + boolean table, + Map imageDataUrls, + Map imageContents) { if (blocks == null) { return; } @@ -456,6 +476,7 @@ public class MineruMapper { documentImage.setName(baseName(documentImage.getSourcePath())); documentImage.setMimeType(detectMimeType(documentImage.getSourcePath())); documentImage.setDataUrl(matchDataUrl(documentImage.getSourcePath(), imageDataUrls)); + documentImage.setContent(matchBinaryContent(documentImage.getSourcePath(), imageContents)); result.getImages().add(documentImage); } } @@ 
-607,6 +628,20 @@ public class MineruMapper { return values; } + private Map toBinaryMap(Map dataUrls) { + Map values = new LinkedHashMap(); + if (dataUrls == null || dataUrls.isEmpty()) { + return values; + } + for (Map.Entry entry : dataUrls.entrySet()) { + byte[] content = decodeDataUrl(entry.getValue()); + if (content != null) { + values.put(entry.getKey(), content); + } + } + return values; + } + private List toDoubleList(JSONArray jsonArray) { if (jsonArray == null || jsonArray.isEmpty()) { return new ArrayList(); @@ -800,6 +835,25 @@ public class MineruMapper { return null; } + private byte[] matchBinaryContent(String imagePath, Map imageContents) { + if (imageContents == null || imageContents.isEmpty()) { + return null; + } + if (StringUtil.hasText(imagePath) && imageContents.containsKey(imagePath)) { + return imageContents.get(imagePath); + } + String currentBaseName = baseName(imagePath); + if (!StringUtil.hasText(currentBaseName)) { + return null; + } + for (Map.Entry entry : imageContents.entrySet()) { + if (currentBaseName.equals(baseName(entry.getKey()))) { + return entry.getValue(); + } + } + return null; + } + private String baseName(String path) { if (!StringUtil.hasText(path)) { return null; @@ -820,6 +874,21 @@ public class MineruMapper { return "data:" + detectMimeType(path) + ";base64," + Base64.getEncoder().encodeToString(content); } + private byte[] decodeDataUrl(String dataUrl) { + if (!StringUtil.hasText(dataUrl)) { + return null; + } + int commaIndex = dataUrl.indexOf(','); + if (commaIndex < 0 || commaIndex == dataUrl.length() - 1) { + return null; + } + try { + return Base64.getDecoder().decode(dataUrl.substring(commaIndex + 1)); + } catch (IllegalArgumentException exception) { + return null; + } + } + private String joinList(List values) { if (values == null || values.isEmpty()) { return null; diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruProperties.java 
b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruProperties.java similarity index 96% rename from easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruProperties.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruProperties.java index c5efd78..d45ab20 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruProperties.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruProperties.java @@ -1,14 +1,14 @@ -package com.easyagents.document.pdf.mineru; +package com.easyagents.document.core.mineru; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** - * MinerU PDF 解析配置。 + * MinerU 文档解析配置。 * * @author Codex - * @since 2026-04-14 + * @since 2026-04-16 */ public class MineruProperties { diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruResultPayload.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruResultPayload.java similarity index 92% rename from easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruResultPayload.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruResultPayload.java index 7ed2b9f..a4ff500 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruResultPayload.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruResultPayload.java @@ -1,4 +1,4 @@ -package com.easyagents.document.pdf.mineru; +package com.easyagents.document.core.mineru; import com.alibaba.fastjson2.JSONObject; @@ -9,7 +9,7 @@ 
import java.util.Map; * MinerU 结果载荷。 * * @author Codex - * @since 2026-04-14 + * @since 2026-04-16 */ public class MineruResultPayload { diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruTaskStatus.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruTaskStatus.java similarity index 97% rename from easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruTaskStatus.java rename to easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruTaskStatus.java index 99b476b..2a0b067 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruTaskStatus.java +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/mineru/MineruTaskStatus.java @@ -1,4 +1,4 @@ -package com.easyagents.document.pdf.mineru; +package com.easyagents.document.core.mineru; import java.util.ArrayList; import java.util.List; @@ -7,7 +7,7 @@ import java.util.List; * MinerU 原始任务状态。 * * @author Codex - * @since 2026-04-14 + * @since 2026-04-16 */ public class MineruTaskStatus { diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/support/AbstractAsyncDocumentParseService.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/support/AbstractAsyncDocumentParseService.java new file mode 100644 index 0000000..7ec1d0a --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/support/AbstractAsyncDocumentParseService.java @@ -0,0 +1,98 @@ +package com.easyagents.document.core.support; + +import com.easyagents.document.core.DocumentParseService; +import com.easyagents.document.core.async.DocumentAsyncTaskManager; +import 
com.easyagents.document.core.async.DocumentAsyncTaskUpdater; +import com.easyagents.document.core.entity.ParseFile; +import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; + +import java.util.ArrayList; +import java.util.List; + +/** + * 带统一异步任务能力的文档解析服务基类。 + * 支持 ppt 和 excel,pdf 和 word 文档使用 mineru 自带异步能力 + * + * @param 请求类型 + * @author Codex + * @since 2026-04-16 + */ +public abstract class AbstractAsyncDocumentParseService implements DocumentParseService { + + private final DocumentAsyncTaskManager taskManager; + + /** + * 创建服务基类。 + * + * @param taskManager 异步任务管理器 + */ + protected AbstractAsyncDocumentParseService(DocumentAsyncTaskManager taskManager) { + if (taskManager == null) { + throw new IllegalArgumentException("DocumentAsyncTaskManager must not be null"); + } + this.taskManager = taskManager; + } + + @Override + public ParseResponse parse(ParseRequest request) { + return doParse(normalizeRequest(request), null); + } + + @Override + public ParseTaskStatus submit(ParseRequest request) { + final R normalizedRequest = normalizeRequest(request); + return taskManager.submit( + normalizedRequest.getBackend(), + collectFileNames(normalizedRequest), + updater -> doParse(normalizedRequest, updater) + ); + } + + @Override + public ParseTaskStatus queryTask(String taskId) { + return taskManager.queryTask(taskId); + } + + @Override + public ParseResponse queryResult(String taskId) { + return taskManager.queryResult(taskId); + } + + @Override + public ParseTaskInfo queryTaskInfo(String taskId) { + return taskManager.queryTaskInfo(taskId); + } + + /** + * 归一化请求。 + * + * @param request 原始请求 + * @return 归一化结果 + */ + protected abstract R normalizeRequest(ParseRequest request); + + /** + * 执行解析。 + * + * @param request 归一化请求 + * @param updater 进度更新器;同步解析时可能为 {@code null} + * @return 解析结果 + */ + 
protected abstract ParseResponse doParse(R request, DocumentAsyncTaskUpdater updater); + + private List collectFileNames(ParseRequest request) { + List fileNames = new ArrayList(); + if (request == null || request.getFiles() == null) { + return fileNames; + } + for (ParseFile file : request.getFiles()) { + if (file != null && file.getFileName() != null) { + fileNames.add(file.getFileName()); + } + } + return fileNames; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/test/java/com/easyagents/document/core/async/DocumentAsyncTaskManagerTest.java b/easy-agents-document/easy-agents-document-core/src/test/java/com/easyagents/document/core/async/DocumentAsyncTaskManagerTest.java new file mode 100644 index 0000000..cc10ede --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/test/java/com/easyagents/document/core/async/DocumentAsyncTaskManagerTest.java @@ -0,0 +1,49 @@ +package com.easyagents.document.core.async; + +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseResult; +import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Collections; +import java.util.concurrent.Executor; + +/** + * 异步任务管理器测试。 + * + * @author Codex + * @since 2026-04-16 + */ +public class DocumentAsyncTaskManagerTest { + + @Test + public void shouldTrackTaskLifecycleAndResult() { + Executor directExecutor = new Executor() { + @Override + public void execute(Runnable command) { + command.run(); + } + }; + DocumentAsyncTaskManager manager = new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor); + + ParseTaskStatus status = manager.submit("mineru", Collections.singletonList("demo.pptx"), updater -> { + updater.update("ocr", 50, 1, 2, "处理中"); + ParseResponse response = new ParseResponse(); + ParseResult result = new ParseResult(); + 
result.setFileName("demo.pptx"); + result.setMarkdown("# Slide 1"); + response.setResults(Collections.singletonList(result)); + return response; + }); + + ParseTaskInfo taskInfo = manager.queryTaskInfo(status.getTaskId()); + + Assert.assertEquals("completed", taskInfo.getStatus()); + Assert.assertEquals(Integer.valueOf(100), taskInfo.getProgressPercent()); + Assert.assertEquals("completed", taskInfo.getCurrentStage()); + Assert.assertNotNull(taskInfo.getResult()); + Assert.assertEquals(1, taskInfo.getResult().getResults().size()); + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/test/java/com/easyagents/document/core/mineru/MineruDocumentParseServiceTest.java b/easy-agents-document/easy-agents-document-core/src/test/java/com/easyagents/document/core/mineru/MineruDocumentParseServiceTest.java new file mode 100644 index 0000000..cdcaeb8 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/test/java/com/easyagents/document/core/mineru/MineruDocumentParseServiceTest.java @@ -0,0 +1,210 @@ +package com.easyagents.document.core.mineru; + +import com.alibaba.fastjson2.JSONObject; +import com.easyagents.document.core.entity.ParseFile; +import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; +import okhttp3.Request; +import okio.Buffer; +import org.junit.Assert; +import org.junit.Test; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; + +/** + * MinerU 通用文档解析服务测试。 + * + * @author Codex + * @since 2026-04-16 + */ +public class MineruDocumentParseServiceTest { + + @Test + public void shouldForceAsyncResultArtifacts() { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new 
MineruMapper(defaultProperties()); + MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper); + + ParseRequest request = buildRequest(); + request.setReturnMarkdown(false); + request.setReturnMiddleJson(false); + request.setReturnContentList(false); + request.setReturnModelOutput(false); + request.setReturnImages(false); + + ParseTaskStatus status = service.submit(request); + + Assert.assertEquals("task-1", status.getTaskId()); + Assert.assertTrue(client.lastSubmitRequest.getReturnMarkdown()); + Assert.assertTrue(client.lastSubmitRequest.getReturnMiddleJson()); + Assert.assertTrue(client.lastSubmitRequest.getReturnContentList()); + Assert.assertTrue(client.lastSubmitRequest.getReturnModelOutput()); + Assert.assertTrue(client.lastSubmitRequest.getReturnImages()); + } + + @Test + public void shouldUseTaskMetadataWhenQueryingAsyncZipResult() { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper); + + ParseResponse response = service.queryResult("task-1"); + + Assert.assertEquals("vlm-http-client", response.getBackend()); + Assert.assertEquals("3.0.9", response.getVersion()); + Assert.assertEquals(1, response.getResults().size()); + Assert.assertEquals("demo", response.getResults().get(0).getFileName()); + } + + @Test + public void shouldReturnCompletedResultInTaskInfo() { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruDocumentParseService service = new MineruDocumentParseService(defaultProperties(), client, mapper); + + ParseTaskInfo taskInfo = service.queryTaskInfo("task-1"); + + Assert.assertEquals("completed", taskInfo.getStatus()); + Assert.assertNotNull(taskInfo.getResult()); + Assert.assertEquals(1, 
taskInfo.getResult().getResults().size()); + Assert.assertEquals(1, client.queryResultZipCount); + } + + @Test + public void shouldSendRepeatedLangListFields() { + InspectingMultipartClient client = new InspectingMultipartClient(defaultProperties()); + ParseRequest request = buildRequest(); + request.setLanguages(java.util.Arrays.asList("zh", "en")); + + client.parse(request); + + Assert.assertEquals(2, countOccurrences(client.lastMultipartBody, "name=\"lang_list\"")); + Assert.assertTrue(client.lastMultipartBody.contains("\r\nzh\r\n")); + Assert.assertTrue(client.lastMultipartBody.contains("\r\nen\r\n")); + } + + private ParseRequest buildRequest() { + ParseRequest request = new ParseRequest(); + request.addFile(ParseFile.of("demo.pptx", "ppt".getBytes(StandardCharsets.UTF_8))); + return request; + } + + private MineruProperties defaultProperties() { + MineruProperties properties = new MineruProperties(); + properties.setBaseUrl("http://127.0.0.1:8000"); + properties.setResultTimeoutMs(50); + properties.setPollIntervalMs(1); + return properties; + } + + private int countOccurrences(String source, String token) { + int count = 0; + int index = 0; + while (source != null && token != null && !token.isEmpty() && (index = source.indexOf(token, index)) >= 0) { + count++; + index += token.length(); + } + return count; + } + + private static class RecordingClient extends MineruClient { + + private ParseRequest lastSubmitRequest; + private int queryResultZipCount; + + private RecordingClient(MineruProperties properties) { + super(properties, new MineruMapper(properties)); + } + + @Override + public MineruTaskStatus submit(ParseRequest request) { + this.lastSubmitRequest = request; + MineruTaskStatus taskStatus = new MineruTaskStatus(); + taskStatus.setTaskId("task-1"); + taskStatus.setStatus("pending"); + return taskStatus; + } + + @Override + public MineruTaskStatus queryTask(String taskId) { + MineruTaskStatus taskStatus = new MineruTaskStatus(); + 
taskStatus.setTaskId(taskId); + taskStatus.setStatus("completed"); + taskStatus.setBackend("vlm-http-client"); + taskStatus.setVersion("3.0.9"); + return taskStatus; + } + + @Override + public byte[] queryResultZip(String taskId) { + queryResultZipCount++; + try { + return buildZipResult(); + } catch (IOException exception) { + throw new IllegalStateException("Failed to build test ZIP", exception); + } + } + + private static byte[] buildZipResult() throws IOException { + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + try (ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream)) { + addEntry(zipOutputStream, "demo/vlm/demo.md", "# title"); + addEntry(zipOutputStream, "demo/vlm/demo_middle.json", middleJson().toJSONString()); + addEntry(zipOutputStream, "demo/vlm/demo_content_list.json", contentList().toJSONString()); + } + return outputStream.toByteArray(); + } + + private static void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException { + zipOutputStream.putNextEntry(new ZipEntry(name)); + zipOutputStream.write(content.getBytes(StandardCharsets.UTF_8)); + zipOutputStream.closeEntry(); + } + + private static JSONObject middleJson() { + JSONObject middleJson = new JSONObject(); + middleJson.put("_backend", "vlm"); + middleJson.put("_version_name", "3.0.9"); + middleJson.put("pdf_info", new com.alibaba.fastjson2.JSONArray()); + return middleJson; + } + + private static com.alibaba.fastjson2.JSONArray contentList() { + com.alibaba.fastjson2.JSONArray contentList = new com.alibaba.fastjson2.JSONArray(); + JSONObject text = new JSONObject(); + text.put("type", "text"); + text.put("text", "title"); + text.put("page_idx", 0); + text.put("bbox", new com.alibaba.fastjson2.JSONArray()); + contentList.add(text); + return contentList; + } + } + + private static class InspectingMultipartClient extends MineruClient { + + private String lastMultipartBody; + + private InspectingMultipartClient(MineruProperties 
properties) { + super(properties, new MineruMapper(properties)); + } + + @Override + protected JSONObject executeJsonRequest(String path, Request request) { + try { + Buffer buffer = new Buffer(); + request.body().writeTo(buffer); + this.lastMultipartBody = buffer.readUtf8(); + } catch (IOException exception) { + throw new IllegalStateException("Failed to inspect multipart body", exception); + } + return new JSONObject(); + } + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentParseService.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentParseService.java index 6908ca4..f32ebd2 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentParseService.java +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentParseService.java @@ -1,6 +1,7 @@ package com.easyagents.document.pdf; import com.easyagents.document.core.DocumentParseService; +import com.easyagents.document.core.entity.PdfParseRequest; /** * PDF 文档解析服务。 @@ -8,5 +9,5 @@ import com.easyagents.document.core.DocumentParseService; * @author Codex * @since 2026-04-14 */ -public interface PdfDocumentParseService extends DocumentParseService { +public interface PdfDocumentParseService extends DocumentParseService { } diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java index 8132893..7238afb 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java @@ -1,31 
+1,23 @@ package com.easyagents.document.pdf.mineru; -import com.easyagents.core.util.StringUtil; -import com.easyagents.document.core.exception.DocumentParseException; -import com.easyagents.document.core.model.ParseRequest; -import com.easyagents.document.core.model.ParseResponse; -import com.easyagents.document.core.model.ParseTaskInfo; -import com.easyagents.document.core.model.ParseTaskStatus; +import com.easyagents.document.core.mineru.MineruClient; +import com.easyagents.document.core.mineru.MineruDocumentParseService; +import com.easyagents.document.core.mineru.MineruMapper; +import com.easyagents.document.core.mineru.MineruProperties; +import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.PdfParseRequest; import com.easyagents.document.pdf.PdfDocumentProvider; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; +import com.easyagents.core.util.StringUtil; /** * 基于 MinerU API 的 PDF 解析服务。 * * @author Codex - * @since 2026-04-14 + * @since 2026-04-16 */ -public class MineruPdfDocumentParseService implements PdfDocumentProvider { +public class MineruPdfDocumentParseService extends MineruDocumentParseService implements PdfDocumentProvider { public static final String PROVIDER_NAME = "mineru"; - private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class); - - private final MineruProperties properties; - private final MineruPdfClient client; - private final MineruMapper mapper; /** * 创建默认服务实例。 @@ -33,7 +25,7 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider { * @param properties MinerU 配置 */ public MineruPdfDocumentParseService(MineruProperties properties) { - this(properties, new MineruMapper(properties)); + super(properties); } /** @@ -43,7 +35,7 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider { * @param mapper 结果映射器 */ public MineruPdfDocumentParseService(MineruProperties properties, 
MineruMapper mapper) { - this(properties, new MineruPdfClient(properties, mapper), mapper); + super(properties, mapper); } /** @@ -53,10 +45,8 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider { * @param client HTTP 客户端 * @param mapper 结果映射器 */ - public MineruPdfDocumentParseService(MineruProperties properties, MineruPdfClient client, MineruMapper mapper) { - this.properties = properties; - this.client = client; - this.mapper = mapper; + public MineruPdfDocumentParseService(MineruProperties properties, MineruClient client, MineruMapper mapper) { + super(properties, client, mapper); } @Override @@ -65,145 +55,21 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider { } @Override - public ParseResponse parse(ParseRequest request) { - ParseRequest normalizedRequest = normalizeRequest(request); - LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}", - PROVIDER_NAME, - normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(), - normalizedRequest.getBackend(), - normalizedRequest.getParseMethod()); - ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest)); - LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}", - PROVIDER_NAME, - normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(), - response == null || response.getResults() == null ? 
0 : response.getResults().size()); - return response; - } - - @Override - public ParseTaskStatus submit(ParseRequest request) { - ParseRequest normalizedRequest = normalizeRequest(request); - // 异步结果固定走全量 ZIP,调用方无需传入裁剪参数。 - normalizedRequest.setReturnMarkdown(true); - normalizedRequest.setReturnMiddleJson(true); - normalizedRequest.setReturnContentList(true); - normalizedRequest.setReturnModelOutput(true); - normalizedRequest.setReturnImages(true); - LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}", - PROVIDER_NAME, - normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(), - normalizedRequest.getBackend(), - normalizedRequest.getParseMethod()); - ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest)); - LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}", - PROVIDER_NAME, - taskStatus == null ? null : taskStatus.getTaskId(), - taskStatus == null ? null : taskStatus.getStatus()); - return taskStatus; - } - - @Override - public ParseTaskStatus queryTask(String taskId) { - validateTaskId(taskId); - ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId)); - LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}", - PROVIDER_NAME, - taskId, - taskStatus == null ? null : taskStatus.getStatus()); - return taskStatus; - } - - @Override - public ParseResponse queryResult(String taskId) { - validateTaskId(taskId); - LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId); - MineruTaskStatus taskStatus = waitForTaskCompleted(taskId); - ParseResponse response = mapper.fromZip(client.queryResultZip(taskId)); - mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion()); - LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}", - PROVIDER_NAME, - taskId, - response == null || response.getResults() == null ? 
0 : response.getResults().size()); - return response; - } - - @Override - public ParseTaskInfo queryTaskInfo(String taskId) { - validateTaskId(taskId); - MineruTaskStatus taskStatus = client.queryTask(taskId); - ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(mapper.toParseTaskStatus(taskStatus)); - if ("completed".equalsIgnoreCase(taskStatus.getStatus())) { - ParseResponse response = mapper.fromZip(client.queryResultZip(taskId)); - mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion()); - taskInfo.setResult(response); - } - LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}", - PROVIDER_NAME, - taskId, - taskInfo == null ? null : taskInfo.getStatus(), - taskInfo != null && taskInfo.getResult() != null); - return taskInfo; - } - - private ParseRequest normalizeRequest(ParseRequest request) { - if (request == null) { - throw new IllegalArgumentException("ParseRequest must not be null"); - } - if (request.getFiles() == null || request.getFiles().isEmpty()) { - throw new IllegalArgumentException("ParseRequest files must not be empty"); - } - ParseRequest normalizedRequest = new ParseRequest(); - normalizedRequest.setFiles(new ArrayList<>(request.getFiles())); - normalizedRequest.setBackend(StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend()); - normalizedRequest.setParseMethod(StringUtil.hasText(request.getParseMethod()) ? request.getParseMethod() : properties.getDefaultParseMethod()); - normalizedRequest.setLanguages( - request.getLanguages() == null || request.getLanguages().isEmpty() - ? 
new ArrayList(properties.getDefaultLangList()) - : new ArrayList(request.getLanguages()) + protected ParseRequest normalizeRequest(ParseRequest request) { + PdfParseRequest normalizedRequest = PdfParseRequest.from(request); + ParseRequest commonRequest = super.normalizeRequest(normalizedRequest); + commonRequest.copyCommonFieldsTo(normalizedRequest); + normalizedRequest.setParseMethod( + StringUtil.hasText(normalizedRequest.getParseMethod()) ? normalizedRequest.getParseMethod() : getProperties().getDefaultParseMethod() ); - normalizedRequest.setFormulaEnabled(request.getFormulaEnabled() == null ? properties.getDefaultFormulaEnable() : request.getFormulaEnabled()); - normalizedRequest.setTableEnabled(request.getTableEnabled() == null ? properties.getDefaultTableEnable() : request.getTableEnabled()); - normalizedRequest.setStartPageIndex(request.getStartPageIndex() == null ? 0 : request.getStartPageIndex()); - normalizedRequest.setEndPageIndex(request.getEndPageIndex() == null ? 99999 : request.getEndPageIndex()); - normalizedRequest.setReturnMarkdown(request.getReturnMarkdown()); - normalizedRequest.setReturnMiddleJson(request.getReturnMiddleJson()); - normalizedRequest.setReturnContentList(request.getReturnContentList()); - normalizedRequest.setReturnModelOutput(request.getReturnModelOutput()); - normalizedRequest.setReturnImages(request.getReturnImages()); + normalizedRequest.setFormulaEnabled( + normalizedRequest.getFormulaEnabled() == null ? getProperties().getDefaultFormulaEnable() : normalizedRequest.getFormulaEnabled() + ); + normalizedRequest.setTableEnabled( + normalizedRequest.getTableEnabled() == null ? getProperties().getDefaultTableEnable() : normalizedRequest.getTableEnabled() + ); + normalizedRequest.setStartPageIndex(normalizedRequest.getStartPageIndex() == null ? 0 : normalizedRequest.getStartPageIndex()); + normalizedRequest.setEndPageIndex(normalizedRequest.getEndPageIndex() == null ? 
99999 : normalizedRequest.getEndPageIndex()); return normalizedRequest; } - - private void validateTaskId(String taskId) { - if (!StringUtil.hasText(taskId)) { - throw new IllegalArgumentException("taskId must not be empty"); - } - } - - /** - * 轮询任务状态直到完成或失败。 - * - * @param taskId 任务 ID - * @return 已完成的任务状态 - */ - private MineruTaskStatus waitForTaskCompleted(String taskId) { - long deadline = System.currentTimeMillis() + properties.getResultTimeoutMs(); - while (true) { - MineruTaskStatus taskStatus = client.queryTask(taskId); - if ("completed".equals(taskStatus.getStatus())) { - return taskStatus; - } - if ("failed".equals(taskStatus.getStatus())) { - throw new DocumentParseException("MinerU task failed: " + taskStatus.getError()); - } - if (System.currentTimeMillis() >= deadline) { - throw new DocumentParseException("MinerU task result timeout: " + taskId); - } - try { - Thread.sleep(properties.getPollIntervalMs()); - } catch (InterruptedException exception) { - Thread.currentThread().interrupt(); - throw new DocumentParseException("Interrupted while waiting for MinerU task: " + taskId, exception); - } - } - } } diff --git a/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java index 7fbc349..115a803 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java +++ b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java @@ -2,10 +2,13 @@ package com.easyagents.document.pdf.mineru; import com.alibaba.fastjson2.JSONArray; import com.alibaba.fastjson2.JSONObject; +import com.easyagents.document.core.mineru.MineruMapper; +import com.easyagents.document.core.mineru.MineruProperties; +import com.easyagents.document.core.mineru.MineruResultPayload; import 
com.easyagents.document.core.exception.DocumentParseException; -import com.easyagents.document.core.model.ParseRequest; -import com.easyagents.document.core.model.ParseResponse; -import com.easyagents.document.core.model.ParseResult; +import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseResult; import org.junit.Assert; import org.junit.Test; @@ -41,6 +44,7 @@ public class MineruMapperTest { Assert.assertFalse(result.getBlocks().isEmpty()); Assert.assertEquals(1, result.getTables().size()); Assert.assertEquals(2, result.getImages().size()); + Assert.assertNotNull(result.getImages().get(0).getContent()); Assert.assertNotNull(result.getArtifacts().getMiddleJson()); Assert.assertNotNull(result.getArtifacts().getContentList()); } @@ -56,6 +60,7 @@ public class MineruMapperTest { Assert.assertEquals("# title", result.getPlainText()); Assert.assertEquals(1, result.getTables().size()); Assert.assertEquals(2, result.getImages().size()); + Assert.assertNotNull(result.getImages().get(0).getContent()); Assert.assertNotNull(result.getArtifacts().getExtraJsonArtifacts().get("contentListV2")); } diff --git a/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseServiceTest.java b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseServiceTest.java index 8cab0be..768d255 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseServiceTest.java +++ b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseServiceTest.java @@ -1,11 +1,16 @@ package com.easyagents.document.pdf.mineru; import com.alibaba.fastjson2.JSONObject; -import com.easyagents.document.core.model.ParseFile; -import 
com.easyagents.document.core.model.ParseRequest; -import com.easyagents.document.core.model.ParseResponse; -import com.easyagents.document.core.model.ParseTaskInfo; -import com.easyagents.document.core.model.ParseTaskStatus; +import com.easyagents.document.core.mineru.MineruClient; +import com.easyagents.document.core.mineru.MineruMapper; +import com.easyagents.document.core.mineru.MineruProperties; +import com.easyagents.document.core.mineru.MineruResultPayload; +import com.easyagents.document.core.mineru.MineruTaskStatus; +import com.easyagents.document.core.entity.ParseFile; +import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; import okhttp3.Request; import okio.Buffer; import org.junit.Assert; @@ -147,7 +152,7 @@ public class MineruPdfDocumentParseServiceTest { return count; } - private static class RecordingClient extends MineruPdfClient { + private static class RecordingClient extends MineruClient { private ParseRequest lastParseRequest; private ParseRequest lastSubmitRequest; @@ -248,7 +253,7 @@ public class MineruPdfDocumentParseServiceTest { } } - private static class InspectingMultipartClient extends MineruPdfClient { + private static class InspectingMultipartClient extends MineruClient { private String lastMultipartBody; diff --git a/easy-agents-document/easy-agents-document-pptx/pom.xml b/easy-agents-document/easy-agents-document-pptx/pom.xml new file mode 100644 index 0000000..30ed3e0 --- /dev/null +++ b/easy-agents-document/easy-agents-document-pptx/pom.xml @@ -0,0 +1,44 @@ + + + 4.0.0 + + + com.easyagents + easy-agents-document + ${revision} + + + easy-agents-document-pptx + easy-agents-document-pptx + + + 8 + 8 + UTF-8 + + + + + com.easyagents + easy-agents-document-core + + + + com.easyagents + easy-agents-core + + + + com.alibaba.fastjson2 + fastjson2 + + + + junit + 
junit + test + + + diff --git a/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/PptxDocumentParseService.java b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/PptxDocumentParseService.java new file mode 100644 index 0000000..b00846b --- /dev/null +++ b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/PptxDocumentParseService.java @@ -0,0 +1,13 @@ +package com.easyagents.document.pptx; + +import com.easyagents.document.core.DocumentParseService; +import com.easyagents.document.core.entity.PptxParseRequest; + +/** + * PPTX document parsing service. + * + * @author Codex + * @since 2026-04-16 + */ +public interface PptxDocumentParseService extends DocumentParseService { +} diff --git a/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/PptxDocumentProvider.java b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/PptxDocumentProvider.java new file mode 100644 index 0000000..108569e --- /dev/null +++ b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/PptxDocumentProvider.java @@ -0,0 +1,17 @@ +package com.easyagents.document.pptx; + +/** + * PPTX provider SPI. + * + * @author Codex + * @since 2026-04-16 + */ +public interface PptxDocumentProvider extends PptxDocumentParseService { + + /** + * Returns the provider identifier. + * + * @return provider name + */ + String getProvider(); +} diff --git a/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/mineru/MineruPptxDocumentParseService.java b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/mineru/MineruPptxDocumentParseService.java new file mode 100644 index 0000000..6fd4f0a --- /dev/null +++ b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/mineru/MineruPptxDocumentParseService.java @@ -0,0 
+1,408 @@ +package com.easyagents.document.pptx.mineru; + +import com.easyagents.core.util.StringUtil; +import com.easyagents.document.core.async.DocumentAsyncTaskManager; +import com.easyagents.document.core.async.DocumentAsyncTaskRepository; +import com.easyagents.document.core.async.DocumentAsyncTaskUpdater; +import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository; +import com.easyagents.document.core.mineru.MineruClient; +import com.easyagents.document.core.mineru.MineruMapper; +import com.easyagents.document.core.mineru.MineruProperties; +import com.easyagents.document.core.entity.DocumentBlock; +import com.easyagents.document.core.entity.DocumentImage; +import com.easyagents.document.core.entity.DocumentPage; +import com.easyagents.document.core.entity.DocumentTable; +import com.easyagents.document.core.entity.ParseFile; +import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseResult; +import com.easyagents.document.core.entity.PptxParseRequest; +import com.easyagents.document.core.support.AbstractAsyncDocumentParseService; +import com.easyagents.document.pptx.PptxDocumentProvider; +import com.easyagents.document.pptx.model.PptxParseArtifact; +import com.easyagents.document.pptx.model.PptxSlideArtifact; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFSlide; + +import javax.imageio.ImageIO; +import java.awt.Color; +import java.awt.Dimension; +import java.awt.Graphics2D; +import java.awt.RenderingHints; +import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * MinerU-based PPTX document parsing service: each slide is rendered to an image and the image is sent to MinerU for OCR. + * + * @author Codex + * @since 2026-04-16 + */ +public class 
MineruPptxDocumentParseService extends AbstractAsyncDocumentParseService implements PptxDocumentProvider { + + public static final String PROVIDER_NAME = "mineru"; + + private final MineruProperties properties; + private final MineruClient client; + private final MineruMapper mapper; + + /** + * Creates a service instance with a default mapper, client and task manager. + * + * @param properties MinerU configuration + */ + public MineruPptxDocumentParseService(MineruProperties properties) { + this(properties, new MineruMapper(properties)); + } + + /** + * Creates a service instance with a default client and task manager. + * + * @param properties MinerU configuration + * @param mapper MinerU mapper + */ + public MineruPptxDocumentParseService(MineruProperties properties, MineruMapper mapper) { + this(properties, new MineruClient(properties, mapper), mapper, defaultTaskManager()); + } + + /** + * Creates a service instance with a default mapper and client. + * + * @param properties MinerU configuration + * @param taskManager asynchronous task manager + */ + public MineruPptxDocumentParseService(MineruProperties properties, DocumentAsyncTaskManager taskManager) { + this(properties, new MineruMapper(properties), taskManager); + } + + /** + * Creates a service instance with a default client. + * + * @param properties MinerU configuration + * @param mapper MinerU mapper + * @param taskManager asynchronous task manager + */ + public MineruPptxDocumentParseService(MineruProperties properties, + MineruMapper mapper, + DocumentAsyncTaskManager taskManager) { + this(properties, new MineruClient(properties, mapper), mapper, taskManager); + } + + /** + * Creates a service instance from fully supplied collaborators. + * + * @param properties MinerU configuration + * @param client MinerU client + * @param mapper MinerU mapper + * @param taskManager asynchronous task manager + */ + public MineruPptxDocumentParseService(MineruProperties properties, + MineruClient client, + MineruMapper mapper, + DocumentAsyncTaskManager taskManager) { + super(taskManager); + this.properties = properties; + this.client = client; + this.mapper = mapper; + } + + @Override + public String getProvider() { + return PROVIDER_NAME; + } + + @Override + protected PptxParseRequest normalizeRequest(ParseRequest request) { + PptxParseRequest normalized = PptxParseRequest.from(request); + if 
(normalized.getFiles() == null || normalized.getFiles().isEmpty()) { + throw new IllegalArgumentException("PptxParseRequest files must not be empty"); + } + normalized.setBackend(StringUtil.hasText(normalized.getBackend()) ? normalized.getBackend() : properties.getDefaultBackend()); + if (normalized.getLanguages() == null || normalized.getLanguages().isEmpty()) { + normalized.setLanguages(new ArrayList(properties.getDefaultLangList())); + } + normalized.setReturnMarkdown(normalized.getReturnMarkdown() == null ? Boolean.TRUE : normalized.getReturnMarkdown()); + normalized.setReturnMiddleJson(normalized.getReturnMiddleJson() == null ? Boolean.TRUE : normalized.getReturnMiddleJson()); + normalized.setReturnContentList(normalized.getReturnContentList() == null ? Boolean.TRUE : normalized.getReturnContentList()); + normalized.setReturnModelOutput(normalized.getReturnModelOutput() == null ? Boolean.FALSE : normalized.getReturnModelOutput()); + normalized.setReturnImages(normalized.getReturnImages() == null ? Boolean.TRUE : normalized.getReturnImages()); + normalized.setRenderScale(normalized.getRenderScale() == null || normalized.getRenderScale() <= 0 ? 2.0d : normalized.getRenderScale()); + normalized.setImageFormat(normalizeImageFormat(normalized.getImageFormat())); + normalized.setIncludeSlideImageReference( + normalized.getIncludeSlideImageReference() == null ? 
Boolean.TRUE : normalized.getIncludeSlideImageReference() + ); + return normalized; + } + + @Override + protected ParseResponse doParse(PptxParseRequest request, DocumentAsyncTaskUpdater updater) { + ParseResponse response = new ParseResponse(); + List results = new ArrayList(); + int totalSlides = countSlides(request); + int processedSlides = 0; + String backend = null; + String version = null; + + for (ParseFile file : request.getFiles()) { + ParseResult result = parseSinglePptx(file, request, updater, processedSlides, totalSlides); + processedSlides += Integer.parseInt(String.valueOf(result.getMetadata().get("slideCount"))); /* slideCount, ocrBackend and ocrVersion are temporary metadata entries written by parseSinglePptx; they are read here and removed below before the result is returned */ + if (backend == null) { + backend = (String) result.getMetadata().get("ocrBackend"); + } + if (version == null) { + version = (String) result.getMetadata().get("ocrVersion"); + } + result.getMetadata().remove("slideCount"); + result.getMetadata().remove("ocrBackend"); + result.getMetadata().remove("ocrVersion"); + results.add(result); + } + response.setBackend(StringUtil.hasText(backend) ? backend : request.getBackend()); + response.setVersion(version); + response.setResults(results); + return response; + } + + private ParseResult parseSinglePptx(ParseFile file, + PptxParseRequest request, + DocumentAsyncTaskUpdater updater, + int processedSlidesBefore, + int totalSlides) { + ParseResult aggregate = new ParseResult(); + aggregate.setFileName(file.getFileName()); + StringBuilder markdownBuilder = new StringBuilder(); + PptxParseArtifact artifact = new PptxParseArtifact(); + String backend = null; + String version = null; + int slideCount = 0; + + try (XMLSlideShow slideShow = new XMLSlideShow(new ByteArrayInputStream(file.getContent()))) { + List slides = slideShow.getSlides(); + Dimension pageSize = slideShow.getPageSize(); + int startSlide = request.getStartSlideIndex() == null ? 0 : Math.max(request.getStartSlideIndex(), 0); + int endSlide = request.getEndSlideIndex() == null + ? 
slides.size() - 1 + : Math.min(request.getEndSlideIndex(), slides.size() - 1); + if (endSlide < startSlide) { + endSlide = startSlide - 1; /* empty range: the loop below runs zero times */ + } + + for (int slideIndex = startSlide; slideIndex <= endSlide; slideIndex++) { + XSLFSlide slide = slides.get(slideIndex); + slideCount++; + updateProgress(updater, "extracting", processedSlidesBefore + slideCount - 1, totalSlides, + "正在渲染第 " + (slideIndex + 1) + " 页幻灯片"); + + byte[] imageBytes = renderSlide(slide, pageSize, request.getRenderScale(), request.getImageFormat()); + String imagePath = buildImagePath(slideIndex, request.getImageFormat()); + String imageName = buildImageName(slideIndex); + + updateProgress(updater, "ocr", processedSlidesBefore + slideCount - 1, totalSlides, + "正在识别第 " + (slideIndex + 1) + " 页幻灯片"); + ParseResult ocrResult = parseSlideImage(slideIndex, imageBytes, request, imagePath); + + if (!StringUtil.hasText(backend)) { + backend = (String) ocrResult.getMetadata().get("middleBackend"); + } + if (!StringUtil.hasText(version)) { + version = (String) ocrResult.getMetadata().get("middleVersion"); + } + + appendSlideMarkdown(markdownBuilder, slideIndex, imageName, imagePath, request, ocrResult.getMarkdown()); + aggregate.getImages().add(buildSlideImage(slideIndex, imageName, imagePath, request.getImageFormat(), imageBytes)); + aggregate.getPages().add(buildPage(slideIndex, pageSize, request.getRenderScale())); + mergeOcrResult(aggregate, slideIndex, ocrResult); + artifact.getSlides().add(buildSlideArtifact(slideIndex, slide, imageName, imagePath, ocrResult)); + } + } catch (IOException exception) { + throw new IllegalStateException("Failed to parse PPTX file: " + file.getFileName(), exception); + } + + updateProgress(updater, "assembling", processedSlidesBefore + slideCount, totalSlides, "正在汇总 PPTX 解析结果"); + aggregate.setMarkdown(markdownBuilder.toString().trim()); + aggregate.setPlainText(aggregate.getMarkdown()); + aggregate.getArtifacts().getExtraJsonArtifacts().put("pptx", artifact); + 
aggregate.getMetadata().put("slideCount", slideCount); + aggregate.getMetadata().put("ocrBackend", backend); + aggregate.getMetadata().put("ocrVersion", version); + return aggregate; + } + + private ParseResult parseSlideImage(int slideIndex, byte[] imageBytes, PptxParseRequest request, String imagePath) { + ParseRequest imageRequest = new ParseRequest(); + imageRequest.addFile(ParseFile.of("slide-" + (slideIndex + 1) + "." + request.getImageFormat(), imageBytes, "image/" + request.getImageFormat())); + imageRequest.setBackend(request.getBackend()); + imageRequest.setLanguages(request.getLanguages()); + imageRequest.setReturnMarkdown(true); + imageRequest.setReturnMiddleJson(true); + imageRequest.setReturnContentList(true); + imageRequest.setReturnModelOutput(false); + imageRequest.setReturnImages(false); + ParseResponse response = mapper.toParseResponse(client.parse(imageRequest)); + ParseResult result = response.getResults().isEmpty() ? new ParseResult() : response.getResults().get(0); + if (!StringUtil.hasText(result.getMarkdown())) { + result.setMarkdown(result.getPlainText()); + } + result.getMetadata().put("slideImagePath", imagePath); + return result; + } + + private void appendSlideMarkdown(StringBuilder markdownBuilder, + int slideIndex, + String imageName, + String imagePath, + PptxParseRequest request, + String ocrMarkdown) { + if (markdownBuilder.length() > 0) { + markdownBuilder.append("\n\n"); + } + markdownBuilder.append("# Slide ").append(slideIndex + 1).append("\n\n"); + if (Boolean.TRUE.equals(request.getIncludeSlideImageReference())) { + markdownBuilder.append("![").append(imageName).append("](").append(imagePath).append(")\n\n"); + } + if (StringUtil.hasText(ocrMarkdown)) { + markdownBuilder.append(ocrMarkdown.trim()); + } + } + + private DocumentImage buildSlideImage(int slideIndex, String imageName, String imagePath, String imageFormat, byte[] imageBytes) { + DocumentImage image = new DocumentImage(); + image.setPageIndex(slideIndex); + 
image.setName(imageName); + image.setSourcePath(imagePath); + image.setMimeType("image/" + imageFormat); /* NOTE(review): normalizeImageFormat yields "jpg" for JPEG output, so this produces "image/jpg"; the registered media type is image/jpeg */ + image.setContent(imageBytes); + return image; + } + + private DocumentPage buildPage(int slideIndex, Dimension pageSize, Double renderScale) { + DocumentPage page = new DocumentPage(); + page.setPageIndex(slideIndex); + page.setWidth(pageSize.getWidth() * renderScale); + page.setHeight(pageSize.getHeight() * renderScale); + return page; + } + + private void mergeOcrResult(ParseResult aggregate, int slideIndex, ParseResult ocrResult) { + for (DocumentBlock block : ocrResult.getBlocks()) { + block.setPageIndex(slideIndex); + aggregate.getBlocks().add(block); + } + for (DocumentTable table : ocrResult.getTables()) { + table.setPageIndex(slideIndex); + aggregate.getTables().add(table); + } + for (String warning : ocrResult.getWarnings()) { + aggregate.getWarnings().add("Slide " + (slideIndex + 1) + ": " + warning); + } + } + + private PptxSlideArtifact buildSlideArtifact(int slideIndex, + XSLFSlide slide, + String imageName, + String imagePath, + ParseResult ocrResult) { + PptxSlideArtifact artifact = new PptxSlideArtifact(); + artifact.setSlideIndex(slideIndex); + artifact.setTitle(slide.getTitle()); + artifact.setImageName(imageName); + artifact.setImagePath(imagePath); + artifact.setOcrMarkdown(ocrResult.getMarkdown()); + artifact.setMiddleJson(ocrResult.getArtifacts().getMiddleJson()); + artifact.setContentList(ocrResult.getArtifacts().getContentList()); + artifact.setWarnings(new ArrayList(ocrResult.getWarnings())); + return artifact; + } + + private byte[] renderSlide(XSLFSlide slide, Dimension pageSize, Double renderScale, String imageFormat) throws IOException { + double scale = renderScale == null ? 
2.0d : renderScale; + int width = Math.max(1, (int) Math.round(pageSize.getWidth() * scale)); + int height = Math.max(1, (int) Math.round(pageSize.getHeight() * scale)); + BufferedImage image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); + Graphics2D graphics = image.createGraphics(); + try { + graphics.setColor(Color.WHITE); + graphics.fillRect(0, 0, width, height); + graphics.scale(scale, scale); + graphics.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON); + graphics.setRenderingHint(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_QUALITY); + graphics.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC); + slide.draw(graphics); + } finally { + graphics.dispose(); + } + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + ImageIO.write(image, imageFormat, outputStream); /* NOTE(review): the boolean result of ImageIO.write is ignored; it is false when no writer exists for the format, which would leave the returned array empty */ + return outputStream.toByteArray(); + } + + private int countSlides(PptxParseRequest request) { + int totalSlides = 0; + for (ParseFile file : request.getFiles()) { + try (XMLSlideShow slideShow = new XMLSlideShow(new ByteArrayInputStream(file.getContent()))) { + int slideSize = slideShow.getSlides().size(); + int startSlide = request.getStartSlideIndex() == null ? 0 : Math.max(request.getStartSlideIndex(), 0); + int endSlide = request.getEndSlideIndex() == null + ? slideSize - 1 + : Math.min(request.getEndSlideIndex(), slideSize - 1); + if (endSlide >= startSlide) { + totalSlides += endSlide - startSlide + 1; + } + } catch (IOException exception) { + throw new IllegalStateException("Failed to inspect PPTX slide count: " + file.getFileName(), exception); + } + } + return totalSlides; + } + + private void updateProgress(DocumentAsyncTaskUpdater updater, + String stage, + int processedItems, + int totalItems, + String message) { + if (updater == null) { + return; + } + int safeTotal = totalItems <= 0 ? 
1 : totalItems; + int percent = (int) Math.min(99, Math.round(processedItems * 100.0d / safeTotal)); + updater.update(stage, percent, processedItems, totalItems, message); + } + + private String normalizeImageFormat(String imageFormat) { + if ("jpg".equalsIgnoreCase(imageFormat) || "jpeg".equalsIgnoreCase(imageFormat)) { + return "jpg"; + } + return "png"; + } + + private String buildImagePath(int slideIndex, String imageFormat) { + return "images/slide-" + formatIndex(slideIndex) + "/page." + imageFormat; + } + + private String buildImageName(int slideIndex) { + return "slide-" + formatIndex(slideIndex) + "-page"; + } + + private String formatIndex(int slideIndex) { + int displayIndex = slideIndex + 1; + if (displayIndex < 10) { + return "00" + displayIndex; + } + if (displayIndex < 100) { + return "0" + displayIndex; + } + return String.valueOf(displayIndex); + } + + private static DocumentAsyncTaskManager defaultTaskManager() { + DocumentAsyncTaskRepository repository = new InMemoryDocumentAsyncTaskRepository(); + ExecutorService executorService = Executors.newFixedThreadPool(2); /* NOTE(review): nothing in this class shuts this pool down, and every service built without an explicit task manager creates a new one; confirm that DocumentAsyncTaskManager or the caller owns its lifecycle */ + return new DocumentAsyncTaskManager(repository, executorService); + } +} diff --git a/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/model/PptxParseArtifact.java b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/model/PptxParseArtifact.java new file mode 100644 index 0000000..99d5942 --- /dev/null +++ b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/model/PptxParseArtifact.java @@ -0,0 +1,23 @@ +package com.easyagents.document.pptx.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * PPTX structured artifact. + * + * @author Codex + * @since 2026-04-16 + */ +public class PptxParseArtifact { + + private List slides = new ArrayList(); + + public List getSlides() { + return slides; + } + + public void setSlides(List slides) { + this.slides = slides == null 
? new ArrayList() : slides; + } +} diff --git a/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/model/PptxSlideArtifact.java b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/model/PptxSlideArtifact.java new file mode 100644 index 0000000..bd692f0 --- /dev/null +++ b/easy-agents-document/easy-agents-document-pptx/src/main/java/com/easyagents/document/pptx/model/PptxSlideArtifact.java @@ -0,0 +1,86 @@ +package com.easyagents.document.pptx.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * Artifact for a single slide. + * + * @author Codex + * @since 2026-04-16 + */ +public class PptxSlideArtifact { + + private Integer slideIndex; + private String title; + private String imageName; + private String imagePath; + private String ocrMarkdown; + private Object middleJson; + private Object contentList; + private List warnings = new ArrayList(); + + public Integer getSlideIndex() { + return slideIndex; + } + + public void setSlideIndex(Integer slideIndex) { + this.slideIndex = slideIndex; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getImageName() { + return imageName; + } + + public void setImageName(String imageName) { + this.imageName = imageName; + } + + public String getImagePath() { + return imagePath; + } + + public void setImagePath(String imagePath) { + this.imagePath = imagePath; + } + + public String getOcrMarkdown() { + return ocrMarkdown; + } + + public void setOcrMarkdown(String ocrMarkdown) { + this.ocrMarkdown = ocrMarkdown; + } + + public Object getMiddleJson() { + return middleJson; + } + + public void setMiddleJson(Object middleJson) { + this.middleJson = middleJson; + } + + public Object getContentList() { + return contentList; + } + + public void setContentList(Object contentList) { + this.contentList = contentList; + } + + public List getWarnings() { + return warnings; + } + 
+ public void setWarnings(List warnings) { + this.warnings = warnings == null ? new ArrayList() : warnings; + } +} diff --git a/easy-agents-document/easy-agents-document-pptx/src/test/java/com/easyagents/document/pptx/mineru/MineruPptxDocumentParseServiceTest.java b/easy-agents-document/easy-agents-document-pptx/src/test/java/com/easyagents/document/pptx/mineru/MineruPptxDocumentParseServiceTest.java new file mode 100644 index 0000000..f71607e --- /dev/null +++ b/easy-agents-document/easy-agents-document-pptx/src/test/java/com/easyagents/document/pptx/mineru/MineruPptxDocumentParseServiceTest.java @@ -0,0 +1,170 @@ +package com.easyagents.document.pptx.mineru; + +import com.alibaba.fastjson2.JSONArray; +import com.alibaba.fastjson2.JSONObject; +import com.easyagents.document.core.async.DocumentAsyncTaskManager; +import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository; +import com.easyagents.document.core.mineru.MineruClient; +import com.easyagents.document.core.mineru.MineruMapper; +import com.easyagents.document.core.mineru.MineruProperties; +import com.easyagents.document.core.mineru.MineruResultPayload; +import com.easyagents.document.core.entity.ParseFile; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseResult; +import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; +import com.easyagents.document.core.entity.PptxParseRequest; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFSlide; +import org.apache.poi.xslf.usermodel.XSLFTextBox; +import org.junit.Assert; +import org.junit.Test; + +import java.awt.Rectangle; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.concurrent.Executor; + +/** + * Tests for the MinerU PPTX service. + * + * @author Codex + * @since 2026-04-16 + */ +public class MineruPptxDocumentParseServiceTest { + + @Test + public void 
shouldBuildMarkdownAndImagesForSlides() throws IOException { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruPptxDocumentParseService service = new MineruPptxDocumentParseService( + defaultProperties(), + client, + mapper, + new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor()) + ); + + PptxParseRequest request = new PptxParseRequest(); + request.addFile(ParseFile.of("demo.pptx", buildPptxBytes())); + + ParseResponse response = service.parse(request); + + Assert.assertEquals(1, response.getResults().size()); + ParseResult result = response.getResults().get(0); + Assert.assertTrue(result.getMarkdown().contains("# Slide 1")); + Assert.assertTrue(result.getMarkdown().contains("images/slide-001/page.png")); + Assert.assertTrue(result.getMarkdown().contains("slide-ocr-1")); + Assert.assertEquals(2, result.getImages().size()); + Assert.assertNotNull(result.getImages().get(0).getContent()); + Assert.assertNotNull(result.getArtifacts().getExtraJsonArtifacts().get("pptx")); + } + + @Test + public void shouldSupportAsyncTaskFlow() throws IOException { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruPptxDocumentParseService service = new MineruPptxDocumentParseService( + defaultProperties(), + client, + mapper, + new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor()) + ); + + PptxParseRequest request = new PptxParseRequest(); + request.addFile(ParseFile.of("demo.pptx", buildPptxBytes())); + + ParseTaskStatus status = service.submit(request); + ParseTaskInfo taskInfo = service.queryTaskInfo(status.getTaskId()); + + Assert.assertEquals("completed", taskInfo.getStatus()); + Assert.assertNotNull(taskInfo.getResult()); + Assert.assertEquals(1, taskInfo.getResult().getResults().size()); + } + + private byte[] buildPptxBytes() 
throws IOException { + XMLSlideShow slideShow = new XMLSlideShow(); + slideShow.setPageSize(new java.awt.Dimension(640, 360)); + createSlide(slideShow, "第一页"); + createSlide(slideShow, "第二页"); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + slideShow.write(outputStream); + slideShow.close(); + return outputStream.toByteArray(); + } + + private void createSlide(XMLSlideShow slideShow, String text) { + XSLFSlide slide = slideShow.createSlide(); + XSLFTextBox textBox = slide.createTextBox(); + textBox.setAnchor(new Rectangle(20, 20, 300, 80)); + textBox.setText(text); + } + + private MineruProperties defaultProperties() { + MineruProperties properties = new MineruProperties(); + properties.setBaseUrl("http://127.0.0.1:8000"); + return properties; + } + + private Executor directExecutor() { + return new Executor() { + @Override + public void execute(Runnable command) { + command.run(); + } + }; + } + + private static class RecordingClient extends MineruClient { + + private int parseCount; + + private RecordingClient(MineruProperties properties) { + super(properties, new MineruMapper(properties)); + } + + @Override + public MineruResultPayload parse(com.easyagents.document.core.entity.ParseRequest request) { + parseCount++; + return new MineruMapper(testProperties()).toResultPayload(syncPayload(parseCount)); + } + + private JSONObject syncPayload(int index) { + JSONObject payload = new JSONObject(); + payload.put("backend", "vlm-http-client"); + payload.put("version", "3.0.9"); + JSONObject result = new JSONObject(); + result.put("md_content", "slide-ocr-" + index); + result.put("middle_json", middleJson()); + result.put("content_list", contentList(index)); + JSONObject results = new JSONObject(); + results.put("slide-" + index, result); + payload.put("results", results); + return payload; + } + + private JSONObject middleJson() { + JSONObject middleJson = new JSONObject(); + middleJson.put("_backend", "vlm-http-client"); + 
middleJson.put("_version_name", "3.0.9"); + middleJson.put("pdf_info", new JSONArray()); + return middleJson; + } + + private JSONArray contentList(int index) { + JSONArray contentList = new JSONArray(); + JSONObject text = new JSONObject(); + text.put("type", "text"); + text.put("text", "slide-ocr-" + index); + text.put("page_idx", 0); + text.put("bbox", new JSONArray()); + contentList.add(text); + return contentList; + } + + private static MineruProperties testProperties() { + MineruProperties properties = new MineruProperties(); + properties.setBaseUrl("http://127.0.0.1:8000"); + return properties; + } + } +} diff --git a/easy-agents-document/easy-agents-document-xlsx/pom.xml b/easy-agents-document/easy-agents-document-xlsx/pom.xml new file mode 100644 index 0000000..63b8270 --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/pom.xml @@ -0,0 +1,44 @@ + + + 4.0.0 + + + com.easyagents + easy-agents-document + ${revision} + + + easy-agents-document-xlsx + easy-agents-document-xlsx + + + 8 + 8 + UTF-8 + + + + + com.easyagents + easy-agents-document-core + + + + com.easyagents + easy-agents-core + + + + com.alibaba.fastjson2 + fastjson2 + + + + junit + junit + test + + + diff --git a/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/XlsxDocumentParseService.java b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/XlsxDocumentParseService.java new file mode 100644 index 0000000..9efd80e --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/XlsxDocumentParseService.java @@ -0,0 +1,13 @@ +package com.easyagents.document.xlsx; + +import com.easyagents.document.core.DocumentParseService; +import com.easyagents.document.core.entity.XlsxParseRequest; + +/** + * XLSX 文档解析服务。 + * + * @author Codex + * @since 2026-04-16 + */ +public interface XlsxDocumentParseService extends DocumentParseService { +} diff --git 
a/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/XlsxDocumentProvider.java b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/XlsxDocumentProvider.java new file mode 100644 index 0000000..f881b09 --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/XlsxDocumentProvider.java @@ -0,0 +1,17 @@ +package com.easyagents.document.xlsx; + +/** + * XLSX provider SPI。 + * + * @author Codex + * @since 2026-04-16 + */ +public interface XlsxDocumentProvider extends XlsxDocumentParseService { + + /** + * 获取 provider 标识。 + * + * @return provider 名称 + */ + String getProvider(); +} diff --git a/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/mineru/MineruXlsxDocumentParseService.java b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/mineru/MineruXlsxDocumentParseService.java new file mode 100644 index 0000000..8d68499 --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/mineru/MineruXlsxDocumentParseService.java @@ -0,0 +1,625 @@ +package com.easyagents.document.xlsx.mineru; + +import com.easyagents.core.util.StringUtil; +import com.easyagents.document.core.async.DocumentAsyncTaskManager; +import com.easyagents.document.core.async.DocumentAsyncTaskRepository; +import com.easyagents.document.core.async.DocumentAsyncTaskUpdater; +import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository; +import com.easyagents.document.core.mineru.MineruClient; +import com.easyagents.document.core.mineru.MineruMapper; +import com.easyagents.document.core.mineru.MineruProperties; +import com.easyagents.document.core.entity.DocumentImage; +import com.easyagents.document.core.entity.ParseFile; +import com.easyagents.document.core.entity.ParseRequest; +import 
com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseResult; +import com.easyagents.document.core.entity.XlsxParseRequest; +import com.easyagents.document.core.support.AbstractAsyncDocumentParseService; +import com.easyagents.document.xlsx.XlsxDocumentProvider; +import com.easyagents.document.xlsx.model.XlsxCellArtifact; +import com.easyagents.document.xlsx.model.XlsxCellImageArtifact; +import com.easyagents.document.xlsx.model.XlsxParseArtifact; +import com.easyagents.document.xlsx.model.XlsxRowArtifact; +import com.easyagents.document.xlsx.model.XlsxSheetArtifact; +import com.easyagents.document.xlsx.model.XlsxSheetImagesArtifact; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.usermodel.FormulaEvaluator; +import org.apache.poi.ss.util.CellReference; +import org.apache.poi.xssf.usermodel.XSSFClientAnchor; +import org.apache.poi.xssf.usermodel.XSSFDrawing; +import org.apache.poi.xssf.usermodel.XSSFPicture; +import org.apache.poi.xssf.usermodel.XSSFPictureData; +import org.apache.poi.xssf.usermodel.XSSFShape; +import org.apache.poi.xssf.usermodel.XSSFSheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +import java.io.ByteArrayInputStream; +import java.net.URLConnection; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * XLSX 文档解析服务,OCR 由 mineru 提供支持 + * + * @author Codex + * @since 2026-04-16 + */ +public class MineruXlsxDocumentParseService extends AbstractAsyncDocumentParseService implements XlsxDocumentProvider { + + public static final String PROVIDER_NAME = "mineru"; + + private final MineruProperties properties; + private final MineruClient client; + private final MineruMapper mapper; + + /** + * 创建服务实例。 + * + * @param properties MinerU 配置 + */ + public 
MineruXlsxDocumentParseService(MineruProperties properties) { + this(properties, new MineruMapper(properties)); + } + + /** + * 创建服务实例。 + * + * @param properties MinerU 配置 + * @param mapper MinerU 映射器 + */ + public MineruXlsxDocumentParseService(MineruProperties properties, MineruMapper mapper) { + this(properties, new MineruClient(properties, mapper), mapper, defaultTaskManager()); + } + + /** + * 创建服务实例。 + * + * @param properties MinerU 配置 + * @param taskManager 异步任务管理器 + */ + public MineruXlsxDocumentParseService(MineruProperties properties, DocumentAsyncTaskManager taskManager) { + this(properties, new MineruMapper(properties), taskManager); + } + + /** + * 创建服务实例。 + * + * @param properties MinerU 配置 + * @param mapper MinerU 映射器 + * @param taskManager 异步任务管理器 + */ + public MineruXlsxDocumentParseService(MineruProperties properties, + MineruMapper mapper, + DocumentAsyncTaskManager taskManager) { + this(properties, new MineruClient(properties, mapper), mapper, taskManager); + } + + /** + * 创建服务实例。 + * + * @param properties MinerU 配置 + * @param client MinerU 客户端 + * @param mapper MinerU 映射器 + * @param taskManager 异步任务管理器 + */ + public MineruXlsxDocumentParseService(MineruProperties properties, + MineruClient client, + MineruMapper mapper, + DocumentAsyncTaskManager taskManager) { + super(taskManager); + this.properties = properties; + this.client = client; + this.mapper = mapper; + } + + @Override + public String getProvider() { + return PROVIDER_NAME; + } + + @Override + protected XlsxParseRequest normalizeRequest(ParseRequest request) { + XlsxParseRequest normalized = XlsxParseRequest.from(request); + if (normalized.getFiles() == null || normalized.getFiles().isEmpty()) { + throw new IllegalArgumentException("XlsxParseRequest files must not be empty"); + } + normalized.setBackend(StringUtil.hasText(normalized.getBackend()) ? 
normalized.getBackend() : properties.getDefaultBackend()); + if (normalized.getLanguages() == null || normalized.getLanguages().isEmpty()) { + normalized.setLanguages(new ArrayList(properties.getDefaultLangList())); + } + normalized.setReturnMarkdown(normalized.getReturnMarkdown() == null ? Boolean.TRUE : normalized.getReturnMarkdown()); + normalized.setReturnMiddleJson(normalized.getReturnMiddleJson() == null ? Boolean.FALSE : normalized.getReturnMiddleJson()); + normalized.setReturnContentList(normalized.getReturnContentList() == null ? Boolean.FALSE : normalized.getReturnContentList()); + normalized.setReturnModelOutput(normalized.getReturnModelOutput() == null ? Boolean.FALSE : normalized.getReturnModelOutput()); + normalized.setReturnImages(normalized.getReturnImages() == null ? Boolean.TRUE : normalized.getReturnImages()); + normalized.setIncludeHiddenSheets(normalized.getIncludeHiddenSheets() == null ? Boolean.FALSE : normalized.getIncludeHiddenSheets()); + normalized.setOcrEmbeddedImages(normalized.getOcrEmbeddedImages() == null ? Boolean.TRUE : normalized.getOcrEmbeddedImages()); + normalized.setIncludeImageAppendix(normalized.getIncludeImageAppendix() == null ? 
Boolean.TRUE : normalized.getIncludeImageAppendix()); + return normalized; + } + + @Override + protected ParseResponse doParse(XlsxParseRequest request, DocumentAsyncTaskUpdater updater) { + ParseResponse response = new ParseResponse(); + List results = new ArrayList(); + String backend = null; + int processedFiles = 0; + int totalFiles = request.getFiles().size(); + + for (ParseFile file : request.getFiles()) { + updateProgress(updater, "extracting", processedFiles, totalFiles, "正在读取工作簿结构"); + ParseResult result = parseSingleWorkbook(file, request, updater); + processedFiles++; + if (backend == null) { + backend = (String) result.getMetadata().get("ocrBackend"); + } + result.getMetadata().remove("ocrBackend"); + results.add(result); + } + + updateProgress(updater, "assembling", processedFiles, totalFiles, "正在汇总 XLSX 解析结果"); + response.setBackend(StringUtil.hasText(backend) ? backend : request.getBackend()); + response.setResults(results); + return response; + } + + private ParseResult parseSingleWorkbook(ParseFile file, XlsxParseRequest request, DocumentAsyncTaskUpdater updater) { + ParseResult aggregate = new ParseResult(); + aggregate.setFileName(file.getFileName()); + XlsxParseArtifact artifact = new XlsxParseArtifact(); + artifact.setWorkbookName(file.getFileName()); + StringBuilder markdownBuilder = new StringBuilder(); + String backend = null; + + try (XSSFWorkbook workbook = new XSSFWorkbook(new ByteArrayInputStream(file.getContent()))) { + FormulaEvaluator evaluator = workbook.getCreationHelper().createFormulaEvaluator(); + DataFormatter formatter = new DataFormatter(); + List sheetIndexes = resolveSheetIndexes(workbook, request); + int processedSheets = 0; + + for (Integer sheetIndex : sheetIndexes) { + XSSFSheet sheet = workbook.getSheetAt(sheetIndex); + updateProgress(updater, "extracting", processedSheets, sheetIndexes.size(), "正在读取 Sheet " + sheet.getSheetName()); + SheetExtraction sheetExtraction = extractSheet(sheet, sheetIndex, formatter, 
evaluator, request, updater); + artifact.getSheets().add(sheetExtraction.sheetArtifact); + artifact.getCellImages().addAll(sheetExtraction.imageArtifacts); + artifact.getSheetImages().add(sheetExtraction.sheetImagesArtifact); + artifact.getMergedRanges().addAll(sheetExtraction.mergedRanges); + aggregate.getImages().addAll(sheetExtraction.documentImages); + if (markdownBuilder.length() > 0) { + markdownBuilder.append("\n\n"); + } + markdownBuilder.append(sheetExtraction.markdown); + if (backend == null) { + backend = sheetExtraction.ocrBackend; + } + processedSheets++; + } + } catch (Exception exception) { + throw new IllegalStateException("Failed to parse XLSX file: " + file.getFileName(), exception); + } + + aggregate.setMarkdown(markdownBuilder.toString().trim()); + aggregate.setPlainText(aggregate.getMarkdown()); + aggregate.getArtifacts().getExtraJsonArtifacts().put("xlsx", artifact); + aggregate.getMetadata().put("ocrBackend", backend); + return aggregate; + } + + private SheetExtraction extractSheet(XSSFSheet sheet, + int sheetIndex, + DataFormatter formatter, + FormulaEvaluator evaluator, + XlsxParseRequest request, + DocumentAsyncTaskUpdater updater) { + SheetExtraction extraction = new SheetExtraction(); + extraction.sheetArtifact = new XlsxSheetArtifact(); + extraction.sheetArtifact.setSheetName(sheet.getSheetName()); + extraction.sheetArtifact.setSheetIndex(sheetIndex); + extraction.sheetArtifact.setHidden(Boolean.valueOf(sheet.getWorkbook().isSheetHidden(sheetIndex) + || sheet.getWorkbook().isSheetVeryHidden(sheetIndex))); + extraction.sheetImagesArtifact = new XlsxSheetImagesArtifact(); + extraction.sheetImagesArtifact.setSheetName(sheet.getSheetName()); + extraction.sheetImagesArtifact.setSheetIndex(sheetIndex); + + Map> imagesByCell = new LinkedHashMap>(); + List sheetImages = extractImages(sheet, sheetIndex, request, updater); + List imageArtifacts = new ArrayList(); + for (SheetImageExtraction sheetImage : sheetImages) { + XlsxCellImageArtifact 
imageArtifact = sheetImage.imageArtifact; + imageArtifacts.add(imageArtifact); + extraction.imageArtifacts.add(imageArtifact); + extraction.sheetImagesArtifact.getReferenceKeys().add(imageArtifact.getReferenceKey()); + extraction.sheetImagesArtifact.getSourcePaths().add(imageArtifact.getSourcePath()); + String anchorCell = imageArtifact.getAnchorCell(); + List cellImages = imagesByCell.get(anchorCell); + if (cellImages == null) { + cellImages = new ArrayList(); + imagesByCell.put(anchorCell, cellImages); + } + cellImages.add(imageArtifact); + extraction.documentImages.add(sheetImage.documentImage); + } + + int maxRow = resolveMaxRow(sheet, request.getMaxRowsPerSheet()); + int maxCol = resolveMaxCol(sheet, maxRow, imagesByCell); + extraction.sheetArtifact.setRowCount(maxRow + 1); + extraction.sheetArtifact.setColumnCount(maxCol); + appendSheetHeader(extraction.markdown, sheet.getSheetName()); + + if (maxRow < 0 || maxCol <= 0) { + extraction.markdown.append("_empty sheet_"); + if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) { + appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts); + } + return extraction; + } + + List> markdownRows = new ArrayList>(); + for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) { + org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex); + XlsxRowArtifact rowArtifact = new XlsxRowArtifact(); + rowArtifact.setRowIndex(rowIndex); + List rowValues = new ArrayList(); + for (int colIndex = 0; colIndex < maxCol; colIndex++) { + String cellRef = new CellReference(rowIndex, colIndex).formatAsString(); + String cellText = readCellText(row, colIndex, formatter, evaluator); + List cellImages = imagesByCell.get(cellRef); + String displayValue = mergeDisplayValue(cellText, cellImages); + rowValues.add(escapeMarkdown(displayValue)); + + XlsxCellArtifact cellArtifact = new XlsxCellArtifact(); + cellArtifact.setRowIndex(rowIndex); + cellArtifact.setColumnIndex(colIndex); + 
cellArtifact.setCellRef(cellRef); + cellArtifact.setText(cellText); + if (cellImages != null) { + List imageKeys = new ArrayList(); + for (XlsxCellImageArtifact cellImage : cellImages) { + imageKeys.add(cellImage.getReferenceKey()); + } + cellArtifact.setImageKeys(imageKeys); + } + rowArtifact.getCells().add(cellArtifact); + } + extraction.sheetArtifact.getRows().add(rowArtifact); + markdownRows.add(rowValues); + } + + appendMarkdownTable(extraction.markdown, markdownRows); + extraction.mergedRanges.addAll(extractMergedRanges(sheet)); + if (Boolean.TRUE.equals(request.getIncludeImageAppendix()) && !imageArtifacts.isEmpty()) { + appendImageAppendix(extraction.markdown, sheet.getSheetName(), imageArtifacts); + } + return extraction; + } + + private List extractImages(XSSFSheet sheet, + int sheetIndex, + XlsxParseRequest request, + DocumentAsyncTaskUpdater updater) { + List images = new ArrayList(); + XSSFDrawing drawing = sheet.getDrawingPatriarch(); + if (drawing == null) { + return images; + } + String sheetKey = buildSheetKey(sheet.getSheetName(), sheetIndex); + int imageIndex = 0; + for (XSSFShape shape : drawing.getShapes()) { + if (!(shape instanceof XSSFPicture)) { + continue; + } + imageIndex++; + XSSFPicture picture = (XSSFPicture) shape; + XSSFClientAnchor anchor = picture.getPreferredSize(); + if (anchor == null) { + continue; + } + XSSFPictureData pictureData = picture.getPictureData(); + String extension = pictureData == null || !StringUtil.hasText(pictureData.suggestFileExtension()) + ? 
"png" + : pictureData.suggestFileExtension(); + String imageName = buildImageName(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex); + String sourcePath = buildImageSourcePath(sheetKey, anchor.getRow1(), anchor.getCol1(), imageIndex, extension); + + XlsxCellImageArtifact imageArtifact = new XlsxCellImageArtifact(); + imageArtifact.setSheetName(sheet.getSheetName()); + imageArtifact.setAnchorCell(new CellReference(anchor.getRow1(), anchor.getCol1()).formatAsString()); + imageArtifact.setFromRow(anchor.getRow1()); + imageArtifact.setFromCol((int) anchor.getCol1()); + imageArtifact.setToRow(anchor.getRow2()); + imageArtifact.setToCol((int) anchor.getCol2()); + imageArtifact.setName(imageName); + imageArtifact.setReferenceKey(imageName); + imageArtifact.setSourcePath(sourcePath); + if (Boolean.TRUE.equals(request.getOcrEmbeddedImages()) && pictureData != null) { + updateProgress(updater, "ocr", imageIndex - 1, drawing.getShapes().size(), "正在识别 Sheet " + sheet.getSheetName() + " 中的图片"); + imageArtifact.setOcrText(parseImageOcr(pictureData.getData(), extension, request, imageName)); + } + DocumentImage documentImage = new DocumentImage(); + documentImage.setName(imageName); + documentImage.setSourcePath(sourcePath); + documentImage.setMimeType(detectImageMimeType(sourcePath)); + documentImage.setContent(pictureData == null ? null : pictureData.getData()); + + SheetImageExtraction sheetImage = new SheetImageExtraction(); + sheetImage.imageArtifact = imageArtifact; + sheetImage.documentImage = documentImage; + images.add(sheetImage); + } + return images; + } + + private String parseImageOcr(byte[] imageBytes, String extension, XlsxParseRequest request, String imageName) { + ParseRequest imageRequest = new ParseRequest(); + imageRequest.addFile(ParseFile.of(imageName + "." 
+ extension, imageBytes, "image/" + extension)); + imageRequest.setBackend(request.getBackend()); + imageRequest.setLanguages(request.getLanguages()); + imageRequest.setReturnMarkdown(true); + imageRequest.setReturnMiddleJson(false); + imageRequest.setReturnContentList(false); + imageRequest.setReturnModelOutput(false); + imageRequest.setReturnImages(false); + ParseResponse response = mapper.toParseResponse(client.parse(imageRequest)); + if (response.getResults().isEmpty()) { + return null; + } + ParseResult result = response.getResults().get(0); + return StringUtil.hasText(result.getMarkdown()) ? result.getMarkdown() : result.getPlainText(); + } + + private List resolveSheetIndexes(XSSFWorkbook workbook, XlsxParseRequest request) { + List indexes = new ArrayList(); + for (int index = 0; index < workbook.getNumberOfSheets(); index++) { + String sheetName = workbook.getSheetName(index); + if (!Boolean.TRUE.equals(request.getIncludeHiddenSheets()) + && (workbook.isSheetHidden(index) || workbook.isSheetVeryHidden(index))) { + continue; + } + if (request.getSheetNames() != null && !request.getSheetNames().isEmpty() + && !request.getSheetNames().contains(sheetName)) { + continue; + } + indexes.add(index); + } + return indexes; + } + + private int resolveMaxRow(XSSFSheet sheet, Integer maxRowsPerSheet) { + int lastRow = sheet.getLastRowNum(); + if (lastRow < 0) { + return -1; + } + if (maxRowsPerSheet == null || maxRowsPerSheet <= 0) { + return lastRow; + } + return Math.min(lastRow, maxRowsPerSheet - 1); + } + + private int resolveMaxCol(XSSFSheet sheet, int maxRow, Map> imagesByCell) { + int maxCol = 0; + for (int rowIndex = 0; rowIndex <= maxRow; rowIndex++) { + org.apache.poi.ss.usermodel.Row row = sheet.getRow(rowIndex); + if (row != null && row.getLastCellNum() > maxCol) { + maxCol = row.getLastCellNum(); + } + } + for (String cellRef : imagesByCell.keySet()) { + CellReference reference = new CellReference(cellRef); + if (reference.getCol() + 1 > maxCol) { + maxCol 
= reference.getCol() + 1; + } + } + return maxCol; + } + + private String readCellText(org.apache.poi.ss.usermodel.Row row, int colIndex, DataFormatter formatter, FormulaEvaluator evaluator) { + if (row == null) { + return ""; + } + org.apache.poi.ss.usermodel.Cell cell = row.getCell(colIndex); + if (cell == null) { + return ""; + } + return formatter.formatCellValue(cell, evaluator); + } + + private String mergeDisplayValue(String cellText, List cellImages) { + StringBuilder builder = new StringBuilder(); + if (StringUtil.hasText(cellText)) { + builder.append(cellText.trim()); + } + if (cellImages != null && !cellImages.isEmpty()) { + for (XlsxCellImageArtifact cellImage : cellImages) { + if (builder.length() > 0) { + builder.append('\n'); + } + builder.append("[IMG:").append(cellImage.getReferenceKey()).append(']'); + } + } + return builder.toString(); + } + + private void appendSheetHeader(StringBuilder markdownBuilder, String sheetName) { + markdownBuilder.append("# ").append(sheetName).append("\n\n"); + } + + private void appendMarkdownTable(StringBuilder markdownBuilder, List> rows) { + if (rows.isEmpty()) { + markdownBuilder.append("_empty sheet_"); + return; + } + List header = rows.get(0); + markdownBuilder.append("| ").append(joinCells(header)).append(" |\n"); + markdownBuilder.append("|"); + for (int index = 0; index < header.size(); index++) { + markdownBuilder.append(" --- |"); + } + markdownBuilder.append("\n"); + for (int rowIndex = 1; rowIndex < rows.size(); rowIndex++) { + markdownBuilder.append("| ").append(joinCells(rows.get(rowIndex))).append(" |\n"); + } + } + + private void appendImageAppendix(StringBuilder markdownBuilder, + String sheetName, + List imageArtifacts) { + markdownBuilder.append("\n## ").append(sheetName).append(" 图片说明\n\n"); + for (XlsxCellImageArtifact imageArtifact : imageArtifacts) { + markdownBuilder.append("![") + .append(imageArtifact.getReferenceKey()) + .append("](") + .append(imageArtifact.getSourcePath()) + 
.append(")\n\n"); + markdownBuilder.append("- 占位符:[IMG:") + .append(imageArtifact.getReferenceKey()) + .append("]\n"); + markdownBuilder.append("- 锚点:") + .append(imageArtifact.getAnchorCell()) + .append("\n"); + markdownBuilder.append("- OCR:") + .append(StringUtil.hasText(imageArtifact.getOcrText()) ? imageArtifact.getOcrText() : "") + .append("\n\n"); + } + } + + private List extractMergedRanges(XSSFSheet sheet) { + List mergedRanges = new ArrayList(); + for (int index = 0; index < sheet.getNumMergedRegions(); index++) { + mergedRanges.add(sheet.getMergedRegion(index).formatAsString()); + } + return mergedRanges; + } + + private String joinCells(List cells) { + StringBuilder builder = new StringBuilder(); + for (int index = 0; index < cells.size(); index++) { + if (index > 0) { + builder.append(" | "); + } + builder.append(cells.get(index)); + } + return builder.toString(); + } + + private String escapeMarkdown(String text) { + if (!StringUtil.hasText(text)) { + return ""; + } + return text.replace("|", "\\|").replace("\r", " ").replace("\n", "
"); + } + + private String buildImageName(String sheetKey, int rowIndex, int colIndex, int imageIndex) { + return sheetKey + "-r" + (rowIndex + 1) + "c" + (colIndex + 1) + "-" + formatIndex(imageIndex); + } + + private String buildImageSourcePath(String sheetKey, int rowIndex, int colIndex, int imageIndex, String extension) { + return "images/" + sheetKey + "/r" + (rowIndex + 1) + "c" + (colIndex + 1) + "-" + formatIndex(imageIndex) + "." + extension; + } + + private String buildSheetKey(String sheetName, int sheetIndex) { + if (!StringUtil.hasText(sheetName)) { + return "sheet-" + formatIndex(sheetIndex + 1); + } + String lowerCaseName = sheetName.toLowerCase(Locale.ROOT); + StringBuilder builder = new StringBuilder(); + for (int index = 0; index < lowerCaseName.length(); index++) { + char character = lowerCaseName.charAt(index); + if ((character >= 'a' && character <= 'z') || (character >= '0' && character <= '9')) { + builder.append(character); + continue; + } + if (builder.length() > 0 && builder.charAt(builder.length() - 1) != '-') { + builder.append('-'); + } + builder.append('u').append(String.format(Locale.ROOT, "%04x", (int) character)).append('-'); + } + String normalized = builder.toString(); + while (normalized.startsWith("-")) { + normalized = normalized.substring(1); + } + while (normalized.endsWith("-")) { + normalized = normalized.substring(0, normalized.length() - 1); + } + return StringUtil.hasText(normalized) ? normalized : "sheet-" + formatIndex(sheetIndex + 1); + } + + private String formatIndex(int index) { + int displayIndex = index <= 0 ? 
1 : index; + if (displayIndex < 10) { + return "00" + displayIndex; + } + if (displayIndex < 100) { + return "0" + displayIndex; + } + return String.valueOf(displayIndex); + } + + private String detectImageMimeType(String path) { + if (!StringUtil.hasText(path)) { + return "application/octet-stream"; + } + String mimeType = URLConnection.guessContentTypeFromName(path); + if (StringUtil.hasText(mimeType)) { + return mimeType; + } + String lowerCasePath = path.toLowerCase(Locale.ROOT); + if (lowerCasePath.endsWith(".jpg") || lowerCasePath.endsWith(".jpeg")) { + return "image/jpeg"; + } + if (lowerCasePath.endsWith(".png")) { + return "image/png"; + } + if (lowerCasePath.endsWith(".gif")) { + return "image/gif"; + } + if (lowerCasePath.endsWith(".bmp")) { + return "image/bmp"; + } + if (lowerCasePath.endsWith(".webp")) { + return "image/webp"; + } + return "application/octet-stream"; + } + + private void updateProgress(DocumentAsyncTaskUpdater updater, + String stage, + int processedItems, + int totalItems, + String message) { + if (updater == null) { + return; + } + int safeTotal = totalItems <= 0 ? 
1 : totalItems; + int percent = (int) Math.min(99, Math.round(processedItems * 100.0d / safeTotal)); + updater.update(stage, percent, processedItems, totalItems, message); + } + + private static DocumentAsyncTaskManager defaultTaskManager() { + DocumentAsyncTaskRepository repository = new InMemoryDocumentAsyncTaskRepository(); + ExecutorService executorService = Executors.newFixedThreadPool(2); + return new DocumentAsyncTaskManager(repository, executorService); + } + + private static class SheetExtraction { + + private final StringBuilder markdown = new StringBuilder(); + private final List imageArtifacts = new ArrayList(); + private final List documentImages = new ArrayList(); + private final List mergedRanges = new ArrayList(); + private XlsxSheetArtifact sheetArtifact; + private XlsxSheetImagesArtifact sheetImagesArtifact; + private String ocrBackend; + } + + private static class SheetImageExtraction { + + private XlsxCellImageArtifact imageArtifact; + private DocumentImage documentImage; + } +} diff --git a/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxCellArtifact.java b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxCellArtifact.java new file mode 100644 index 0000000..54508dc --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxCellArtifact.java @@ -0,0 +1,59 @@ +package com.easyagents.document.xlsx.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * 单元格工件。 + * + * @author Codex + * @since 2026-04-16 + */ +public class XlsxCellArtifact { + + private Integer rowIndex; + private Integer columnIndex; + private String cellRef; + private String text; + private List imageKeys = new ArrayList(); + + public Integer getRowIndex() { + return rowIndex; + } + + public void setRowIndex(Integer rowIndex) { + this.rowIndex = rowIndex; + } + + public Integer getColumnIndex() { + 
return columnIndex; + } + + public void setColumnIndex(Integer columnIndex) { + this.columnIndex = columnIndex; + } + + public String getCellRef() { + return cellRef; + } + + public void setCellRef(String cellRef) { + this.cellRef = cellRef; + } + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + public List getImageKeys() { + return imageKeys; + } + + public void setImageKeys(List imageKeys) { + this.imageKeys = imageKeys == null ? new ArrayList() : imageKeys; + } +} diff --git a/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxCellImageArtifact.java b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxCellImageArtifact.java new file mode 100644 index 0000000..67885aa --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxCellImageArtifact.java @@ -0,0 +1,101 @@ +package com.easyagents.document.xlsx.model; + +/** + * 单元格图片工件。 + * + * @author Codex + * @since 2026-04-16 + */ +public class XlsxCellImageArtifact { + + private String sheetName; + private String anchorCell; + private Integer fromRow; + private Integer fromCol; + private Integer toRow; + private Integer toCol; + private String name; + private String referenceKey; + private String sourcePath; + private String ocrText; + + public String getSheetName() { + return sheetName; + } + + public void setSheetName(String sheetName) { + this.sheetName = sheetName; + } + + public String getAnchorCell() { + return anchorCell; + } + + public void setAnchorCell(String anchorCell) { + this.anchorCell = anchorCell; + } + + public Integer getFromRow() { + return fromRow; + } + + public void setFromRow(Integer fromRow) { + this.fromRow = fromRow; + } + + public Integer getFromCol() { + return fromCol; + } + + public void setFromCol(Integer fromCol) { + this.fromCol = fromCol; + } + + public Integer 
getToRow() { + return toRow; + } + + public void setToRow(Integer toRow) { + this.toRow = toRow; + } + + public Integer getToCol() { + return toCol; + } + + public void setToCol(Integer toCol) { + this.toCol = toCol; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getReferenceKey() { + return referenceKey; + } + + public void setReferenceKey(String referenceKey) { + this.referenceKey = referenceKey; + } + + public String getSourcePath() { + return sourcePath; + } + + public void setSourcePath(String sourcePath) { + this.sourcePath = sourcePath; + } + + public String getOcrText() { + return ocrText; + } + + public void setOcrText(String ocrText) { + this.ocrText = ocrText; + } +} diff --git a/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxParseArtifact.java b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxParseArtifact.java new file mode 100644 index 0000000..35b3974 --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxParseArtifact.java @@ -0,0 +1,59 @@ +package com.easyagents.document.xlsx.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * Structured XLSX parse artifact: the workbook name plus lists of sheets, sheet images, merged ranges and cell images; list setters replace null with an empty list. + * + * @author Codex + * @since 2026-04-16 + */ +public class XlsxParseArtifact { + + private String workbookName; + private List sheets = new ArrayList(); + private List sheetImages = new ArrayList(); + private List mergedRanges = new ArrayList(); + private List cellImages = new ArrayList(); + + public String getWorkbookName() { + return workbookName; + } + + public void setWorkbookName(String workbookName) { + this.workbookName = workbookName; + } + + public List getSheets() { + return sheets; + } + + public void setSheets(List sheets) { + this.sheets = sheets == null ?
new ArrayList() : sheets; + } + + public List getSheetImages() { + return sheetImages; + } + + public void setSheetImages(List sheetImages) { + this.sheetImages = sheetImages == null ? new ArrayList() : sheetImages; + } + + public List getMergedRanges() { + return mergedRanges; + } + + public void setMergedRanges(List mergedRanges) { + this.mergedRanges = mergedRanges == null ? new ArrayList() : mergedRanges; + } + + public List getCellImages() { + return cellImages; + } + + public void setCellImages(List cellImages) { + this.cellImages = cellImages == null ? new ArrayList() : cellImages; + } +} diff --git a/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxRowArtifact.java b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxRowArtifact.java new file mode 100644 index 0000000..5e47bc8 --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxRowArtifact.java @@ -0,0 +1,32 @@ +package com.easyagents.document.xlsx.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * Row artifact: a row index together with the list of cells in that row; setCells replaces null with an empty list. + * + * @author Codex + * @since 2026-04-16 + */ +public class XlsxRowArtifact { + + private Integer rowIndex; + private List cells = new ArrayList(); + + public Integer getRowIndex() { + return rowIndex; + } + + public void setRowIndex(Integer rowIndex) { + this.rowIndex = rowIndex; + } + + public List getCells() { + return cells; + } + + public void setCells(List cells) { + this.cells = cells == null ?
new ArrayList() : cells; + } +} diff --git a/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxSheetArtifact.java b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxSheetArtifact.java new file mode 100644 index 0000000..355213a --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxSheetArtifact.java @@ -0,0 +1,68 @@ +package com.easyagents.document.xlsx.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * Sheet artifact: sheet name, sheet index, hidden flag, row and column counts, and the list of rows; setRows replaces null with an empty list. + * + * @author Codex + * @since 2026-04-16 + */ +public class XlsxSheetArtifact { + + private String sheetName; + private Integer sheetIndex; + private Boolean hidden; + private Integer rowCount; + private Integer columnCount; + private List rows = new ArrayList(); + + public String getSheetName() { + return sheetName; + } + + public void setSheetName(String sheetName) { + this.sheetName = sheetName; + } + + public Integer getSheetIndex() { + return sheetIndex; + } + + public void setSheetIndex(Integer sheetIndex) { + this.sheetIndex = sheetIndex; + } + + public Boolean getHidden() { + return hidden; + } + + public void setHidden(Boolean hidden) { + this.hidden = hidden; + } + + public Integer getRowCount() { + return rowCount; + } + + public void setRowCount(Integer rowCount) { + this.rowCount = rowCount; + } + + public Integer getColumnCount() { + return columnCount; + } + + public void setColumnCount(Integer columnCount) { + this.columnCount = columnCount; + } + + public List getRows() { + return rows; + } + + public void setRows(List rows) { + this.rows = rows == null ?
new ArrayList() : rows; + } +} diff --git a/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxSheetImagesArtifact.java b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxSheetImagesArtifact.java new file mode 100644 index 0000000..a33f0f1 --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/main/java/com/easyagents/document/xlsx/model/XlsxSheetImagesArtifact.java @@ -0,0 +1,50 @@ +package com.easyagents.document.xlsx.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * Per-sheet image index artifact: sheet name and index plus the reference keys and source paths recorded for that sheet; list setters replace null with an empty list. + * + * @author Codex + * @since 2026-04-16 + */ +public class XlsxSheetImagesArtifact { + + private String sheetName; + private Integer sheetIndex; + private List referenceKeys = new ArrayList(); + private List sourcePaths = new ArrayList(); + + public String getSheetName() { + return sheetName; + } + + public void setSheetName(String sheetName) { + this.sheetName = sheetName; + } + + public Integer getSheetIndex() { + return sheetIndex; + } + + public void setSheetIndex(Integer sheetIndex) { + this.sheetIndex = sheetIndex; + } + + public List getReferenceKeys() { + return referenceKeys; + } + + public void setReferenceKeys(List referenceKeys) { + this.referenceKeys = referenceKeys == null ? new ArrayList() : referenceKeys; + } + + public List getSourcePaths() { + return sourcePaths; + } + + public void setSourcePaths(List sourcePaths) { + this.sourcePaths = sourcePaths == null ?
new ArrayList() : sourcePaths; + } +} diff --git a/easy-agents-document/easy-agents-document-xlsx/src/test/java/com/easyagents/document/xlsx/mineru/MineruXlsxDocumentParseServiceTest.java b/easy-agents-document/easy-agents-document-xlsx/src/test/java/com/easyagents/document/xlsx/mineru/MineruXlsxDocumentParseServiceTest.java new file mode 100644 index 0000000..5bc4c85 --- /dev/null +++ b/easy-agents-document/easy-agents-document-xlsx/src/test/java/com/easyagents/document/xlsx/mineru/MineruXlsxDocumentParseServiceTest.java @@ -0,0 +1,333 @@ +package com.easyagents.document.xlsx.mineru; + +import com.alibaba.fastjson2.JSONObject; +import com.easyagents.document.core.async.DocumentAsyncTaskManager; +import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository; +import com.easyagents.document.core.mineru.MineruClient; +import com.easyagents.document.core.mineru.MineruMapper; +import com.easyagents.document.core.mineru.MineruProperties; +import com.easyagents.document.core.mineru.MineruResultPayload; +import com.easyagents.document.core.entity.ParseArtifacts; +import com.easyagents.document.core.entity.ParseFile; +import com.easyagents.document.core.entity.ParseResponse; +import com.easyagents.document.core.entity.ParseResult; +import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; +import com.easyagents.document.core.entity.XlsxParseRequest; +import com.easyagents.document.core.exception.DocumentParseException; +import com.easyagents.document.xlsx.model.XlsxParseArtifact; +import org.apache.poi.ss.usermodel.ClientAnchor; +import org.apache.poi.xssf.usermodel.XSSFDrawing; +import org.apache.poi.xssf.usermodel.XSSFSheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.Assert; +import org.junit.Test; + +import javax.imageio.ImageIO; +import java.awt.image.BufferedImage; +import java.io.ByteArrayOutputStream; +import java.util.ArrayDeque; +import java.util.Queue; +import 
java.util.concurrent.Executor; + +/** + * Tests for the MinerU XLSX parse service (synchronous parsing, image artifacts and the async task lifecycle). + * + * @author Codex + * @since 2026-04-16 + */ +public class MineruXlsxDocumentParseServiceTest { + + @Test + public void shouldBuildMarkdownAndImageArtifacts() throws Exception { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService( + defaultProperties(), + client, + mapper, + new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor()) + ); + + XlsxParseRequest request = new XlsxParseRequest(); + request.addFile(ParseFile.of("demo.xlsx", buildWorkbookBytes())); + + ParseResponse response = service.parse(request); + + Assert.assertEquals(1, response.getResults().size()); + ParseResult result = response.getResults().get(0); + Assert.assertTrue(result.getMarkdown().contains("# Sheet1")); + Assert.assertTrue(result.getMarkdown().contains("[IMG:sheet1-r2c2-001]")); + Assert.assertTrue(result.getMarkdown().contains("images/sheet1/r2c2-001.png")); + Assert.assertTrue(result.getMarkdown().contains("图片文字描述")); + Assert.assertEquals(1, result.getImages().size()); + Assert.assertNotNull(result.getImages().get(0).getContent()); + + XlsxParseArtifact artifact = extractXlsxArtifact(result); + Assert.assertEquals("demo.xlsx", artifact.getWorkbookName()); + Assert.assertEquals(1, artifact.getSheets().size()); + Assert.assertEquals(1, artifact.getSheetImages().size()); + Assert.assertEquals(1, artifact.getCellImages().size()); + Assert.assertEquals("sheet1-r2c2-001", artifact.getCellImages().get(0).getReferenceKey()); + Assert.assertEquals("images/sheet1/r2c2-001.png", artifact.getCellImages().get(0).getSourcePath()); + Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0)); + Assert.assertEquals("images/sheet1/r2c2-001.png", artifact.getSheetImages().get(0).getSourcePaths().get(0)); + } + +
@Test + public void shouldKeepImageKeysUniqueForNonAsciiSheetNames() throws Exception { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService( + defaultProperties(), + client, + mapper, + new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor()) + ); + + XlsxParseRequest request = new XlsxParseRequest(); + request.addFile(ParseFile.of("unicode-sheets.xlsx", buildWorkbookBytesWithUnicodeSheetNames())); + + ParseResponse response = service.parse(request); + ParseResult result = response.getResults().get(0); + + Assert.assertEquals(2, result.getImages().size()); + Assert.assertNotEquals(result.getImages().get(0).getName(), result.getImages().get(1).getName()); + Assert.assertNotEquals(result.getImages().get(0).getSourcePath(), result.getImages().get(1).getSourcePath()); + Assert.assertTrue(result.getMarkdown().contains("[IMG:" + result.getImages().get(0).getName() + "]")); + Assert.assertTrue(result.getMarkdown().contains("[IMG:" + result.getImages().get(1).getName() + "]")); + } + + @Test + public void shouldDetectJpegMimeType() throws Exception { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService( + defaultProperties(), + client, + mapper, + new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor()) + ); + + XlsxParseRequest request = new XlsxParseRequest(); + request.addFile(ParseFile.of("jpeg.xlsx", buildWorkbookBytesWithJpegImage())); + + ParseResponse response = service.parse(request); + ParseResult result = response.getResults().get(0); + + Assert.assertEquals(1, result.getImages().size()); + Assert.assertEquals("image/jpeg", result.getImages().get(0).getMimeType()); + } + + @Test + public 
void shouldAppendImageReferenceForImageOnlySheet() throws Exception { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService( + defaultProperties(), + client, + mapper, + new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), directExecutor()) + ); + + XlsxParseRequest request = new XlsxParseRequest(); + request.addFile(ParseFile.of("image-only.xlsx", buildWorkbookBytesWithImageOnlySheet())); + + ParseResponse response = service.parse(request); + ParseResult result = response.getResults().get(0); + XlsxParseArtifact artifact = extractXlsxArtifact(result); + + Assert.assertTrue(result.getMarkdown().contains("# Sheet1")); + Assert.assertTrue(result.getMarkdown().contains("_empty sheet_")); + Assert.assertTrue(result.getMarkdown().contains("## Sheet1 图片说明")); + Assert.assertTrue(result.getMarkdown().contains("![sheet1-r2c2-001](images/sheet1/r2c2-001.png)")); + Assert.assertTrue(result.getMarkdown().contains("- 占位符:[IMG:sheet1-r2c2-001]")); + Assert.assertEquals(1, result.getImages().size()); + Assert.assertEquals(1, artifact.getSheetImages().size()); + Assert.assertEquals("sheet1-r2c2-001", artifact.getSheetImages().get(0).getReferenceKeys().get(0)); + } + + @Test + public void shouldTrackAsyncLifecycleAndExposeResult() throws Exception { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + ManualExecutor executor = new ManualExecutor(); + MineruXlsxDocumentParseService service = new MineruXlsxDocumentParseService( + defaultProperties(), + client, + mapper, + new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executor) + ); + + XlsxParseRequest request = new XlsxParseRequest(); + request.addFile(ParseFile.of("async.xlsx", buildWorkbookBytes())); + + ParseTaskStatus submitted = 
service.submit(request); + Assert.assertEquals("queued", submitted.getStatus()); + Assert.assertEquals("queued", submitted.getCurrentStage()); + Assert.assertEquals(Integer.valueOf(0), submitted.getProgressPercent()); + + ParseTaskInfo queuedInfo = service.queryTaskInfo(submitted.getTaskId()); + Assert.assertNull(queuedInfo.getResult()); + try { + service.queryResult(submitted.getTaskId()); + Assert.fail("任务未完成时应抛出异常"); + } catch (DocumentParseException expected) { + Assert.assertTrue(expected.getMessage().contains(submitted.getTaskId())); + } + + executor.runNext(); + + ParseTaskStatus completed = service.queryTask(submitted.getTaskId()); + Assert.assertEquals("completed", completed.getStatus()); + Assert.assertEquals("completed", completed.getCurrentStage()); + Assert.assertEquals(Integer.valueOf(100), completed.getProgressPercent()); + Assert.assertEquals("任务执行完成", completed.getStatusMessage()); + + ParseTaskInfo completedInfo = service.queryTaskInfo(submitted.getTaskId()); + Assert.assertNotNull(completedInfo.getResult()); + Assert.assertTrue(completedInfo.getResult().getResults().get(0).getMarkdown().contains("[IMG:sheet1-r2c2-001]")); + Assert.assertEquals(completedInfo.getResult(), service.queryResult(submitted.getTaskId())); + } + + private byte[] buildWorkbookBytes() throws Exception { + XSSFWorkbook workbook = new XSSFWorkbook(); + XSSFSheet sheet = workbook.createSheet("Sheet1"); + sheet.createRow(0).createCell(0).setCellValue("商品"); + sheet.getRow(0).createCell(1).setCellValue("图片"); + sheet.createRow(1).createCell(0).setCellValue("手机"); + addPicture(workbook, sheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG); + return writeWorkbook(workbook); + } + + private byte[] buildWorkbookBytesWithUnicodeSheetNames() throws Exception { + XSSFWorkbook workbook = new XSSFWorkbook(); + + XSSFSheet detailSheet = workbook.createSheet("明细"); + detailSheet.createRow(0).createCell(0).setCellValue("图片"); + addPicture(workbook, detailSheet, 1, 1, 
createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG); + + XSSFSheet summarySheet = workbook.createSheet("汇总"); + summarySheet.createRow(0).createCell(0).setCellValue("图片"); + addPicture(workbook, summarySheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG); + + return writeWorkbook(workbook); + } + + private byte[] buildWorkbookBytesWithJpegImage() throws Exception { + XSSFWorkbook workbook = new XSSFWorkbook(); + XSSFSheet sheet = workbook.createSheet("Sheet1"); + sheet.createRow(0).createCell(0).setCellValue("图片"); + addPicture(workbook, sheet, 1, 1, createImageBytes("jpg"), XSSFWorkbook.PICTURE_TYPE_JPEG); + return writeWorkbook(workbook); + } + + private byte[] buildWorkbookBytesWithImageOnlySheet() throws Exception { + XSSFWorkbook workbook = new XSSFWorkbook(); + XSSFSheet sheet = workbook.createSheet("Sheet1"); + addPicture(workbook, sheet, 1, 1, createImageBytes("png"), XSSFWorkbook.PICTURE_TYPE_PNG); + return writeWorkbook(workbook); + } + + private void addPicture(XSSFWorkbook workbook, + XSSFSheet sheet, + int rowIndex, + int colIndex, + byte[] imageBytes, + int pictureType) { + int pictureIndex = workbook.addPicture(imageBytes, pictureType); + XSSFDrawing drawing = sheet.createDrawingPatriarch(); + ClientAnchor anchor = workbook.getCreationHelper().createClientAnchor(); + anchor.setRow1(rowIndex); + anchor.setCol1(colIndex); + anchor.setRow2(rowIndex + 1); + anchor.setCol2(colIndex + 1); + drawing.createPicture(anchor, pictureIndex); + } + + private byte[] writeWorkbook(XSSFWorkbook workbook) throws Exception { + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + workbook.write(outputStream); + workbook.close(); + return outputStream.toByteArray(); + } + + private byte[] createImageBytes(String format) throws Exception { + BufferedImage image = new BufferedImage(2, 2, BufferedImage.TYPE_INT_RGB); + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + ImageIO.write(image, format, outputStream); + return 
outputStream.toByteArray(); + } + + private MineruProperties defaultProperties() { + MineruProperties properties = new MineruProperties(); + properties.setBaseUrl("http://127.0.0.1:8000"); + return properties; + } + + private Executor directExecutor() { + return new Executor() { + @Override + public void execute(Runnable command) { + command.run(); + } + }; + } + + private XlsxParseArtifact extractXlsxArtifact(ParseResult result) { + ParseArtifacts artifacts = result.getArtifacts(); + Assert.assertNotNull(artifacts); + Object artifact = artifacts.getExtraJsonArtifacts().get("xlsx"); + Assert.assertTrue(artifact instanceof XlsxParseArtifact); + return (XlsxParseArtifact) artifact; + } + + /** + * Manually driven test executor: execute() only queues the command and runNext() runs the next queued one, so tests can observe async task state transitions. + */ + private static class ManualExecutor implements Executor { + + private final Queue tasks = new ArrayDeque(); + + @Override + public void execute(Runnable command) { + tasks.offer(command); + } + + private void runNext() { + Runnable task = tasks.poll(); + Assert.assertNotNull("应当存在待执行任务", task); + task.run(); + } + } + + private static class RecordingClient extends MineruClient { + + private RecordingClient(MineruProperties properties) { + super(properties, new MineruMapper(properties)); + } + + @Override + public MineruResultPayload parse(com.easyagents.document.core.entity.ParseRequest request) { + return new MineruMapper(testProperties()).toResultPayload(syncPayload()); + } + + private JSONObject syncPayload() { + JSONObject payload = new JSONObject(); + payload.put("backend", "vlm-http-client"); + payload.put("version", "3.0.9"); + JSONObject result = new JSONObject(); + result.put("md_content", "图片文字描述"); + JSONObject results = new JSONObject(); + results.put("image", result); + payload.put("results", results); + return payload; + } + + private static MineruProperties testProperties() { + MineruProperties properties = new MineruProperties(); + properties.setBaseUrl("http://127.0.0.1:8000"); + return properties; + } + } +} diff --git
a/easy-agents-document/pom.xml b/easy-agents-document/pom.xml index 6c6efc9..364d201 100644 --- a/easy-agents-document/pom.xml +++ b/easy-agents-document/pom.xml @@ -17,5 +17,7 @@ easy-agents-document-core easy-agents-document-pdf + easy-agents-document-pptx + easy-agents-document-xlsx diff --git a/easy-agents-spring-boot-starter/pom.xml b/easy-agents-spring-boot-starter/pom.xml index 9def988..7dbd098 100644 --- a/easy-agents-spring-boot-starter/pom.xml +++ b/easy-agents-spring-boot-starter/pom.xml @@ -61,6 +61,16 @@ easy-agents-document-pdf + + com.easyagents + easy-agents-document-pptx + + + + com.easyagents + easy-agents-document-xlsx + + com.easyagents easy-agents-rag-ingestion diff --git a/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/mineru/CommonMineruDocumentProperties.java b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/mineru/CommonMineruDocumentProperties.java new file mode 100644 index 0000000..efd79a5 --- /dev/null +++ b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/mineru/CommonMineruDocumentProperties.java @@ -0,0 +1,119 @@ +package com.easyagents.spring.boot.document.mineru; + +import org.springframework.boot.context.properties.ConfigurationProperties; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Common MinerU document configuration, bound to the easy-agents.document.mineru prefix (base URL, timeouts, polling and default parse options). + * + * @author Codex + * @since 2026-04-16 + */ +@ConfigurationProperties(prefix = "easy-agents.document.mineru") +public class CommonMineruDocumentProperties { + + private String baseUrl; + private Integer connectTimeoutMs = 3000; + private Integer readTimeoutMs = 600000; + private Integer writeTimeoutMs = 600000; + private Integer pollIntervalMs = 1000; + private Integer resultTimeoutMs = 1800000; + private String defaultBackend = "vlm-http-client"; + private String defaultParseMethod = "auto"; + private List defaultLangList = new ArrayList(Arrays.asList("ch")); + private Boolean
defaultFormulaEnable = true; + private Boolean defaultTableEnable = true; + + public String getBaseUrl() { + return baseUrl; + } + + public void setBaseUrl(String baseUrl) { + this.baseUrl = baseUrl; + } + + public Integer getConnectTimeoutMs() { + return connectTimeoutMs; + } + + public void setConnectTimeoutMs(Integer connectTimeoutMs) { + this.connectTimeoutMs = connectTimeoutMs; + } + + public Integer getReadTimeoutMs() { + return readTimeoutMs; + } + + public void setReadTimeoutMs(Integer readTimeoutMs) { + this.readTimeoutMs = readTimeoutMs; + } + + public Integer getWriteTimeoutMs() { + return writeTimeoutMs; + } + + public void setWriteTimeoutMs(Integer writeTimeoutMs) { + this.writeTimeoutMs = writeTimeoutMs; + } + + public Integer getPollIntervalMs() { + return pollIntervalMs; + } + + public void setPollIntervalMs(Integer pollIntervalMs) { + this.pollIntervalMs = pollIntervalMs; + } + + public Integer getResultTimeoutMs() { + return resultTimeoutMs; + } + + public void setResultTimeoutMs(Integer resultTimeoutMs) { + this.resultTimeoutMs = resultTimeoutMs; + } + + public String getDefaultBackend() { + return defaultBackend; + } + + public void setDefaultBackend(String defaultBackend) { + this.defaultBackend = defaultBackend; + } + + public String getDefaultParseMethod() { + return defaultParseMethod; + } + + public void setDefaultParseMethod(String defaultParseMethod) { + this.defaultParseMethod = defaultParseMethod; + } + + public List getDefaultLangList() { + return defaultLangList; + } + + public void setDefaultLangList(List defaultLangList) { + this.defaultLangList = defaultLangList == null + ? 
new ArrayList(Arrays.asList("ch")) + : defaultLangList; + } + + public Boolean getDefaultFormulaEnable() { + return defaultFormulaEnable; + } + + public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) { + this.defaultFormulaEnable = defaultFormulaEnable; + } + + public Boolean getDefaultTableEnable() { + return defaultTableEnable; + } + + public void setDefaultTableEnable(Boolean defaultTableEnable) { + this.defaultTableEnable = defaultTableEnable; + } +} diff --git a/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pdf/mineru/MineruPdfAutoConfiguration.java b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pdf/mineru/MineruPdfAutoConfiguration.java index 85e9051..58ed9ec 100644 --- a/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pdf/mineru/MineruPdfAutoConfiguration.java +++ b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pdf/mineru/MineruPdfAutoConfiguration.java @@ -1,9 +1,11 @@ package com.easyagents.spring.boot.document.pdf.mineru; import com.easyagents.document.core.DocumentParseService; +import com.easyagents.document.core.mineru.MineruProperties; import com.easyagents.document.pdf.PdfDocumentParseService; import com.easyagents.document.pdf.mineru.MineruPdfDocumentParseService; -import com.easyagents.document.pdf.mineru.MineruProperties; +import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties; +import com.easyagents.core.util.StringUtil; import org.springframework.boot.autoconfigure.condition.ConditionalOnClass; import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; @@ -20,7 +22,7 @@ import org.springframework.context.annotation.Configuration; @Configuration(proxyBeanMethods = false) @ConditionalOnClass(MineruPdfDocumentParseService.class) @ConditionalOnProperty(prefix = 
"easy-agents.document.pdf", name = "provider", havingValue = "mineru") -@EnableConfigurationProperties(MineruDocumentProperties.class) +@EnableConfigurationProperties({MineruDocumentProperties.class, CommonMineruDocumentProperties.class}) public class MineruPdfAutoConfiguration { /** @@ -31,8 +33,9 @@ public class MineruPdfAutoConfiguration { */ @Bean @ConditionalOnMissingBean(PdfDocumentParseService.class) - public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties) { - return new MineruPdfDocumentParseService(toMineruProperties(properties)); + public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties, + CommonMineruDocumentProperties commonProperties) { + return new MineruPdfDocumentParseService(toMineruProperties(properties, commonProperties)); } /** @@ -47,19 +50,21 @@ public class MineruPdfAutoConfiguration { return pdfDocumentParseService; } - private MineruProperties toMineruProperties(MineruDocumentProperties properties) { + private MineruProperties toMineruProperties(MineruDocumentProperties properties, + CommonMineruDocumentProperties commonProperties) { MineruProperties mineruProperties = new MineruProperties(); - mineruProperties.setBaseUrl(properties.getBaseUrl()); - mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs()); - mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs()); - mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs()); - mineruProperties.setPollIntervalMs(properties.getPollIntervalMs()); - mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs()); - mineruProperties.setDefaultBackend(properties.getDefaultBackend()); - mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod()); - mineruProperties.setDefaultLangList(properties.getDefaultLangList()); - mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable()); - mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable()); + 
boolean useCommon = commonProperties != null && StringUtil.hasText(commonProperties.getBaseUrl()); + mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : properties.getBaseUrl()); + mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : properties.getConnectTimeoutMs()); + mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : properties.getReadTimeoutMs()); + mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : properties.getWriteTimeoutMs()); + mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : properties.getPollIntervalMs()); + mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : properties.getResultTimeoutMs()); + mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : properties.getDefaultBackend()); + mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : properties.getDefaultParseMethod()); + mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : properties.getDefaultLangList()); + mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : properties.getDefaultFormulaEnable()); + mineruProperties.setDefaultTableEnable(useCommon ? 
commonProperties.getDefaultTableEnable() : properties.getDefaultTableEnable()); return mineruProperties; } } diff --git a/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pptx/MineruPptxAutoConfiguration.java b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pptx/MineruPptxAutoConfiguration.java new file mode 100644 index 0000000..3e1078a --- /dev/null +++ b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pptx/MineruPptxAutoConfiguration.java @@ -0,0 +1,61 @@ +package com.easyagents.spring.boot.document.pptx; + +import com.easyagents.document.core.async.DocumentAsyncTaskManager; +import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository; +import com.easyagents.document.core.mineru.MineruProperties; +import com.easyagents.document.pptx.PptxDocumentParseService; +import com.easyagents.document.pptx.mineru.MineruPptxDocumentParseService; +import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties; +import org.springframework.boot.autoconfigure.condition.ConditionalOnClass; +import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.boot.context.properties.EnableConfigurationProperties; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * MinerU PPTX 自动装配。 + * + * @author Codex + * @since 2026-04-16 + */ +@Configuration(proxyBeanMethods = false) +@ConditionalOnClass(MineruPptxDocumentParseService.class) +@ConditionalOnProperty(prefix = "easy-agents.document.pptx", name = "enabled", havingValue = "true") +@EnableConfigurationProperties({CommonMineruDocumentProperties.class, PptxDocumentProperties.class}) +public class 
MineruPptxAutoConfiguration { + + @Bean + @ConditionalOnMissingBean(name = "pptxDocumentAsyncTaskManager") + public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager(PptxDocumentProperties properties) { + int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads(); + ExecutorService executorService = Executors.newFixedThreadPool(threadCount); + return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService); + } + + @Bean + @ConditionalOnMissingBean(PptxDocumentParseService.class) + public PptxDocumentParseService pptxDocumentParseService(CommonMineruDocumentProperties commonProperties, + DocumentAsyncTaskManager pptxDocumentAsyncTaskManager) { + return new MineruPptxDocumentParseService(toMineruProperties(commonProperties), pptxDocumentAsyncTaskManager); + } + + private MineruProperties toMineruProperties(CommonMineruDocumentProperties properties) { + MineruProperties mineruProperties = new MineruProperties(); + mineruProperties.setBaseUrl(properties.getBaseUrl()); + mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs()); + mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs()); + mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs()); + mineruProperties.setPollIntervalMs(properties.getPollIntervalMs()); + mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs()); + mineruProperties.setDefaultBackend(properties.getDefaultBackend()); + mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod()); + mineruProperties.setDefaultLangList(properties.getDefaultLangList()); + mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable()); + mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable()); + return mineruProperties; + } +} diff --git a/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pptx/PptxDocumentProperties.java 
b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pptx/PptxDocumentProperties.java new file mode 100644 index 0000000..d170f82 --- /dev/null +++ b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/pptx/PptxDocumentProperties.java @@ -0,0 +1,32 @@ +package com.easyagents.spring.boot.document.pptx; + +import org.springframework.boot.context.properties.ConfigurationProperties; + +/** + * Configuration properties for PPTX document parsing, bound from the easy-agents.document.pptx prefix: enabled (default false) and asyncThreads (default 2). + * + * @author Codex + * @since 2026-04-16 + */ +@ConfigurationProperties(prefix = "easy-agents.document.pptx") +public class PptxDocumentProperties { + + private Boolean enabled = false; + private Integer asyncThreads = 2; + + public Boolean getEnabled() { + return enabled; + } + + public void setEnabled(Boolean enabled) { + this.enabled = enabled; + } + + public Integer getAsyncThreads() { + return asyncThreads; + } + + public void setAsyncThreads(Integer asyncThreads) { + this.asyncThreads = asyncThreads; + } +} diff --git a/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/xlsx/MineruXlsxAutoConfiguration.java b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/xlsx/MineruXlsxAutoConfiguration.java new file mode 100644 index 0000000..519b5ef --- /dev/null +++ b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/xlsx/MineruXlsxAutoConfiguration.java @@ -0,0 +1,61 @@ +package com.easyagents.spring.boot.document.xlsx; + +import com.easyagents.document.core.async.DocumentAsyncTaskManager; +import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository; +import com.easyagents.document.core.mineru.MineruProperties; +import com.easyagents.document.xlsx.XlsxDocumentParseService; +import com.easyagents.document.xlsx.mineru.MineruXlsxDocumentParseService; +import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties; +import 
org.springframework.boot.autoconfigure.condition.ConditionalOnClass; +import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.boot.context.properties.EnableConfigurationProperties; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Auto-configuration for MinerU-backed XLSX parsing; applies only when MineruXlsxDocumentParseService is on the classpath and easy-agents.document.xlsx.enabled=true. + * + * @author Codex + * @since 2026-04-16 + */ +@Configuration(proxyBeanMethods = false) +@ConditionalOnClass(MineruXlsxDocumentParseService.class) +@ConditionalOnProperty(prefix = "easy-agents.document.xlsx", name = "enabled", havingValue = "true") +@EnableConfigurationProperties({CommonMineruDocumentProperties.class, XlsxDocumentProperties.class}) +public class MineruXlsxAutoConfiguration { + + @Bean + @ConditionalOnMissingBean(name = "xlsxDocumentAsyncTaskManager") + public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager(XlsxDocumentProperties properties) { + int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 
2 : properties.getAsyncThreads(); /* a null or non-positive asyncThreads falls back to 2 */ + ExecutorService executorService = Executors.newFixedThreadPool(threadCount); /* NOTE(review): nothing in this class shuts this pool down - confirm DocumentAsyncTaskManager or the container releases it */ + return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService); + } + + @Bean + @ConditionalOnMissingBean(XlsxDocumentParseService.class) + public XlsxDocumentParseService xlsxDocumentParseService(CommonMineruDocumentProperties commonProperties, + DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager) { /* NOTE(review): the parameter name equals the task-manager bean name above; presumably that is what selects it when several DocumentAsyncTaskManager beans exist - confirm */ + return new MineruXlsxDocumentParseService(toMineruProperties(commonProperties), xlsxDocumentAsyncTaskManager); + } + + private MineruProperties toMineruProperties(CommonMineruDocumentProperties properties) { /* copies the shared MinerU settings listed below one-to-one into a new MineruProperties */ + MineruProperties mineruProperties = new MineruProperties(); + mineruProperties.setBaseUrl(properties.getBaseUrl()); + mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs()); + mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs()); + mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs()); + mineruProperties.setPollIntervalMs(properties.getPollIntervalMs()); + mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs()); + mineruProperties.setDefaultBackend(properties.getDefaultBackend()); + mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod()); + mineruProperties.setDefaultLangList(properties.getDefaultLangList()); + mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable()); + mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable()); + return mineruProperties; + } +} diff --git a/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/xlsx/XlsxDocumentProperties.java b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/xlsx/XlsxDocumentProperties.java new file mode 100644 index 0000000..7c6818b --- /dev/null +++ b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/document/xlsx/XlsxDocumentProperties.java @@ -0,0 +1,32 @@ +package 
com.easyagents.spring.boot.document.xlsx; + +import org.springframework.boot.context.properties.ConfigurationProperties; + +/** + * Configuration properties for XLSX document parsing, bound from the easy-agents.document.xlsx prefix: enabled (default false) and asyncThreads (default 2). + * + * @author Codex + * @since 2026-04-16 + */ +@ConfigurationProperties(prefix = "easy-agents.document.xlsx") +public class XlsxDocumentProperties { + + private Boolean enabled = false; + private Integer asyncThreads = 2; + + public Boolean getEnabled() { + return enabled; + } + + public void setEnabled(Boolean enabled) { + this.enabled = enabled; + } + + public Integer getAsyncThreads() { + return asyncThreads; + } + + public void setAsyncThreads(Integer asyncThreads) { + this.asyncThreads = asyncThreads; + } +} diff --git a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports index 7b154cb..c3573de 100644 --- a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports +++ b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports @@ -9,3 +9,5 @@ com.easyagents.spring.boot.store.elasticsearch.ElasticSearchAutoConfiguration com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration +com.easyagents.spring.boot.document.pptx.MineruPptxAutoConfiguration +com.easyagents.spring.boot.document.xlsx.MineruXlsxAutoConfiguration diff --git a/easy-agents-spring-boot-starter/src/test/java/com/easyagents/spring/boot/autoconfigure/StarterConditionalAutoConfigurationTest.java b/easy-agents-spring-boot-starter/src/test/java/com/easyagents/spring/boot/autoconfigure/StarterConditionalAutoConfigurationTest.java index 
16a8998..8eb5092 100644 --- a/easy-agents-spring-boot-starter/src/test/java/com/easyagents/spring/boot/autoconfigure/StarterConditionalAutoConfigurationTest.java +++ b/easy-agents-spring-boot-starter/src/test/java/com/easyagents/spring/boot/autoconfigure/StarterConditionalAutoConfigurationTest.java @@ -2,8 +2,12 @@ package com.easyagents.spring.boot.autoconfigure; import com.easyagents.document.core.DocumentParseService; import com.easyagents.document.pdf.PdfDocumentParseService; +import com.easyagents.document.pptx.PptxDocumentParseService; +import com.easyagents.document.xlsx.XlsxDocumentParseService; import com.easyagents.llm.ollama.OllamaChatModel; +import com.easyagents.spring.boot.document.pptx.MineruPptxAutoConfiguration; import com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration; +import com.easyagents.spring.boot.document.xlsx.MineruXlsxAutoConfiguration; import com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration; import com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration; import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration; @@ -18,7 +22,9 @@ public class StarterConditionalAutoConfigurationTest { RagIngestionAutoConfiguration.class, OllamaAutoConfiguration.class, OpenSearchAutoConfiguration.class, - MineruPdfAutoConfiguration.class + MineruPdfAutoConfiguration.class, + MineruPptxAutoConfiguration.class, + MineruXlsxAutoConfiguration.class ); @Test @@ -51,4 +57,19 @@ public class StarterConditionalAutoConfigurationTest { Assert.assertNotNull(context.getBean(DocumentParseService.class)); }); } + + @Test + public void shouldCreatePptxAndXlsxBeansWhenEnabled() { /* enables PPTX and XLSX through properties, then expects a parse-service bean of each type and no bean named documentParseService */ + contextRunner + .withPropertyValues( + "easy-agents.document.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api", + "easy-agents.document.pptx.enabled=true", + "easy-agents.document.xlsx.enabled=true" + ) + .run(context -> { + Assert.assertNotNull(context.getBean(PptxDocumentParseService.class)); + 
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class)); + Assert.assertFalse(context.containsBean("documentParseService")); + }); + } } diff --git a/pom.xml b/pom.xml index 931f1f9..4d69dfc 100644 --- a/pom.xml +++ b/pom.xml @@ -132,6 +132,18 @@ ${revision} + + com.easyagents + easy-agents-document-pptx + ${revision} + + + + com.easyagents + easy-agents-document-xlsx + ${revision} + + com.easyagents easy-agents-rag-core