From aa3e90b990d1c4e3769de8d2c01a56eaddf63965 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=AD=90=E9=BB=98?= <925456043@qq.com> Date: Tue, 14 Apr 2026 19:57:32 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=AE=8C=E6=88=90L09=E7=BB=9F=E4=B8=80?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E8=A7=A3=E6=9E=90=E6=A8=A1=E5=9D=97=E4=B8=8E?= =?UTF-8?q?MinerU=20PDF=20Provider=E6=8E=A5=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 easy-agents-document 聚合、document-core 与 document-pdf 模块 - 接入 MinerU PDF provider,支持同步解析、异步任务与 ZIP 结果映射 - 移除 easy-agents-rag-ocr 空壳并补齐 starter 自动装配 --- README.md | 2 +- easy-agents-bom/pom.xml | 15 +- .../easy-agents-document-core}/pom.xml | 10 +- .../document/core/DocumentParseService.java | 72 ++ .../exception/DocumentParseException.java | 18 + .../document/core/model/DocumentBlock.java | 88 ++ .../document/core/model/DocumentImage.java | 86 ++ .../document/core/model/DocumentPage.java | 50 ++ .../document/core/model/DocumentTable.java | 68 ++ .../document/core/model/ParseArtifacts.java | 63 ++ .../document/core/model/ParseFile.java | 77 ++ .../document/core/model/ParseRequest.java | 144 +++ .../document/core/model/ParseResponse.java | 41 + .../document/core/model/ParseResult.java | 106 +++ .../document/core/model/ParseTaskInfo.java | 58 ++ .../document/core/model/ParseTaskStatus.java | 113 +++ .../easy-agents-document-pdf/pom.xml | 44 + .../document/pdf/PdfDocumentParseService.java | 12 + .../document/pdf/PdfDocumentProvider.java | 17 + .../document/pdf/mineru/MineruMapper.java | 830 ++++++++++++++++++ .../document/pdf/mineru/MineruPdfClient.java | 211 +++++ .../mineru/MineruPdfDocumentParseService.java | 171 ++++ .../document/pdf/mineru/MineruProperties.java | 116 +++ .../pdf/mineru/MineruResultPayload.java | 43 + .../document/pdf/mineru/MineruTaskStatus.java | 131 +++ .../document/pdf/mineru/MineruMapperTest.java | 366 ++++++++ .../MineruPdfDocumentParseServiceTest.java | 271 ++++++ 
easy-agents-document/pom.xml | 21 + easy-agents-rag/TECH-PLAN.md | 19 +- easy-agents-rag/pom.xml | 1 - easy-agents-spring-boot-starter/pom.xml | 5 + ...ot.autoconfigure.AutoConfiguration.imports | 1 + ...arterConditionalAutoConfigurationTest.java | 25 +- pom.xml | 19 +- 34 files changed, 3280 insertions(+), 34 deletions(-) rename {easy-agents-rag/easy-agents-rag-ocr => easy-agents-document/easy-agents-document-core}/pom.xml (74%) create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/DocumentParseService.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/exception/DocumentParseException.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentBlock.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentImage.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentPage.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentTable.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseArtifacts.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseFile.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseRequest.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResponse.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResult.java create mode 100644 
easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskInfo.java create mode 100644 easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskStatus.java create mode 100644 easy-agents-document/easy-agents-document-pdf/pom.xml create mode 100644 easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentParseService.java create mode 100644 easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentProvider.java create mode 100644 easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java create mode 100644 easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfClient.java create mode 100644 easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java create mode 100644 easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruProperties.java create mode 100644 easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruResultPayload.java create mode 100644 easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruTaskStatus.java create mode 100644 easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java create mode 100644 easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseServiceTest.java create mode 100644 easy-agents-document/pom.xml diff --git a/README.md b/README.md index d4a2d8f..4605b14 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Easy-Agents 是一个轻量、可扩展的 Java AI 应用开发框架,覆盖 - `easy-agents-bom`:依赖版本管理(BOM)。 - `easy-agents-core`:核心抽象与基础能力。 +- 
`easy-agents-document`:统一文档解析能力域,当前提供 PDF 解析抽象与 MinerU provider。 - `easy-agents-chat`:对话模型接入实现集合。 - `easy-agents-embedding`:向量化模型实现集合。 - `easy-agents-rerank`:重排模型实现集合。 @@ -84,4 +85,3 @@ public static void main(String[] args) { ``` - diff --git a/easy-agents-bom/pom.xml b/easy-agents-bom/pom.xml index 99b7079..d02204d 100644 --- a/easy-agents-bom/pom.xml +++ b/easy-agents-bom/pom.xml @@ -56,6 +56,16 @@ + + com.easyagents + easy-agents-document-core + + + + com.easyagents + easy-agents-document-pdf + + com.easyagents easy-agents-rag-core @@ -66,11 +76,6 @@ easy-agents-rag-ingestion - - com.easyagents - easy-agents-rag-ocr - - com.easyagents easy-agents-rag-enhance diff --git a/easy-agents-rag/easy-agents-rag-ocr/pom.xml b/easy-agents-document/easy-agents-document-core/pom.xml similarity index 74% rename from easy-agents-rag/easy-agents-rag-ocr/pom.xml rename to easy-agents-document/easy-agents-document-core/pom.xml index c4b861b..d2dfd2c 100644 --- a/easy-agents-rag/easy-agents-rag-ocr/pom.xml +++ b/easy-agents-document/easy-agents-document-core/pom.xml @@ -6,12 +6,12 @@ com.easyagents - easy-agents-rag + easy-agents-document ${revision} - easy-agents-rag-ocr - easy-agents-rag-ocr + easy-agents-document-core + easy-agents-document-core 8 @@ -24,9 +24,5 @@ com.easyagents easy-agents-core - - com.easyagents - easy-agents-rag-core - diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/DocumentParseService.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/DocumentParseService.java new file mode 100644 index 0000000..e5a736a --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/DocumentParseService.java @@ -0,0 +1,72 @@ +package com.easyagents.document.core; + +import com.easyagents.document.core.model.ParseRequest; +import com.easyagents.document.core.model.ParseResponse; +import 
com.easyagents.document.core.model.ParseTaskInfo; +import com.easyagents.document.core.model.ParseTaskStatus; + +/** + * 统一文档解析服务抽象。 + * + * @author Codex + * @since 2026-04-14 + */ +public interface DocumentParseService { + + /** + * 同步解析文档并直接返回结果。 + * + * @param request 解析请求 + * @return 解析结果 + */ + ParseResponse parse(ParseRequest request); + + /** + * 异步提交文档解析任务。 + * + * @param request 解析请求 + * @return 任务状态 + */ + ParseTaskStatus submit(ParseRequest request); + + /** + * 查询异步解析任务状态。 + * + * @param taskId 任务 ID + * @return 任务状态 + */ + ParseTaskStatus queryTask(String taskId); + + /** + * 获取异步任务最终结果。 + * + *

<p>该方法面向“结果读取”语义,provider 可以在内部等待任务完成后再返回最终结果, + * 因此不适合用于高频轻量轮询;如果调用方希望统一查看“当前状态 + 已完成结果”, + * 应优先使用 {@link #queryTaskInfo(String)}。</p>

+ * + * @param taskId 任务 ID + * @return 解析结果 + */ + ParseResponse queryResult(String taskId); + + /** + * 聚合查询异步任务信息。 + * + *

<p>该方法会先查询任务状态;如果任务仍在处理中,则仅返回当前状态。 + * 如果任务已经完成,则会继续查询并附带最终解析结果。</p>

+ * + *

<p>注意:该方法在不同 provider 下可能触发结果下载或阻塞等待, + * 因此更适合用于“状态+结果”一体化查询,而不是高频轻量轮询。</p>

+ * + * @param taskId 任务 ID + * @return 聚合查询结果 + */ + default ParseTaskInfo queryTaskInfo(String taskId) { + ParseTaskStatus taskStatus = queryTask(taskId); + ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(taskStatus); + if (taskStatus != null && "completed".equalsIgnoreCase(taskStatus.getStatus())) { + taskInfo.setResult(queryResult(taskId)); + } + return taskInfo; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/exception/DocumentParseException.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/exception/DocumentParseException.java new file mode 100644 index 0000000..a0d9c65 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/exception/DocumentParseException.java @@ -0,0 +1,18 @@ +package com.easyagents.document.core.exception; + +/** + * 文档解析异常。 + * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentParseException extends RuntimeException { + + public DocumentParseException(String message) { + super(message); + } + + public DocumentParseException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentBlock.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentBlock.java new file mode 100644 index 0000000..f525d4f --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentBlock.java @@ -0,0 +1,88 @@ +package com.easyagents.document.core.model; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * 结构化内容块。 + * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentBlock { + + private String type; + private Integer pageIndex; + private String text; + private 
Integer level; + private String html; + private String imagePath; + private List boundingBox = new ArrayList(); + private Map metadata = new LinkedHashMap(); + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public Integer getPageIndex() { + return pageIndex; + } + + public void setPageIndex(Integer pageIndex) { + this.pageIndex = pageIndex; + } + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + public Integer getLevel() { + return level; + } + + public void setLevel(Integer level) { + this.level = level; + } + + public String getHtml() { + return html; + } + + public void setHtml(String html) { + this.html = html; + } + + public String getImagePath() { + return imagePath; + } + + public void setImagePath(String imagePath) { + this.imagePath = imagePath; + } + + public List getBoundingBox() { + return boundingBox; + } + + public void setBoundingBox(List boundingBox) { + this.boundingBox = boundingBox == null ? new ArrayList() : boundingBox; + } + + public Map getMetadata() { + return metadata; + } + + public void setMetadata(Map metadata) { + this.metadata = metadata == null ? 
new LinkedHashMap() : metadata; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentImage.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentImage.java new file mode 100644 index 0000000..1e36938 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentImage.java @@ -0,0 +1,86 @@ +package com.easyagents.document.core.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * 图片结果。 + * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentImage { + + private Integer pageIndex; + private String name; + private String mimeType; + private String sourcePath; + private String dataUrl; + private List boundingBox = new ArrayList(); + private List captions = new ArrayList(); + private List footnotes = new ArrayList(); + + public Integer getPageIndex() { + return pageIndex; + } + + public void setPageIndex(Integer pageIndex) { + this.pageIndex = pageIndex; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getMimeType() { + return mimeType; + } + + public void setMimeType(String mimeType) { + this.mimeType = mimeType; + } + + public String getSourcePath() { + return sourcePath; + } + + public void setSourcePath(String sourcePath) { + this.sourcePath = sourcePath; + } + + public String getDataUrl() { + return dataUrl; + } + + public void setDataUrl(String dataUrl) { + this.dataUrl = dataUrl; + } + + public List getBoundingBox() { + return boundingBox; + } + + public void setBoundingBox(List boundingBox) { + this.boundingBox = boundingBox == null ? new ArrayList() : boundingBox; + } + + public List getCaptions() { + return captions; + } + + public void setCaptions(List captions) { + this.captions = captions == null ? 
new ArrayList() : captions; + } + + public List getFootnotes() { + return footnotes; + } + + public void setFootnotes(List footnotes) { + this.footnotes = footnotes == null ? new ArrayList() : footnotes; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentPage.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentPage.java new file mode 100644 index 0000000..a6a2f18 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentPage.java @@ -0,0 +1,50 @@ +package com.easyagents.document.core.model; + +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * 页面信息。 + * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentPage { + + private Integer pageIndex; + private Double width; + private Double height; + private Map metadata = new LinkedHashMap(); + + public Integer getPageIndex() { + return pageIndex; + } + + public void setPageIndex(Integer pageIndex) { + this.pageIndex = pageIndex; + } + + public Double getWidth() { + return width; + } + + public void setWidth(Double width) { + this.width = width; + } + + public Double getHeight() { + return height; + } + + public void setHeight(Double height) { + this.height = height; + } + + public Map getMetadata() { + return metadata; + } + + public void setMetadata(Map metadata) { + this.metadata = metadata == null ? 
new LinkedHashMap() : metadata; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentTable.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentTable.java new file mode 100644 index 0000000..4c688cf --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/DocumentTable.java @@ -0,0 +1,68 @@ +package com.easyagents.document.core.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * 表格结果。 + * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentTable { + + private Integer pageIndex; + private List boundingBox = new ArrayList(); + private String html; + private String imagePath; + private List captions = new ArrayList(); + private List footnotes = new ArrayList(); + + public Integer getPageIndex() { + return pageIndex; + } + + public void setPageIndex(Integer pageIndex) { + this.pageIndex = pageIndex; + } + + public List getBoundingBox() { + return boundingBox; + } + + public void setBoundingBox(List boundingBox) { + this.boundingBox = boundingBox == null ? new ArrayList() : boundingBox; + } + + public String getHtml() { + return html; + } + + public void setHtml(String html) { + this.html = html; + } + + public String getImagePath() { + return imagePath; + } + + public void setImagePath(String imagePath) { + this.imagePath = imagePath; + } + + public List getCaptions() { + return captions; + } + + public void setCaptions(List captions) { + this.captions = captions == null ? new ArrayList() : captions; + } + + public List getFootnotes() { + return footnotes; + } + + public void setFootnotes(List footnotes) { + this.footnotes = footnotes == null ? 
new ArrayList() : footnotes; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseArtifacts.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseArtifacts.java new file mode 100644 index 0000000..7e08d2b --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseArtifacts.java @@ -0,0 +1,63 @@ +package com.easyagents.document.core.model; + +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * 解析工件集合。 + * + * @author Codex + * @since 2026-04-14 + */ +public class ParseArtifacts { + + private Object middleJson; + private Object contentList; + private Object modelOutput; + private Map extraJsonArtifacts = new LinkedHashMap(); + private Map extraBinaryArtifacts = new LinkedHashMap(); + + public Object getMiddleJson() { + return middleJson; + } + + public void setMiddleJson(Object middleJson) { + this.middleJson = middleJson; + } + + public Object getContentList() { + return contentList; + } + + public void setContentList(Object contentList) { + this.contentList = contentList; + } + + public Object getModelOutput() { + return modelOutput; + } + + public void setModelOutput(Object modelOutput) { + this.modelOutput = modelOutput; + } + + public Map getExtraJsonArtifacts() { + return extraJsonArtifacts; + } + + public void setExtraJsonArtifacts(Map extraJsonArtifacts) { + this.extraJsonArtifacts = extraJsonArtifacts == null + ? new LinkedHashMap() + : extraJsonArtifacts; + } + + public Map getExtraBinaryArtifacts() { + return extraBinaryArtifacts; + } + + public void setExtraBinaryArtifacts(Map extraBinaryArtifacts) { + this.extraBinaryArtifacts = extraBinaryArtifacts == null + ? 
new LinkedHashMap() + : extraBinaryArtifacts; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseFile.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseFile.java new file mode 100644 index 0000000..e303390 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseFile.java @@ -0,0 +1,77 @@ +package com.easyagents.document.core.model; + +import java.util.Arrays; + +/** + * 待解析文件。 + * + * @author Codex + * @since 2026-04-14 + */ +public class ParseFile { + + private String fileName; + private byte[] content; + private String contentType; + + /** + * 创建文件对象。 + * + * @param fileName 文件名 + * @param content 文件内容 + * @return 文件对象 + */ + public static ParseFile of(String fileName, byte[] content) { + return of(fileName, content, null); + } + + /** + * 创建文件对象。 + * + * @param fileName 文件名 + * @param content 文件内容 + * @param contentType MIME 类型 + * @return 文件对象 + */ + public static ParseFile of(String fileName, byte[] content, String contentType) { + ParseFile parseFile = new ParseFile(); + parseFile.setFileName(fileName); + parseFile.setContent(content); + parseFile.setContentType(contentType); + return parseFile; + } + + public String getFileName() { + return fileName; + } + + public void setFileName(String fileName) { + this.fileName = fileName; + } + + public byte[] getContent() { + return content; + } + + public void setContent(byte[] content) { + this.content = content; + } + + public String getContentType() { + return contentType; + } + + public void setContentType(String contentType) { + this.contentType = contentType; + } + + @Override + public String toString() { + return "ParseFile{" + + "fileName='" + fileName + '\'' + + ", contentLength=" + (content == null ? 
0 : content.length) + + ", contentType='" + contentType + '\'' + + ", checksum=" + Arrays.hashCode(content) + + '}'; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseRequest.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseRequest.java new file mode 100644 index 0000000..1c815a0 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseRequest.java @@ -0,0 +1,144 @@ +package com.easyagents.document.core.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * 统一解析请求。 + * + * @author Codex + * @since 2026-04-14 + */ +public class ParseRequest { + + private List files = new ArrayList(); + private String backend; + private String parseMethod = "auto"; + private List languages = new ArrayList(); + private Boolean formulaEnabled = true; + private Boolean tableEnabled = true; + private Integer startPageIndex = 0; + private Integer endPageIndex = 99999; + private Boolean returnMarkdown = true; + private Boolean returnMiddleJson = true; + private Boolean returnContentList = true; + private Boolean returnModelOutput = false; + private Boolean returnImages = true; + + /** + * 追加待解析文件。 + * + * @param parseFile 文件 + * @return 当前请求 + */ + public ParseRequest addFile(ParseFile parseFile) { + if (parseFile != null) { + this.files.add(parseFile); + } + return this; + } + + public List getFiles() { + return files; + } + + public void setFiles(List files) { + this.files = files == null ? 
new ArrayList() : files; + } + + public String getBackend() { + return backend; + } + + public void setBackend(String backend) { + this.backend = backend; + } + + public String getParseMethod() { + return parseMethod; + } + + public void setParseMethod(String parseMethod) { + this.parseMethod = parseMethod; + } + + public List getLanguages() { + return languages; + } + + public void setLanguages(List languages) { + this.languages = languages == null ? new ArrayList() : languages; + } + + public Boolean getFormulaEnabled() { + return formulaEnabled; + } + + public void setFormulaEnabled(Boolean formulaEnabled) { + this.formulaEnabled = formulaEnabled; + } + + public Boolean getTableEnabled() { + return tableEnabled; + } + + public void setTableEnabled(Boolean tableEnabled) { + this.tableEnabled = tableEnabled; + } + + public Integer getStartPageIndex() { + return startPageIndex; + } + + public void setStartPageIndex(Integer startPageIndex) { + this.startPageIndex = startPageIndex; + } + + public Integer getEndPageIndex() { + return endPageIndex; + } + + public void setEndPageIndex(Integer endPageIndex) { + this.endPageIndex = endPageIndex; + } + + public Boolean getReturnMarkdown() { + return returnMarkdown; + } + + public void setReturnMarkdown(Boolean returnMarkdown) { + this.returnMarkdown = returnMarkdown; + } + + public Boolean getReturnMiddleJson() { + return returnMiddleJson; + } + + public void setReturnMiddleJson(Boolean returnMiddleJson) { + this.returnMiddleJson = returnMiddleJson; + } + + public Boolean getReturnContentList() { + return returnContentList; + } + + public void setReturnContentList(Boolean returnContentList) { + this.returnContentList = returnContentList; + } + + public Boolean getReturnModelOutput() { + return returnModelOutput; + } + + public void setReturnModelOutput(Boolean returnModelOutput) { + this.returnModelOutput = returnModelOutput; + } + + public Boolean getReturnImages() { + return returnImages; + } + + public void 
setReturnImages(Boolean returnImages) { + this.returnImages = returnImages; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResponse.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResponse.java new file mode 100644 index 0000000..16c3cad --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResponse.java @@ -0,0 +1,41 @@ +package com.easyagents.document.core.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * 批量解析响应。 + * + * @author Codex + * @since 2026-04-14 + */ +public class ParseResponse { + + private String backend; + private String version; + private List results = new ArrayList(); + + public String getBackend() { + return backend; + } + + public void setBackend(String backend) { + this.backend = backend; + } + + public String getVersion() { + return version; + } + + public void setVersion(String version) { + this.version = version; + } + + public List getResults() { + return results; + } + + public void setResults(List results) { + this.results = results == null ? 
new ArrayList() : results; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResult.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResult.java new file mode 100644 index 0000000..b007f53 --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseResult.java @@ -0,0 +1,106 @@ +package com.easyagents.document.core.model; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * 单文件解析结果。 + * + * @author Codex + * @since 2026-04-14 + */ +public class ParseResult { + + private String fileName; + private String plainText; + private String markdown; + private List pages = new ArrayList(); + private List blocks = new ArrayList(); + private List tables = new ArrayList(); + private List images = new ArrayList(); + private List warnings = new ArrayList(); + private Map metadata = new LinkedHashMap(); + private ParseArtifacts artifacts = new ParseArtifacts(); + + public String getFileName() { + return fileName; + } + + public void setFileName(String fileName) { + this.fileName = fileName; + } + + public String getPlainText() { + return plainText; + } + + public void setPlainText(String plainText) { + this.plainText = plainText; + } + + public String getMarkdown() { + return markdown; + } + + public void setMarkdown(String markdown) { + this.markdown = markdown; + } + + public List getPages() { + return pages; + } + + public void setPages(List pages) { + this.pages = pages == null ? new ArrayList() : pages; + } + + public List getBlocks() { + return blocks; + } + + public void setBlocks(List blocks) { + this.blocks = blocks == null ? new ArrayList() : blocks; + } + + public List getTables() { + return tables; + } + + public void setTables(List tables) { + this.tables = tables == null ? 
new ArrayList() : tables; + } + + public List getImages() { + return images; + } + + public void setImages(List images) { + this.images = images == null ? new ArrayList() : images; + } + + public List getWarnings() { + return warnings; + } + + public void setWarnings(List warnings) { + this.warnings = warnings == null ? new ArrayList() : warnings; + } + + public Map getMetadata() { + return metadata; + } + + public void setMetadata(Map metadata) { + this.metadata = metadata == null ? new LinkedHashMap() : metadata; + } + + public ParseArtifacts getArtifacts() { + return artifacts; + } + + public void setArtifacts(ParseArtifacts artifacts) { + this.artifacts = artifacts == null ? new ParseArtifacts() : artifacts; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskInfo.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskInfo.java new file mode 100644 index 0000000..603d2bb --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskInfo.java @@ -0,0 +1,58 @@ +package com.easyagents.document.core.model; + +/** + * 异步任务聚合查询结果。 + * + *

<p>该对象在任务状态字段基础上,按需附带最终解析结果。 + * 当任务尚未完成时只返回状态信息;当任务已完成时可同时返回结果内容。</p>

+ * + * @author Codex + * @since 2026-04-14 + */ +public class ParseTaskInfo extends ParseTaskStatus { + + private ParseResponse result; + + /** + * 基于任务状态创建聚合查询结果。 + * + * @param status 任务状态 + * @return 聚合查询结果 + */ + public static ParseTaskInfo fromStatus(ParseTaskStatus status) { + ParseTaskInfo taskInfo = new ParseTaskInfo(); + if (status == null) { + return taskInfo; + } + taskInfo.setTaskId(status.getTaskId()); + taskInfo.setStatus(status.getStatus()); + taskInfo.setBackend(status.getBackend()); + taskInfo.setFileNames(status.getFileNames()); + taskInfo.setCreatedAt(status.getCreatedAt()); + taskInfo.setStartedAt(status.getStartedAt()); + taskInfo.setCompletedAt(status.getCompletedAt()); + taskInfo.setError(status.getError()); + taskInfo.setStatusUrl(status.getStatusUrl()); + taskInfo.setResultUrl(status.getResultUrl()); + taskInfo.setQueuedAhead(status.getQueuedAhead()); + return taskInfo; + } + + /** + * 获取最终解析结果。 + * + * @return 解析结果;任务未完成时可能为空 + */ + public ParseResponse getResult() { + return result; + } + + /** + * 设置最终解析结果。 + * + * @param result 解析结果 + */ + public void setResult(ParseResponse result) { + this.result = result; + } +} diff --git a/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskStatus.java b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskStatus.java new file mode 100644 index 0000000..4c742cb --- /dev/null +++ b/easy-agents-document/easy-agents-document-core/src/main/java/com/easyagents/document/core/model/ParseTaskStatus.java @@ -0,0 +1,113 @@ +package com.easyagents.document.core.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * 异步任务状态。 + * + * @author Codex + * @since 2026-04-14 + */ +public class ParseTaskStatus { + + private String taskId; + private String status; + private String backend; + private List fileNames = new ArrayList(); + private String createdAt; + private String startedAt; + private 
String completedAt; + private String error; + private String statusUrl; + private String resultUrl; + private Integer queuedAhead; + + public String getTaskId() { + return taskId; + } + + public void setTaskId(String taskId) { + this.taskId = taskId; + } + + public String getStatus() { + return status; + } + + public void setStatus(String status) { + this.status = status; + } + + public String getBackend() { + return backend; + } + + public void setBackend(String backend) { + this.backend = backend; + } + + public List getFileNames() { + return fileNames; + } + + public void setFileNames(List fileNames) { + this.fileNames = fileNames == null ? new ArrayList() : fileNames; + } + + public String getCreatedAt() { + return createdAt; + } + + public void setCreatedAt(String createdAt) { + this.createdAt = createdAt; + } + + public String getStartedAt() { + return startedAt; + } + + public void setStartedAt(String startedAt) { + this.startedAt = startedAt; + } + + public String getCompletedAt() { + return completedAt; + } + + public void setCompletedAt(String completedAt) { + this.completedAt = completedAt; + } + + public String getError() { + return error; + } + + public void setError(String error) { + this.error = error; + } + + public String getStatusUrl() { + return statusUrl; + } + + public void setStatusUrl(String statusUrl) { + this.statusUrl = statusUrl; + } + + public String getResultUrl() { + return resultUrl; + } + + public void setResultUrl(String resultUrl) { + this.resultUrl = resultUrl; + } + + public Integer getQueuedAhead() { + return queuedAhead; + } + + public void setQueuedAhead(Integer queuedAhead) { + this.queuedAhead = queuedAhead; + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/pom.xml b/easy-agents-document/easy-agents-document-pdf/pom.xml new file mode 100644 index 0000000..92c4430 --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/pom.xml @@ -0,0 +1,44 @@ + + + 4.0.0 + + + com.easyagents + easy-agents-document + 
${revision} + + + easy-agents-document-pdf + easy-agents-document-pdf + + + 8 + 8 + UTF-8 + + + + + com.easyagents + easy-agents-document-core + + + + com.easyagents + easy-agents-core + + + + com.alibaba.fastjson2 + fastjson2 + + + + junit + junit + test + + + diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentParseService.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentParseService.java new file mode 100644 index 0000000..6908ca4 --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentParseService.java @@ -0,0 +1,12 @@ +package com.easyagents.document.pdf; + +import com.easyagents.document.core.DocumentParseService; + +/** + * PDF 文档解析服务。 + * + * @author Codex + * @since 2026-04-14 + */ +public interface PdfDocumentParseService extends DocumentParseService { +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentProvider.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentProvider.java new file mode 100644 index 0000000..eac0683 --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/PdfDocumentProvider.java @@ -0,0 +1,17 @@ +package com.easyagents.document.pdf; + +/** + * PDF provider SPI。 + * + * @author Codex + * @since 2026-04-14 + */ +public interface PdfDocumentProvider extends PdfDocumentParseService { + + /** + * 获取 provider 标识。 + * + * @return provider 名称 + */ + String getProvider(); +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java new file mode 100644 index 0000000..7f97a1c --- /dev/null +++ 
b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java @@ -0,0 +1,830 @@ +package com.easyagents.document.pdf.mineru; + +import com.alibaba.fastjson2.JSON; +import com.alibaba.fastjson2.JSONArray; +import com.alibaba.fastjson2.JSONObject; +import com.easyagents.core.util.StringUtil; +import com.easyagents.document.core.exception.DocumentParseException; +import com.easyagents.document.core.model.DocumentBlock; +import com.easyagents.document.core.model.DocumentImage; +import com.easyagents.document.core.model.DocumentPage; +import com.easyagents.document.core.model.DocumentTable; +import com.easyagents.document.core.model.ParseArtifacts; +import com.easyagents.document.core.model.ParseRequest; +import com.easyagents.document.core.model.ParseResponse; +import com.easyagents.document.core.model.ParseResult; +import com.easyagents.document.core.model.ParseTaskStatus; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URLConnection; +import java.util.ArrayList; +import java.util.Base64; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +/** + * MinerU 原始协议与统一模型之间的映射器。 + * + * @author Codex + * @since 2026-04-14 + */ +public class MineruMapper { + + private final MineruProperties properties; + + /** + * 创建映射器。 + * + * @param properties MinerU 配置 + */ + public MineruMapper(MineruProperties properties) { + this.properties = properties; + } + + /** + * 构建同步请求表单字段。 + * + * @param request 解析请求 + * @return 表单字段 + */ + public Map> buildSyncFormFields(ParseRequest request) { + Map> fields = buildBaseFormFields(request); + putSingleValue(fields, "return_md", String.valueOf(isTrue(request.getReturnMarkdown()))); + putSingleValue(fields, "return_middle_json", String.valueOf(isTrue(request.getReturnMiddleJson()))); + putSingleValue(fields, 
"return_content_list", String.valueOf(isTrue(request.getReturnContentList()))); + putSingleValue(fields, "return_model_output", String.valueOf(isTrue(request.getReturnModelOutput()))); + putSingleValue(fields, "return_images", String.valueOf(isTrue(request.getReturnImages()))); + putSingleValue(fields, "response_format_zip", "false"); + return fields; + } + + /** + * 构建异步请求表单字段。 + * + * @param request 解析请求 + * @return 表单字段 + */ + public Map> buildAsyncFormFields(ParseRequest request) { + Map> fields = buildBaseFormFields(request); + // 异步结果固定按全量 ZIP 返回,避免超大结果通过 JSON 传输。 + putSingleValue(fields, "return_md", "true"); + putSingleValue(fields, "return_middle_json", "true"); + putSingleValue(fields, "return_content_list", "true"); + putSingleValue(fields, "return_model_output", "true"); + putSingleValue(fields, "return_images", "true"); + putSingleValue(fields, "response_format_zip", "true"); + return fields; + } + + /** + * 将原始 JSON 转为 MinerU 任务状态 DTO。 + * + * @param jsonObject 原始 JSON + * @return 任务状态 DTO + */ + public MineruTaskStatus toTaskStatus(JSONObject jsonObject) { + MineruTaskStatus taskStatus = new MineruTaskStatus(); + taskStatus.setTaskId(jsonObject.getString("task_id")); + taskStatus.setStatus(jsonObject.getString("status")); + taskStatus.setBackend(jsonObject.getString("backend")); + taskStatus.setFileNames(toStringList(jsonObject.getJSONArray("file_names"))); + taskStatus.setCreatedAt(jsonObject.getString("created_at")); + taskStatus.setStartedAt(jsonObject.getString("started_at")); + taskStatus.setCompletedAt(jsonObject.getString("completed_at")); + taskStatus.setError(jsonObject.getString("error")); + taskStatus.setStatusUrl(jsonObject.getString("status_url")); + taskStatus.setResultUrl(jsonObject.getString("result_url")); + taskStatus.setQueuedAhead(jsonObject.getInteger("queued_ahead")); + taskStatus.setVersion(jsonObject.getString("version")); + taskStatus.setMessage(jsonObject.getString("message")); + return taskStatus; + } + + /** + * 将原始 JSON 
转为 MinerU 结果 DTO。 + * + * @param jsonObject 原始 JSON + * @return 结果 DTO + */ + public MineruResultPayload toResultPayload(JSONObject jsonObject) { + MineruResultPayload payload = new MineruResultPayload(); + payload.setBackend(jsonObject.getString("backend")); + payload.setVersion(jsonObject.getString("version")); + Map results = new LinkedHashMap(); + JSONObject resultJson = jsonObject.getJSONObject("results"); + if (resultJson != null) { + for (String key : resultJson.keySet()) { + results.put(key, resultJson.getJSONObject(key)); + } + } + payload.setResults(results); + return payload; + } + + /** + * 将 MinerU 任务状态转为统一模型。 + * + * @param taskStatus 原始任务状态 + * @return 统一任务状态 + */ + public ParseTaskStatus toParseTaskStatus(MineruTaskStatus taskStatus) { + ParseTaskStatus status = new ParseTaskStatus(); + status.setTaskId(taskStatus.getTaskId()); + status.setStatus(taskStatus.getStatus()); + status.setBackend(taskStatus.getBackend()); + status.setFileNames(taskStatus.getFileNames()); + status.setCreatedAt(taskStatus.getCreatedAt()); + status.setStartedAt(taskStatus.getStartedAt()); + status.setCompletedAt(taskStatus.getCompletedAt()); + status.setError(taskStatus.getError()); + status.setStatusUrl(taskStatus.getStatusUrl()); + status.setResultUrl(taskStatus.getResultUrl()); + status.setQueuedAhead(taskStatus.getQueuedAhead()); + return status; + } + + /** + * 将同步 JSON 结果转为统一响应。 + * + * @param payload MinerU 结果 DTO + * @return 统一响应 + */ + public ParseResponse toParseResponse(MineruResultPayload payload) { + ParseResponse response = new ParseResponse(); + response.setBackend(payload.getBackend()); + response.setVersion(payload.getVersion()); + List parseResults = new ArrayList(); + for (Map.Entry entry : payload.getResults().entrySet()) { + parseResults.add(mapSingleResult(entry.getKey(), entry.getValue())); + } + response.setResults(parseResults); + return response; + } + + /** + * 将 ZIP 结果转为统一响应。 + * + * @param zipBytes ZIP 二进制 + * @return 统一响应 + */ + public 
ParseResponse fromZip(byte[] zipBytes) { + Map bundles = unzip(zipBytes); + if (bundles.isEmpty()) { + throw new DocumentParseException("MinerU ZIP result does not contain any parse artifacts"); + } + ParseResponse response = new ParseResponse(); + List parseResults = new ArrayList(); + for (Map.Entry entry : bundles.entrySet()) { + parseResults.add(mapZipBundle(entry.getKey(), entry.getValue())); + } + response.setResults(parseResults); + return response; + } + + /** + * 使用异步任务状态和 ZIP 内部工件回填响应元数据。 + * + * @param response 统一响应 + * @param backend 任务状态中的 backend + * @param version 任务状态中的 version + */ + public void enrichAsyncResponse(ParseResponse response, String backend, String version) { + if (response == null) { + return; + } + response.setBackend(StringUtil.hasText(backend) ? backend : resolveBackendFromResults(response)); + String resolvedVersion = StringUtil.hasText(version) ? version : resolveVersionFromResults(response); + response.setVersion(resolvedVersion); + } + + private Map> buildBaseFormFields(ParseRequest request) { + Map> fields = new LinkedHashMap>(); + putSingleValue(fields, "backend", StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend()); + putSingleValue(fields, "parse_method", StringUtil.hasText(request.getParseMethod()) ? 
request.getParseMethod() : properties.getDefaultParseMethod()); + putSingleValue(fields, "formula_enable", String.valueOf(boolOrDefault(request.getFormulaEnabled(), properties.getDefaultFormulaEnable()))); + putSingleValue(fields, "table_enable", String.valueOf(boolOrDefault(request.getTableEnabled(), properties.getDefaultTableEnable()))); + putSingleValue(fields, "start_page_id", String.valueOf(intOrDefault(request.getStartPageIndex(), 0))); + putSingleValue(fields, "end_page_id", String.valueOf(intOrDefault(request.getEndPageIndex(), 99999))); + List languages = request.getLanguages(); + if (languages == null || languages.isEmpty()) { + languages = properties.getDefaultLangList(); + } + if (languages != null && !languages.isEmpty()) { + // MinerU 通过重复的 lang_list 表单字段接收多语言参数。 + fields.put("lang_list", new ArrayList(languages)); + } + return fields; + } + + private void putSingleValue(Map> fields, String key, String value) { + List values = new ArrayList(1); + values.add(value); + fields.put(key, values); + } + + private ParseResult mapSingleResult(String fileName, JSONObject fileResult) { + ParseResult result = new ParseResult(); + result.setFileName(fileName); + result.setMarkdown(fileResult.getString("md_content")); + result.setPlainText(result.getMarkdown()); + + ParseArtifacts artifacts = new ParseArtifacts(); + artifacts.setMiddleJson(fileResult.get("middle_json")); + artifacts.setContentList(fileResult.get("content_list")); + artifacts.setModelOutput(fileResult.get("model_output")); + result.setArtifacts(artifacts); + + Map imageDataUrls = toStringMap(fileResult.getJSONObject("images")); + applyStructuredArtifacts(result, imageDataUrls); + if (result.getMarkdown() == null && result.getArtifacts().getMiddleJson() == null && result.getArtifacts().getContentList() == null) { + result.getWarnings().add("MinerU did not return markdown, middle_json or content_list"); + } + return result; + } + + private ParseResult mapZipBundle(String fileName, ZipArtifactBundle 
bundle) { + ParseResult result = new ParseResult(); + result.setFileName(fileName); + + String markdown = firstText(bundle.entriesBySuffix, ".md"); + result.setMarkdown(markdown); + result.setPlainText(markdown); + + ParseArtifacts artifacts = new ParseArtifacts(); + JSONObject middleJson = firstJsonObject(bundle.entriesBySuffix, "_middle.json"); + JSONArray contentList = firstJsonArray(bundle.entriesBySuffix, "_content_list.json"); + JSONObject modelOutput = firstJsonObject(bundle.entriesBySuffix, "_model.json"); + artifacts.setMiddleJson(middleJson); + artifacts.setContentList(contentList); + artifacts.setModelOutput(modelOutput); + + JSONArray contentListV2 = firstJsonArray(bundle.entriesBySuffix, "_content_list_v2.json"); + if (contentListV2 != null) { + artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2); + } + + for (Map.Entry entry : bundle.otherBinaryEntries.entrySet()) { + artifacts.getExtraBinaryArtifacts().put(entry.getKey(), entry.getValue()); + } + result.setArtifacts(artifacts); + + Map imageDataUrls = new LinkedHashMap(); + for (Map.Entry imageEntry : bundle.images.entrySet()) { + imageDataUrls.put(imageEntry.getKey(), toDataUrl(imageEntry.getKey(), imageEntry.getValue())); + } + applyStructuredArtifacts(result, imageDataUrls); + + if (markdown == null && middleJson == null && contentList == null) { + throw new DocumentParseException("MinerU ZIP result missing critical artifacts for file: " + fileName); + } + return result; + } + + private void applyStructuredArtifacts(ParseResult result, Map imageDataUrls) { + JSONObject middleJson = asObject(result.getArtifacts().getMiddleJson()); + JSONArray contentList = asArray(result.getArtifacts().getContentList()); + + if (middleJson != null) { + fillPages(result, middleJson); + result.getMetadata().put("middleBackend", middleJson.getString("_backend")); + result.getMetadata().put("middleVersion", middleJson.getString("_version_name")); + } + + if (contentList != null) { + 
fillFromContentList(result, contentList, imageDataUrls); + } else if (middleJson != null) { + fillFromMiddleJson(result, middleJson, imageDataUrls); + } + + if ((result.getImages() == null || result.getImages().isEmpty()) && imageDataUrls != null && !imageDataUrls.isEmpty()) { + for (Map.Entry entry : imageDataUrls.entrySet()) { + DocumentImage image = new DocumentImage(); + image.setName(baseName(entry.getKey())); + image.setSourcePath(entry.getKey()); + image.setDataUrl(entry.getValue()); + image.setMimeType(detectMimeType(entry.getKey())); + result.getImages().add(image); + } + } + } + + private void fillPages(ParseResult result, JSONObject middleJson) { + JSONArray pdfInfo = middleJson.getJSONArray("pdf_info"); + if (pdfInfo == null) { + return; + } + List pages = new ArrayList(); + for (int index = 0; index < pdfInfo.size(); index++) { + JSONObject pageJson = pdfInfo.getJSONObject(index); + DocumentPage page = new DocumentPage(); + page.setPageIndex(pageJson.getInteger("page_idx")); + JSONArray pageSize = pageJson.getJSONArray("page_size"); + if (pageSize != null && pageSize.size() >= 2) { + page.setWidth(pageSize.getDouble(0)); + page.setHeight(pageSize.getDouble(1)); + } + page.getMetadata().put("raw", pageJson); + pages.add(page); + } + result.setPages(pages); + } + + private void fillFromContentList(ParseResult result, JSONArray contentList, Map imageDataUrls) { + for (int index = 0; index < contentList.size(); index++) { + JSONObject item = contentList.getJSONObject(index); + if (item == null) { + continue; + } + DocumentBlock block = new DocumentBlock(); + block.setType(item.getString("type")); + block.setPageIndex(item.getInteger("page_idx")); + block.setBoundingBox(toDoubleList(item.getJSONArray("bbox"))); + block.setLevel(item.getInteger("text_level")); + block.setText(extractBlockText(item)); + block.setHtml(item.getString("table_body")); + block.setImagePath(item.getString("img_path")); + block.getMetadata().put("raw", item); + 
result.getBlocks().add(block); + + if ("table".equals(item.getString("type"))) { + DocumentTable table = new DocumentTable(); + table.setPageIndex(item.getInteger("page_idx")); + table.setBoundingBox(toDoubleList(item.getJSONArray("bbox"))); + table.setHtml(item.getString("table_body")); + table.setImagePath(item.getString("img_path")); + table.setCaptions(toStringList(item.getJSONArray("table_caption"))); + table.setFootnotes(toStringList(item.getJSONArray("table_footnote"))); + result.getTables().add(table); + } + + if (isVisualType(item.getString("type"))) { + DocumentImage image = new DocumentImage(); + image.setPageIndex(item.getInteger("page_idx")); + image.setBoundingBox(toDoubleList(item.getJSONArray("bbox"))); + image.setSourcePath(item.getString("img_path")); + image.setName(baseName(item.getString("img_path"))); + image.setMimeType(detectMimeType(item.getString("img_path"))); + image.setCaptions(extractCaptions(item)); + image.setFootnotes(extractFootnotes(item)); + image.setDataUrl(matchDataUrl(item.getString("img_path"), imageDataUrls)); + result.getImages().add(image); + } + } + } + + private void fillFromMiddleJson(ParseResult result, JSONObject middleJson, Map imageDataUrls) { + JSONArray pages = middleJson.getJSONArray("pdf_info"); + if (pages == null) { + return; + } + for (int pageIndex = 0; pageIndex < pages.size(); pageIndex++) { + JSONObject page = pages.getJSONObject(pageIndex); + fillBlocksFromMiddlePage(result, page.getJSONArray("para_blocks"), page.getInteger("page_idx")); + fillVisualsFromMiddlePage(result, page.getJSONArray("tables"), page.getInteger("page_idx"), true, imageDataUrls); + fillVisualsFromMiddlePage(result, page.getJSONArray("images"), page.getInteger("page_idx"), false, imageDataUrls); + } + } + + private void fillBlocksFromMiddlePage(ParseResult result, JSONArray blocks, Integer pageIndex) { + if (blocks == null) { + return; + } + for (int index = 0; index < blocks.size(); index++) { + JSONObject blockJson = 
blocks.getJSONObject(index); + if (blockJson == null) { + continue; + } + DocumentBlock block = new DocumentBlock(); + block.setType(blockJson.getString("type")); + block.setPageIndex(pageIndex); + block.setBoundingBox(toDoubleList(blockJson.getJSONArray("bbox"))); + block.setText(extractTextFromMiddleBlock(blockJson)); + block.setImagePath(extractImagePathFromMiddleBlock(blockJson)); + block.getMetadata().put("raw", blockJson); + result.getBlocks().add(block); + } + } + + private void fillVisualsFromMiddlePage(ParseResult result, JSONArray blocks, Integer pageIndex, boolean table, Map imageDataUrls) { + if (blocks == null) { + return; + } + for (int index = 0; index < blocks.size(); index++) { + JSONObject blockJson = blocks.getJSONObject(index); + if (blockJson == null) { + continue; + } + if (table) { + DocumentTable documentTable = new DocumentTable(); + documentTable.setPageIndex(pageIndex); + documentTable.setBoundingBox(toDoubleList(blockJson.getJSONArray("bbox"))); + documentTable.setCaptions(extractTextsByType(blockJson, "table_caption")); + documentTable.setFootnotes(extractTextsByType(blockJson, "table_footnote")); + documentTable.setImagePath(extractImagePathByType(blockJson, "table_body")); + result.getTables().add(documentTable); + } else { + DocumentImage documentImage = new DocumentImage(); + documentImage.setPageIndex(pageIndex); + documentImage.setBoundingBox(toDoubleList(blockJson.getJSONArray("bbox"))); + documentImage.setCaptions(extractTextsByType(blockJson, "image_caption")); + documentImage.setFootnotes(extractTextsByType(blockJson, "image_footnote")); + documentImage.setSourcePath(extractImagePathByType(blockJson, "image_body")); + documentImage.setName(baseName(documentImage.getSourcePath())); + documentImage.setMimeType(detectMimeType(documentImage.getSourcePath())); + documentImage.setDataUrl(matchDataUrl(documentImage.getSourcePath(), imageDataUrls)); + result.getImages().add(documentImage); + } + } + } + + private String 
resolveBackendFromResults(ParseResponse response) { + if (response.getResults() == null || response.getResults().isEmpty()) { + return properties.getDefaultBackend(); + } + for (ParseResult result : response.getResults()) { + Object middleBackend = result.getMetadata().get("middleBackend"); + if (middleBackend instanceof String && StringUtil.hasText((String) middleBackend)) { + return (String) middleBackend; + } + } + return properties.getDefaultBackend(); + } + + private String resolveVersionFromResults(ParseResponse response) { + if (response.getResults() == null || response.getResults().isEmpty()) { + return null; + } + for (ParseResult result : response.getResults()) { + Object middleVersion = result.getMetadata().get("middleVersion"); + if (middleVersion instanceof String && StringUtil.hasText((String) middleVersion)) { + return (String) middleVersion; + } + } + return null; + } + + private Map unzip(byte[] zipBytes) { + Map bundles = new LinkedHashMap(); + try (ZipInputStream zipInputStream = new ZipInputStream(new ByteArrayInputStream(zipBytes))) { + ZipEntry entry; + while ((entry = zipInputStream.getNextEntry()) != null) { + if (entry.isDirectory()) { + continue; + } + byte[] entryBytes = readBytes(zipInputStream); + String entryName = entry.getName(); + String fileName = resolveFileName(entryName); + ZipArtifactBundle bundle = bundles.get(fileName); + if (bundle == null) { + bundle = new ZipArtifactBundle(); + bundles.put(fileName, bundle); + } + if (entryName.contains("/images/")) { + bundle.images.put(entryName, entryBytes); + } else if (entryName.endsWith(".md") + || entryName.endsWith("_middle.json") + || entryName.endsWith("_content_list.json") + || entryName.endsWith("_content_list_v2.json") + || entryName.endsWith("_model.json")) { + bundle.entriesBySuffix.put(entryName, entryBytes); + } else { + bundle.otherBinaryEntries.put(entryName, entryBytes); + } + } + } catch (IOException exception) { + throw new DocumentParseException("Failed to unzip 
MinerU result", exception); + } + return bundles; + } + + private byte[] readBytes(ZipInputStream zipInputStream) throws IOException { + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + byte[] buffer = new byte[8192]; + int length; + while ((length = zipInputStream.read(buffer)) >= 0) { + outputStream.write(buffer, 0, length); + } + return outputStream.toByteArray(); + } + + private String resolveFileName(String entryName) { + String[] segments = entryName.split("/"); + if (segments.length > 0 && StringUtil.hasText(segments[0])) { + return segments[0]; + } + String fileName = baseName(entryName); + int dotIndex = fileName.indexOf('.'); + return dotIndex > 0 ? fileName.substring(0, dotIndex) : fileName; + } + + private String firstText(Map entries, String suffix) { + for (Map.Entry entry : entries.entrySet()) { + if (entry.getKey().endsWith(suffix)) { + return new String(entry.getValue(), java.nio.charset.StandardCharsets.UTF_8); + } + } + return null; + } + + private JSONObject firstJsonObject(Map entries, String suffix) { + String text = firstText(entries, suffix); + if (!StringUtil.hasText(text)) { + return null; + } + return JSON.parseObject(text); + } + + private JSONArray firstJsonArray(Map entries, String suffix) { + String text = firstText(entries, suffix); + if (!StringUtil.hasText(text)) { + return null; + } + return JSON.parseArray(text); + } + + private JSONObject asObject(Object value) { + if (value instanceof JSONObject) { + return (JSONObject) value; + } + if (value == null) { + return null; + } + return JSON.parseObject(JSON.toJSONString(value)); + } + + private JSONArray asArray(Object value) { + if (value instanceof JSONArray) { + return (JSONArray) value; + } + if (value == null) { + return null; + } + return JSON.parseArray(JSON.toJSONString(value)); + } + + private List toStringList(JSONArray jsonArray) { + if (jsonArray == null || jsonArray.isEmpty()) { + return new ArrayList(); + } + List values = new ArrayList(); + for (int index = 0; index < jsonArray.size(); 
index++) { + values.add(jsonArray.getString(index)); + } + return values; + } + + private Map toStringMap(JSONObject jsonObject) { + if (jsonObject == null || jsonObject.isEmpty()) { + return new LinkedHashMap(); + } + Map values = new LinkedHashMap(); + for (String key : jsonObject.keySet()) { + values.put(key, jsonObject.getString(key)); + } + return values; + } + + private List toDoubleList(JSONArray jsonArray) { + if (jsonArray == null || jsonArray.isEmpty()) { + return new ArrayList(); + } + List values = new ArrayList(); + for (int index = 0; index < jsonArray.size(); index++) { + values.add(jsonArray.getDouble(index)); + } + return values; + } + + private List extractCaptions(JSONObject item) { + List texts = new ArrayList(); + texts.addAll(toStringList(item.getJSONArray("image_caption"))); + texts.addAll(toStringList(item.getJSONArray("table_caption"))); + return texts; + } + + private List extractFootnotes(JSONObject item) { + List texts = new ArrayList(); + texts.addAll(toStringList(item.getJSONArray("image_footnote"))); + texts.addAll(toStringList(item.getJSONArray("table_footnote"))); + return texts; + } + + private boolean isVisualType(String type) { + return "image".equals(type) || "table".equals(type) || "chart".equals(type) || "seal".equals(type); + } + + private String extractBlockText(JSONObject item) { + String type = item.getString("type"); + if ("text".equals(type) || "header".equals(type) || "footer".equals(type) + || "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type) + || "equation".equals(type)) { + return item.getString("text"); + } + if ("list".equals(type)) { + return joinList(toStringList(item.getJSONArray("list_items"))); + } + if ("code".equals(type)) { + return item.getString("code_body"); + } + if ("image".equals(type)) { + return joinList(toStringList(item.getJSONArray("image_caption"))); + } + if ("table".equals(type)) { + return joinList(toStringList(item.getJSONArray("table_caption"))); + } + 
return item.getString("text"); + } + + private String extractTextFromMiddleBlock(JSONObject blockJson) { + List texts = new ArrayList(); + JSONArray blocks = blockJson.getJSONArray("blocks"); + if (blocks == null) { + return null; + } + for (int blockIndex = 0; blockIndex < blocks.size(); blockIndex++) { + JSONObject childBlock = blocks.getJSONObject(blockIndex); + JSONArray lines = childBlock.getJSONArray("lines"); + if (lines == null) { + continue; + } + for (int lineIndex = 0; lineIndex < lines.size(); lineIndex++) { + JSONObject line = lines.getJSONObject(lineIndex); + JSONArray spans = line.getJSONArray("spans"); + if (spans == null) { + continue; + } + for (int spanIndex = 0; spanIndex < spans.size(); spanIndex++) { + JSONObject span = spans.getJSONObject(spanIndex); + if (span.containsKey("content")) { + texts.add(span.getString("content")); + } + } + } + } + return joinList(texts); + } + + private String extractImagePathFromMiddleBlock(JSONObject blockJson) { + JSONArray blocks = blockJson.getJSONArray("blocks"); + if (blocks == null) { + return null; + } + for (int blockIndex = 0; blockIndex < blocks.size(); blockIndex++) { + JSONObject childBlock = blocks.getJSONObject(blockIndex); + JSONArray lines = childBlock.getJSONArray("lines"); + if (lines == null) { + continue; + } + for (int lineIndex = 0; lineIndex < lines.size(); lineIndex++) { + JSONObject line = lines.getJSONObject(lineIndex); + JSONArray spans = line.getJSONArray("spans"); + if (spans == null) { + continue; + } + for (int spanIndex = 0; spanIndex < spans.size(); spanIndex++) { + JSONObject span = spans.getJSONObject(spanIndex); + if (span.containsKey("img_path")) { + return span.getString("img_path"); + } + } + } + } + return null; + } + + private List extractTextsByType(JSONObject visualBlock, String expectedType) { + List texts = new ArrayList(); + JSONArray blocks = visualBlock.getJSONArray("blocks"); + if (blocks == null) { + return texts; + } + for (int blockIndex = 0; blockIndex < 
blocks.size(); blockIndex++) { + JSONObject childBlock = blocks.getJSONObject(blockIndex); + if (!expectedType.equals(childBlock.getString("type"))) { + continue; + } + JSONArray lines = childBlock.getJSONArray("lines"); + if (lines == null) { + continue; + } + for (int lineIndex = 0; lineIndex < lines.size(); lineIndex++) { + JSONObject line = lines.getJSONObject(lineIndex); + JSONArray spans = line.getJSONArray("spans"); + if (spans == null) { + continue; + } + for (int spanIndex = 0; spanIndex < spans.size(); spanIndex++) { + JSONObject span = spans.getJSONObject(spanIndex); + if (span.containsKey("content")) { + texts.add(span.getString("content")); + } + } + } + } + return texts; + } + + private String extractImagePathByType(JSONObject visualBlock, String expectedType) { + JSONArray blocks = visualBlock.getJSONArray("blocks"); + if (blocks == null) { + return null; + } + for (int blockIndex = 0; blockIndex < blocks.size(); blockIndex++) { + JSONObject childBlock = blocks.getJSONObject(blockIndex); + if (!expectedType.equals(childBlock.getString("type"))) { + continue; + } + JSONArray lines = childBlock.getJSONArray("lines"); + if (lines == null) { + continue; + } + for (int lineIndex = 0; lineIndex < lines.size(); lineIndex++) { + JSONObject line = lines.getJSONObject(lineIndex); + JSONArray spans = line.getJSONArray("spans"); + if (spans == null) { + continue; + } + for (int spanIndex = 0; spanIndex < spans.size(); spanIndex++) { + JSONObject span = spans.getJSONObject(spanIndex); + if (span.containsKey("img_path")) { + return span.getString("img_path"); + } + } + } + } + return null; + } + + private String matchDataUrl(String imagePath, Map imageDataUrls) { + if (imageDataUrls == null || imageDataUrls.isEmpty()) { + return null; + } + if (StringUtil.hasText(imagePath) && imageDataUrls.containsKey(imagePath)) { + return imageDataUrls.get(imagePath); + } + String baseName = baseName(imagePath); + for (Map.Entry entry : imageDataUrls.entrySet()) { + if 
(baseName != null && baseName.equals(baseName(entry.getKey()))) { + return entry.getValue(); + } + } + return null; + } + + private String baseName(String path) { + if (!StringUtil.hasText(path)) { + return null; + } + int slashIndex = path.lastIndexOf('/'); + return slashIndex >= 0 ? path.substring(slashIndex + 1) : path; + } + + private String detectMimeType(String path) { + if (!StringUtil.hasText(path)) { + return null; + } + String mimeType = URLConnection.guessContentTypeFromName(path); + return StringUtil.hasText(mimeType) ? mimeType : "application/octet-stream"; + } + + private String toDataUrl(String path, byte[] content) { + return "data:" + detectMimeType(path) + ";base64," + Base64.getEncoder().encodeToString(content); + } + + private String joinList(List values) { + if (values == null || values.isEmpty()) { + return null; + } + StringBuilder builder = new StringBuilder(); + for (int index = 0; index < values.size(); index++) { + if (index > 0) { + builder.append('\n'); + } + builder.append(values.get(index)); + } + return builder.toString(); + } + + private boolean boolOrDefault(Boolean value, Boolean defaultValue) { + return value == null ? isTrue(defaultValue) : value; + } + + private boolean isTrue(Boolean value) { + return value != null && value; + } + + private int intOrDefault(Integer value, int defaultValue) { + return value == null ? 
defaultValue : value; + } + + private static class ZipArtifactBundle { + private final Map entriesBySuffix = new LinkedHashMap(); + private final Map images = new LinkedHashMap(); + private final Map otherBinaryEntries = new LinkedHashMap(); + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfClient.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfClient.java new file mode 100644 index 0000000..53f6cf9 --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfClient.java @@ -0,0 +1,211 @@ +package com.easyagents.document.pdf.mineru; + +import com.alibaba.fastjson2.JSON; +import com.alibaba.fastjson2.JSONObject; +import com.easyagents.core.util.StringUtil; +import com.easyagents.document.core.exception.DocumentParseException; +import com.easyagents.document.core.model.ParseFile; +import com.easyagents.document.core.model.ParseRequest; +import okhttp3.MediaType; +import okhttp3.MultipartBody; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import okhttp3.RequestBody; +import okhttp3.Response; +import okhttp3.ResponseBody; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +/** + * MinerU HTTP 客户端。 + * + * @author Codex + * @since 2026-04-14 + */ +public class MineruPdfClient { + + private static final MediaType DEFAULT_PDF_MEDIA_TYPE = MediaType.parse("application/pdf"); + + private final String baseUrl; + private final OkHttpClient okHttpClient; + private final MineruMapper mineruMapper; + + /** + * 创建客户端。 + * + * @param properties MinerU 配置 + * @param mineruMapper DTO 映射器 + */ + public MineruPdfClient(MineruProperties properties, MineruMapper mineruMapper) { + this( + properties, + new OkHttpClient.Builder() + .connectTimeout(properties.getConnectTimeoutMs(), TimeUnit.MILLISECONDS) + 
.readTimeout(properties.getReadTimeoutMs(), TimeUnit.MILLISECONDS) + .writeTimeout(properties.getWriteTimeoutMs(), TimeUnit.MILLISECONDS) + .build(), + mineruMapper + ); + } + + /** + * 创建客户端。 + * + * @param properties MinerU 配置 + * @param okHttpClient HTTP 客户端 + * @param mineruMapper DTO 映射器 + */ + public MineruPdfClient(MineruProperties properties, OkHttpClient okHttpClient, MineruMapper mineruMapper) { + if (properties == null || !StringUtil.hasText(properties.getBaseUrl())) { + throw new IllegalArgumentException("MinerU baseUrl must not be empty"); + } + this.baseUrl = normalizeBaseUrl(properties.getBaseUrl()); + this.okHttpClient = okHttpClient; + this.mineruMapper = mineruMapper; + } + + /** + * 调用同步解析接口。 + * + * @param request 解析请求 + * @return 原始结果 + */ + public MineruResultPayload parse(ParseRequest request) { + return mineruMapper.toResultPayload(executeJsonMultipart("/file_parse", request, buildSyncFormFields(request))); + } + + /** + * 提交异步解析任务。 + * + * @param request 解析请求 + * @return 原始任务状态 + */ + public MineruTaskStatus submit(ParseRequest request) { + return mineruMapper.toTaskStatus(executeJsonMultipart("/tasks", request, buildAsyncFormFields(request))); + } + + /** + * 查询任务状态。 + * + * @param taskId 任务 ID + * @return 原始任务状态 + */ + public MineruTaskStatus queryTask(String taskId) { + return mineruMapper.toTaskStatus(executeJsonGet("/tasks/" + taskId)); + } + + /** + * 下载异步结果 ZIP。 + * + * @param taskId 任务 ID + * @return ZIP 二进制 + */ + public byte[] queryResultZip(String taskId) { + String path = "/tasks/" + taskId + "/result"; + Request request = new Request.Builder().url(baseUrl + path).get().build(); + try (Response response = okHttpClient.newCall(request).execute()) { + ResponseBody body = response.body(); + byte[] responseBytes = body == null ? 
new byte[0] : body.bytes(); + if (!response.isSuccessful()) { + throw buildHttpException(path, response.code(), responseBytes); + } + String contentType = response.header("Content-Type"); + if (contentType != null && contentType.contains("application/json")) { + JSONObject jsonObject = JSON.parseObject(new String(responseBytes)); + throw new DocumentParseException("MinerU async result is not ready: " + jsonObject.toJSONString()); + } + if (responseBytes.length < 2 || responseBytes[0] != 'P' || responseBytes[1] != 'K') { + throw new DocumentParseException("MinerU async result is not a valid ZIP payload"); + } + return responseBytes; + } catch (IOException exception) { + throw new DocumentParseException("Failed to query MinerU result ZIP", exception); + } + } + + protected JSONObject executeJsonMultipart(String path, ParseRequest request, Map> fields) { + MultipartBody.Builder formBuilder = new MultipartBody.Builder().setType(MultipartBody.FORM); + appendFiles(formBuilder, request.getFiles()); + appendStringFields(formBuilder, fields); + Request httpRequest = new Request.Builder() + .url(baseUrl + path) + .post(formBuilder.build()) + .build(); + return executeJsonRequest(path, httpRequest); + } + + protected JSONObject executeJsonGet(String path) { + Request request = new Request.Builder().url(baseUrl + path).get().build(); + return executeJsonRequest(path, request); + } + + protected JSONObject executeJsonRequest(String path, Request request) { + try (Response response = okHttpClient.newCall(request).execute()) { + ResponseBody body = response.body(); + String bodyText = body == null ? "" : body.string(); + if (!response.isSuccessful()) { + throw buildHttpException(path, response.code(), bodyText == null ? 
new byte[0] : bodyText.getBytes()); + } + return JSON.parseObject(bodyText); + } catch (IOException exception) { + throw new DocumentParseException("Failed to call MinerU endpoint: " + path, exception); + } + } + + private void appendFiles(MultipartBody.Builder formBuilder, List files) { + if (files == null || files.isEmpty()) { + throw new IllegalArgumentException("Parse request must contain at least one file"); + } + for (ParseFile file : files) { + if (file == null || !StringUtil.hasText(file.getFileName()) || file.getContent() == null) { + throw new IllegalArgumentException("Parse request contains an invalid file"); + } + MediaType mediaType = StringUtil.hasText(file.getContentType()) + ? MediaType.parse(file.getContentType()) + : DEFAULT_PDF_MEDIA_TYPE; + formBuilder.addFormDataPart( + "files", + file.getFileName(), + RequestBody.create(file.getContent(), mediaType) + ); + } + } + + private void appendStringFields(MultipartBody.Builder formBuilder, Map> fields) { + for (Map.Entry> entry : fields.entrySet()) { + if (entry.getValue() == null) { + continue; + } + for (String value : entry.getValue()) { + if (value != null) { + formBuilder.addFormDataPart(entry.getKey(), value); + } + } + } + } + + private Map> buildSyncFormFields(ParseRequest request) { + return mineruMapper.buildSyncFormFields(request); + } + + private Map> buildAsyncFormFields(ParseRequest request) { + return mineruMapper.buildAsyncFormFields(request); + } + + private DocumentParseException buildHttpException(String path, int statusCode, byte[] bodyBytes) { + String bodyText = bodyBytes == null ? 
"" : new String(bodyBytes); + return new DocumentParseException( + "MinerU request failed: path=" + path + ", status=" + statusCode + ", body=" + bodyText + ); + } + + private String normalizeBaseUrl(String baseUrl) { + if (baseUrl.endsWith("/")) { + return baseUrl.substring(0, baseUrl.length() - 1); + } + return baseUrl; + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java new file mode 100644 index 0000000..9aa7e2e --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java @@ -0,0 +1,171 @@ +package com.easyagents.document.pdf.mineru; + +import com.easyagents.core.util.StringUtil; +import com.easyagents.document.core.exception.DocumentParseException; +import com.easyagents.document.core.model.ParseRequest; +import com.easyagents.document.core.model.ParseResponse; +import com.easyagents.document.core.model.ParseTaskInfo; +import com.easyagents.document.core.model.ParseTaskStatus; +import com.easyagents.document.pdf.PdfDocumentProvider; + +import java.util.ArrayList; + +/** + * 基于 MinerU API 的 PDF 解析服务。 + * + * @author Codex + * @since 2026-04-14 + */ +public class MineruPdfDocumentParseService implements PdfDocumentProvider { + + public static final String PROVIDER_NAME = "mineru"; + + private final MineruProperties properties; + private final MineruPdfClient client; + private final MineruMapper mapper; + + /** + * 创建默认服务实例。 + * + * @param properties MinerU 配置 + */ + public MineruPdfDocumentParseService(MineruProperties properties) { + this(properties, new MineruMapper(properties)); + } + + /** + * 创建默认服务实例。 + * + * @param properties MinerU 配置 + * @param mapper 结果映射器 + */ + public MineruPdfDocumentParseService(MineruProperties properties, 
MineruMapper mapper) { + this(properties, new MineruPdfClient(properties, mapper), mapper); + } + + /** + * 创建服务实例。 + * + * @param properties MinerU 配置 + * @param client HTTP 客户端 + * @param mapper 结果映射器 + */ + public MineruPdfDocumentParseService(MineruProperties properties, MineruPdfClient client, MineruMapper mapper) { + this.properties = properties; + this.client = client; + this.mapper = mapper; + } + + @Override + public String getProvider() { + return PROVIDER_NAME; + } + + @Override + public ParseResponse parse(ParseRequest request) { + ParseRequest normalizedRequest = normalizeRequest(request); + return mapper.toParseResponse(client.parse(normalizedRequest)); + } + + @Override + public ParseTaskStatus submit(ParseRequest request) { + ParseRequest normalizedRequest = normalizeRequest(request); + // 异步结果固定走全量 ZIP,调用方无需传入裁剪参数。 + normalizedRequest.setReturnMarkdown(true); + normalizedRequest.setReturnMiddleJson(true); + normalizedRequest.setReturnContentList(true); + normalizedRequest.setReturnModelOutput(true); + normalizedRequest.setReturnImages(true); + return mapper.toParseTaskStatus(client.submit(normalizedRequest)); + } + + @Override + public ParseTaskStatus queryTask(String taskId) { + validateTaskId(taskId); + return mapper.toParseTaskStatus(client.queryTask(taskId)); + } + + @Override + public ParseResponse queryResult(String taskId) { + validateTaskId(taskId); + MineruTaskStatus taskStatus = waitForTaskCompleted(taskId); + ParseResponse response = mapper.fromZip(client.queryResultZip(taskId)); + mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion()); + return response; + } + + @Override + public ParseTaskInfo queryTaskInfo(String taskId) { + validateTaskId(taskId); + MineruTaskStatus taskStatus = client.queryTask(taskId); + ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(mapper.toParseTaskStatus(taskStatus)); + if ("completed".equalsIgnoreCase(taskStatus.getStatus())) { + ParseResponse response = 
mapper.fromZip(client.queryResultZip(taskId)); + mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion()); + taskInfo.setResult(response); + } + return taskInfo; + } + + private ParseRequest normalizeRequest(ParseRequest request) { + if (request == null) { + throw new IllegalArgumentException("ParseRequest must not be null"); + } + if (request.getFiles() == null || request.getFiles().isEmpty()) { + throw new IllegalArgumentException("ParseRequest files must not be empty"); + } + ParseRequest normalizedRequest = new ParseRequest(); + normalizedRequest.setFiles(new ArrayList<>(request.getFiles())); + normalizedRequest.setBackend(StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend()); + normalizedRequest.setParseMethod(StringUtil.hasText(request.getParseMethod()) ? request.getParseMethod() : properties.getDefaultParseMethod()); + normalizedRequest.setLanguages( + request.getLanguages() == null || request.getLanguages().isEmpty() + ? new ArrayList(properties.getDefaultLangList()) + : new ArrayList(request.getLanguages()) + ); + normalizedRequest.setFormulaEnabled(request.getFormulaEnabled() == null ? properties.getDefaultFormulaEnable() : request.getFormulaEnabled()); + normalizedRequest.setTableEnabled(request.getTableEnabled() == null ? properties.getDefaultTableEnable() : request.getTableEnabled()); + normalizedRequest.setStartPageIndex(request.getStartPageIndex() == null ? 0 : request.getStartPageIndex()); + normalizedRequest.setEndPageIndex(request.getEndPageIndex() == null ? 
99999 : request.getEndPageIndex()); + normalizedRequest.setReturnMarkdown(request.getReturnMarkdown()); + normalizedRequest.setReturnMiddleJson(request.getReturnMiddleJson()); + normalizedRequest.setReturnContentList(request.getReturnContentList()); + normalizedRequest.setReturnModelOutput(request.getReturnModelOutput()); + normalizedRequest.setReturnImages(request.getReturnImages()); + return normalizedRequest; + } + + private void validateTaskId(String taskId) { + if (!StringUtil.hasText(taskId)) { + throw new IllegalArgumentException("taskId must not be empty"); + } + } + + /** + * 轮询任务状态直到完成或失败。 + * + * @param taskId 任务 ID + * @return 已完成的任务状态 + */ + private MineruTaskStatus waitForTaskCompleted(String taskId) { + long deadline = System.currentTimeMillis() + properties.getResultTimeoutMs(); + while (true) { + MineruTaskStatus taskStatus = client.queryTask(taskId); + if ("completed".equals(taskStatus.getStatus())) { + return taskStatus; + } + if ("failed".equals(taskStatus.getStatus())) { + throw new DocumentParseException("MinerU task failed: " + taskStatus.getError()); + } + if (System.currentTimeMillis() >= deadline) { + throw new DocumentParseException("MinerU task result timeout: " + taskId); + } + try { + Thread.sleep(properties.getPollIntervalMs()); + } catch (InterruptedException exception) { + Thread.currentThread().interrupt(); + throw new DocumentParseException("Interrupted while waiting for MinerU task: " + taskId, exception); + } + } + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruProperties.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruProperties.java new file mode 100644 index 0000000..c5efd78 --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruProperties.java @@ -0,0 +1,116 @@ +package com.easyagents.document.pdf.mineru; + +import 
java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * MinerU PDF 解析配置。 + * + * @author Codex + * @since 2026-04-14 + */ +public class MineruProperties { + + private String baseUrl; + private Integer connectTimeoutMs = 3000; + private Integer readTimeoutMs = 600000; + private Integer writeTimeoutMs = 600000; + private Integer pollIntervalMs = 1000; + private Integer resultTimeoutMs = 1800000; + private String defaultBackend = "vlm-http-client"; + private String defaultParseMethod = "auto"; + private List defaultLangList = new ArrayList(Arrays.asList("ch")); + private Boolean defaultFormulaEnable = true; + private Boolean defaultTableEnable = true; + + public String getBaseUrl() { + return baseUrl; + } + + public void setBaseUrl(String baseUrl) { + this.baseUrl = baseUrl; + } + + public Integer getConnectTimeoutMs() { + return connectTimeoutMs; + } + + public void setConnectTimeoutMs(Integer connectTimeoutMs) { + this.connectTimeoutMs = connectTimeoutMs; + } + + public Integer getReadTimeoutMs() { + return readTimeoutMs; + } + + public void setReadTimeoutMs(Integer readTimeoutMs) { + this.readTimeoutMs = readTimeoutMs; + } + + public Integer getWriteTimeoutMs() { + return writeTimeoutMs; + } + + public void setWriteTimeoutMs(Integer writeTimeoutMs) { + this.writeTimeoutMs = writeTimeoutMs; + } + + public Integer getPollIntervalMs() { + return pollIntervalMs; + } + + public void setPollIntervalMs(Integer pollIntervalMs) { + this.pollIntervalMs = pollIntervalMs; + } + + public Integer getResultTimeoutMs() { + return resultTimeoutMs; + } + + public void setResultTimeoutMs(Integer resultTimeoutMs) { + this.resultTimeoutMs = resultTimeoutMs; + } + + public String getDefaultBackend() { + return defaultBackend; + } + + public void setDefaultBackend(String defaultBackend) { + this.defaultBackend = defaultBackend; + } + + public String getDefaultParseMethod() { + return defaultParseMethod; + } + + public void setDefaultParseMethod(String 
defaultParseMethod) { + this.defaultParseMethod = defaultParseMethod; + } + + public List getDefaultLangList() { + return defaultLangList; + } + + public void setDefaultLangList(List defaultLangList) { + this.defaultLangList = defaultLangList == null + ? new ArrayList(Arrays.asList("ch")) + : defaultLangList; + } + + public Boolean getDefaultFormulaEnable() { + return defaultFormulaEnable; + } + + public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) { + this.defaultFormulaEnable = defaultFormulaEnable; + } + + public Boolean getDefaultTableEnable() { + return defaultTableEnable; + } + + public void setDefaultTableEnable(Boolean defaultTableEnable) { + this.defaultTableEnable = defaultTableEnable; + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruResultPayload.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruResultPayload.java new file mode 100644 index 0000000..7ed2b9f --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruResultPayload.java @@ -0,0 +1,43 @@ +package com.easyagents.document.pdf.mineru; + +import com.alibaba.fastjson2.JSONObject; + +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * MinerU 结果载荷。 + * + * @author Codex + * @since 2026-04-14 + */ +public class MineruResultPayload { + + private String backend; + private String version; + private Map results = new LinkedHashMap(); + + public String getBackend() { + return backend; + } + + public void setBackend(String backend) { + this.backend = backend; + } + + public String getVersion() { + return version; + } + + public void setVersion(String version) { + this.version = version; + } + + public Map getResults() { + return results; + } + + public void setResults(Map results) { + this.results = results == null ? 
new LinkedHashMap() : results; + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruTaskStatus.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruTaskStatus.java new file mode 100644 index 0000000..99b476b --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruTaskStatus.java @@ -0,0 +1,131 @@ +package com.easyagents.document.pdf.mineru; + +import java.util.ArrayList; +import java.util.List; + +/** + * MinerU 原始任务状态。 + * + * @author Codex + * @since 2026-04-14 + */ +public class MineruTaskStatus { + + private String taskId; + private String status; + private String backend; + private List fileNames = new ArrayList(); + private String createdAt; + private String startedAt; + private String completedAt; + private String error; + private String statusUrl; + private String resultUrl; + private Integer queuedAhead; + private String version; + private String message; + + public String getTaskId() { + return taskId; + } + + public void setTaskId(String taskId) { + this.taskId = taskId; + } + + public String getStatus() { + return status; + } + + public void setStatus(String status) { + this.status = status; + } + + public String getBackend() { + return backend; + } + + public void setBackend(String backend) { + this.backend = backend; + } + + public List getFileNames() { + return fileNames; + } + + public void setFileNames(List fileNames) { + this.fileNames = fileNames == null ? 
new ArrayList() : fileNames; + } + + public String getCreatedAt() { + return createdAt; + } + + public void setCreatedAt(String createdAt) { + this.createdAt = createdAt; + } + + public String getStartedAt() { + return startedAt; + } + + public void setStartedAt(String startedAt) { + this.startedAt = startedAt; + } + + public String getCompletedAt() { + return completedAt; + } + + public void setCompletedAt(String completedAt) { + this.completedAt = completedAt; + } + + public String getError() { + return error; + } + + public void setError(String error) { + this.error = error; + } + + public String getStatusUrl() { + return statusUrl; + } + + public void setStatusUrl(String statusUrl) { + this.statusUrl = statusUrl; + } + + public String getResultUrl() { + return resultUrl; + } + + public void setResultUrl(String resultUrl) { + this.resultUrl = resultUrl; + } + + public Integer getQueuedAhead() { + return queuedAhead; + } + + public void setQueuedAhead(Integer queuedAhead) { + this.queuedAhead = queuedAhead; + } + + public String getVersion() { + return version; + } + + public void setVersion(String version) { + this.version = version; + } + + public String getMessage() { + return message; + } + + public void setMessage(String message) { + this.message = message; + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java new file mode 100644 index 0000000..b59c7a0 --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java @@ -0,0 +1,366 @@ +package com.easyagents.document.pdf.mineru; + +import com.alibaba.fastjson2.JSONArray; +import com.alibaba.fastjson2.JSONObject; +import com.easyagents.document.core.exception.DocumentParseException; +import com.easyagents.document.core.model.ParseRequest; 
+import com.easyagents.document.core.model.ParseResponse; +import com.easyagents.document.core.model.ParseResult; +import org.junit.Assert; +import org.junit.Test; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; + +/** + * MinerU 结果映射测试。 + * + * @author Codex + * @since 2026-04-14 + */ +public class MineruMapperTest { + + @Test + public void shouldMapSyncResponse() { + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruResultPayload payload = mapper.toResultPayload(syncPayload()); + + ParseResponse response = mapper.toParseResponse(payload); + Assert.assertEquals("vlm-http-client", response.getBackend()); + Assert.assertEquals(1, response.getResults().size()); + + ParseResult result = response.getResults().get(0); + Assert.assertEquals("demo", result.getFileName()); + Assert.assertEquals("# title", result.getMarkdown()); + Assert.assertEquals(1, result.getPages().size()); + Assert.assertFalse(result.getBlocks().isEmpty()); + Assert.assertEquals(1, result.getTables().size()); + Assert.assertEquals(2, result.getImages().size()); + Assert.assertNotNull(result.getArtifacts().getMiddleJson()); + Assert.assertNotNull(result.getArtifacts().getContentList()); + } + + @Test + public void shouldMapZipResponse() throws IOException { + MineruMapper mapper = new MineruMapper(defaultProperties()); + ParseResponse response = mapper.fromZip(buildZip(true)); + + Assert.assertEquals(1, response.getResults().size()); + ParseResult result = response.getResults().get(0); + Assert.assertEquals("demo", result.getFileName()); + Assert.assertEquals("# title", result.getPlainText()); + Assert.assertEquals(1, result.getTables().size()); + Assert.assertEquals(2, result.getImages().size()); + Assert.assertNotNull(result.getArtifacts().getExtraJsonArtifacts().get("contentListV2")); + } + + @Test + 
public void shouldAvoidDuplicatedVisualsWhenFallbackToMiddleJson() { + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruResultPayload payload = mapper.toResultPayload(syncPayloadWithMiddleJsonFallback()); + + ParseResponse response = mapper.toParseResponse(payload); + + Assert.assertEquals(1, response.getResults().size()); + ParseResult result = response.getResults().get(0); + Assert.assertEquals(2, result.getBlocks().size()); + Assert.assertEquals(1, result.getTables().size()); + Assert.assertEquals(1, result.getImages().size()); + } + + @Test(expected = DocumentParseException.class) + public void shouldRejectZipWithoutCriticalArtifacts() throws IOException { + MineruMapper mapper = new MineruMapper(defaultProperties()); + mapper.fromZip(buildZip(false)); + } + + @Test + public void shouldBuildAsyncFormWithFullArtifacts() { + MineruMapper mapper = new MineruMapper(defaultProperties()); + ParseRequest request = new ParseRequest(); + request.setReturnMarkdown(false); + request.setReturnMiddleJson(false); + request.setReturnContentList(false); + request.setReturnModelOutput(false); + request.setReturnImages(false); + + Map> fields = mapper.buildAsyncFormFields(request); + + Assert.assertEquals("true", fields.get("return_md").get(0)); + Assert.assertEquals("true", fields.get("return_middle_json").get(0)); + Assert.assertEquals("true", fields.get("return_content_list").get(0)); + Assert.assertEquals("true", fields.get("return_model_output").get(0)); + Assert.assertEquals("true", fields.get("return_images").get(0)); + Assert.assertEquals("true", fields.get("response_format_zip").get(0)); + } + + @Test + public void shouldBuildRepeatedLangListFields() { + MineruMapper mapper = new MineruMapper(defaultProperties()); + ParseRequest request = new ParseRequest(); + request.setLanguages(java.util.Arrays.asList("zh", "en")); + + Map> fields = mapper.buildSyncFormFields(request); + + Assert.assertEquals(java.util.Arrays.asList("zh", "en"), 
fields.get("lang_list")); + } + + @Test + public void shouldFallbackVersionFromMiddleJsonWhenAsyncStatusVersionMissing() throws IOException { + MineruMapper mapper = new MineruMapper(defaultProperties()); + ParseResponse response = mapper.fromZip(buildZip(true)); + + mapper.enrichAsyncResponse(response, null, null); + + Assert.assertEquals("vlm", response.getBackend()); + Assert.assertEquals("3.0.9", response.getVersion()); + } + + private MineruProperties defaultProperties() { + MineruProperties properties = new MineruProperties(); + properties.setBaseUrl("http://127.0.0.1:8000"); + return properties; + } + + private JSONObject syncPayload() { + JSONObject payload = new JSONObject(); + payload.put("backend", "vlm-http-client"); + payload.put("version", "3.0.9"); + + JSONObject result = new JSONObject(); + result.put("md_content", "# title"); + result.put("middle_json", middleJson()); + result.put("content_list", contentList()); + result.put("model_output", new JSONObject()); + + JSONObject images = new JSONObject(); + images.put("figure.png", "data:image/png;base64,ZmFrZQ=="); + result.put("images", images); + + JSONObject results = new JSONObject(); + results.put("demo", result); + payload.put("results", results); + return payload; + } + + private JSONObject middleJson() { + JSONObject middleJson = new JSONObject(); + middleJson.put("_backend", "vlm"); + middleJson.put("_version_name", "3.0.9"); + + JSONObject page = new JSONObject(); + page.put("page_idx", 0); + JSONArray pageSize = new JSONArray(); + pageSize.add(1000); + pageSize.add(2000); + page.put("page_size", pageSize); + page.put("para_blocks", new JSONArray()); + + JSONArray pdfInfo = new JSONArray(); + pdfInfo.add(page); + middleJson.put("pdf_info", pdfInfo); + return middleJson; + } + + private JSONObject middleJsonForFallback() { + JSONObject middleJson = new JSONObject(); + middleJson.put("_backend", "vlm"); + middleJson.put("_version_name", "3.0.9"); + + JSONObject page = new JSONObject(); + 
page.put("page_idx", 0); + page.put("page_size", bboxPageSize()); + + JSONArray paraBlocks = new JSONArray(); + paraBlocks.add(middleBlock("table", "images/table.png")); + paraBlocks.add(middleBlock("image", "images/figure.png")); + page.put("para_blocks", paraBlocks); + + JSONArray tables = new JSONArray(); + tables.add(middleTable("images/table.png")); + page.put("tables", tables); + + JSONArray images = new JSONArray(); + images.add(middleImage("images/figure.png")); + page.put("images", images); + + JSONArray pdfInfo = new JSONArray(); + pdfInfo.add(page); + middleJson.put("pdf_info", pdfInfo); + return middleJson; + } + + private JSONArray contentList() { + JSONArray contentList = new JSONArray(); + + JSONObject title = new JSONObject(); + title.put("type", "text"); + title.put("text", "title"); + title.put("text_level", 1); + title.put("page_idx", 0); + title.put("bbox", bbox()); + contentList.add(title); + + JSONObject image = new JSONObject(); + image.put("type", "image"); + image.put("img_path", "images/figure.png"); + image.put("image_caption", new JSONArray()); + image.put("image_footnote", new JSONArray()); + image.put("page_idx", 0); + image.put("bbox", bbox()); + contentList.add(image); + + JSONObject table = new JSONObject(); + table.put("type", "table"); + table.put("img_path", "images/table.png"); + table.put("table_body", "
"); + table.put("table_caption", new JSONArray()); + table.put("table_footnote", new JSONArray()); + table.put("page_idx", 0); + table.put("bbox", bbox()); + contentList.add(table); + + return contentList; + } + + private JSONArray contentListV2() { + JSONArray contentList = new JSONArray(); + JSONObject page = new JSONObject(); + page.put("page_idx", 0); + contentList.add(page); + return contentList; + } + + private JSONArray bbox() { + JSONArray bbox = new JSONArray(); + bbox.add(1); + bbox.add(2); + bbox.add(3); + bbox.add(4); + return bbox; + } + + private JSONArray bboxPageSize() { + JSONArray bbox = new JSONArray(); + bbox.add(1000); + bbox.add(2000); + return bbox; + } + + private JSONObject syncPayloadWithMiddleJsonFallback() { + JSONObject payload = new JSONObject(); + payload.put("backend", "vlm-http-client"); + payload.put("version", "3.0.9"); + + JSONObject result = new JSONObject(); + result.put("md_content", "# fallback"); + result.put("middle_json", middleJsonForFallback()); + + JSONObject images = new JSONObject(); + images.put("figure.png", "data:image/png;base64,ZmFrZQ=="); + images.put("table.png", "data:image/png;base64,ZmFrZQ=="); + result.put("images", images); + + JSONObject results = new JSONObject(); + results.put("demo", result); + payload.put("results", results); + return payload; + } + + private JSONObject middleBlock(String type, String imagePath) { + JSONObject block = new JSONObject(); + block.put("type", type); + block.put("bbox", bbox()); + JSONArray blocks = new JSONArray(); + JSONObject childBlock = new JSONObject(); + JSONArray lines = new JSONArray(); + JSONObject line = new JSONObject(); + JSONArray spans = new JSONArray(); + + JSONObject textSpan = new JSONObject(); + textSpan.put("content", type + "-text"); + spans.add(textSpan); + + JSONObject imageSpan = new JSONObject(); + imageSpan.put("img_path", imagePath); + spans.add(imageSpan); + + line.put("spans", spans); + lines.add(line); + childBlock.put("lines", lines); + 
blocks.add(childBlock); + block.put("blocks", blocks); + return block; + } + + private JSONObject middleTable(String imagePath) { + JSONObject table = new JSONObject(); + table.put("bbox", bbox()); + JSONArray blocks = new JSONArray(); + blocks.add(visualBlock("table_caption", null, "table-caption")); + blocks.add(visualBlock("table_body", imagePath, null)); + table.put("blocks", blocks); + return table; + } + + private JSONObject middleImage(String imagePath) { + JSONObject image = new JSONObject(); + image.put("bbox", bbox()); + JSONArray blocks = new JSONArray(); + blocks.add(visualBlock("image_caption", null, "image-caption")); + blocks.add(visualBlock("image_body", imagePath, null)); + image.put("blocks", blocks); + return image; + } + + private JSONObject visualBlock(String type, String imagePath, String text) { + JSONObject block = new JSONObject(); + block.put("type", type); + JSONArray lines = new JSONArray(); + JSONObject line = new JSONObject(); + JSONArray spans = new JSONArray(); + JSONObject span = new JSONObject(); + if (imagePath != null) { + span.put("img_path", imagePath); + } + if (text != null) { + span.put("content", text); + } + spans.add(span); + line.put("spans", spans); + lines.add(line); + block.put("lines", lines); + return block; + } + + private byte[] buildZip(boolean withArtifacts) throws IOException { + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream); + if (withArtifacts) { + addEntry(zipOutputStream, "demo/vlm/demo.md", "# title"); + addEntry(zipOutputStream, "demo/vlm/demo_middle.json", middleJson().toJSONString()); + addEntry(zipOutputStream, "demo/vlm/demo_content_list.json", contentList().toJSONString()); + addEntry(zipOutputStream, "demo/vlm/demo_content_list_v2.json", contentListV2().toJSONString()); + addEntry(zipOutputStream, "demo/vlm/demo_model.json", "{}"); + } + addBinaryEntry(zipOutputStream, "demo/vlm/images/figure.png", 
"image".getBytes(StandardCharsets.UTF_8)); + addBinaryEntry(zipOutputStream, "demo/vlm/images/table.png", "image".getBytes(StandardCharsets.UTF_8)); + zipOutputStream.close(); + return outputStream.toByteArray(); + } + + private void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException { + addBinaryEntry(zipOutputStream, name, content.getBytes(StandardCharsets.UTF_8)); + } + + private void addBinaryEntry(ZipOutputStream zipOutputStream, String name, byte[] content) throws IOException { + zipOutputStream.putNextEntry(new ZipEntry(name)); + zipOutputStream.write(content); + zipOutputStream.closeEntry(); + } +} diff --git a/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseServiceTest.java b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseServiceTest.java new file mode 100644 index 0000000..8cab0be --- /dev/null +++ b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseServiceTest.java @@ -0,0 +1,271 @@ +package com.easyagents.document.pdf.mineru; + +import com.alibaba.fastjson2.JSONObject; +import com.easyagents.document.core.model.ParseFile; +import com.easyagents.document.core.model.ParseRequest; +import com.easyagents.document.core.model.ParseResponse; +import com.easyagents.document.core.model.ParseTaskInfo; +import com.easyagents.document.core.model.ParseTaskStatus; +import okhttp3.Request; +import okio.Buffer; +import org.junit.Assert; +import org.junit.Test; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; + +/** + * MinerU PDF 服务测试。 + * + * @author Codex + * @since 2026-04-14 + */ +public class MineruPdfDocumentParseServiceTest { + + @Test + public void shouldForceAsyncResultArtifacts() { + 
RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruPdfDocumentParseService service = new MineruPdfDocumentParseService(defaultProperties(), client, mapper); + + ParseRequest request = buildRequest(); + request.setReturnMarkdown(false); + request.setReturnMiddleJson(false); + request.setReturnContentList(false); + request.setReturnModelOutput(false); + request.setReturnImages(false); + + ParseTaskStatus status = service.submit(request); + + Assert.assertEquals("task-1", status.getTaskId()); + Assert.assertTrue(client.lastSubmitRequest.getReturnMarkdown()); + Assert.assertTrue(client.lastSubmitRequest.getReturnMiddleJson()); + Assert.assertTrue(client.lastSubmitRequest.getReturnContentList()); + Assert.assertTrue(client.lastSubmitRequest.getReturnModelOutput()); + Assert.assertTrue(client.lastSubmitRequest.getReturnImages()); + } + + @Test + public void shouldUseSyncResultFlagsDuringParse() { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruPdfDocumentParseService service = new MineruPdfDocumentParseService(defaultProperties(), client, mapper); + + ParseRequest request = buildRequest(); + request.setReturnMarkdown(true); + request.setReturnMiddleJson(false); + request.setReturnContentList(true); + request.setReturnModelOutput(false); + request.setReturnImages(false); + + ParseResponse response = service.parse(request); + + Assert.assertEquals(1, response.getResults().size()); + Assert.assertFalse(client.lastParseRequest.getReturnMiddleJson()); + Assert.assertFalse(client.lastParseRequest.getReturnImages()); + } + + @Test + public void shouldUseTaskMetadataWhenQueryingAsyncZipResult() { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruPdfDocumentParseService service = new 
MineruPdfDocumentParseService(defaultProperties(), client, mapper); + + ParseResponse response = service.queryResult("task-1"); + + Assert.assertEquals("vlm-http-client", response.getBackend()); + Assert.assertEquals("3.0.9", response.getVersion()); + Assert.assertEquals(1, response.getResults().size()); + Assert.assertEquals("demo", response.getResults().get(0).getFileName()); + } + + @Test + public void shouldReturnPendingStatusWithoutFetchingResultInTaskInfo() { + RecordingClient client = new RecordingClient(defaultProperties()); + client.taskStatusValue = "running"; + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruPdfDocumentParseService service = new MineruPdfDocumentParseService(defaultProperties(), client, mapper); + + ParseTaskInfo taskInfo = service.queryTaskInfo("task-1"); + + Assert.assertEquals("running", taskInfo.getStatus()); + Assert.assertNull(taskInfo.getResult()); + Assert.assertEquals(0, client.queryResultZipCount); + } + + @Test + public void shouldReturnCompletedResultInTaskInfo() { + RecordingClient client = new RecordingClient(defaultProperties()); + MineruMapper mapper = new MineruMapper(defaultProperties()); + MineruPdfDocumentParseService service = new MineruPdfDocumentParseService(defaultProperties(), client, mapper); + + ParseTaskInfo taskInfo = service.queryTaskInfo("task-1"); + + Assert.assertEquals("completed", taskInfo.getStatus()); + Assert.assertNotNull(taskInfo.getResult()); + Assert.assertEquals(1, taskInfo.getResult().getResults().size()); + Assert.assertEquals(1, client.queryResultZipCount); + } + + @Test + public void shouldSendRepeatedLangListFields() { + InspectingMultipartClient client = new InspectingMultipartClient(defaultProperties()); + ParseRequest request = buildRequest(); + request.setLanguages(java.util.Arrays.asList("zh", "en")); + + client.parse(request); + + Assert.assertEquals(2, countOccurrences(client.lastMultipartBody, "name=\"lang_list\"")); + 
Assert.assertTrue(client.lastMultipartBody.contains("\r\nzh\r\n")); + Assert.assertTrue(client.lastMultipartBody.contains("\r\nen\r\n")); + } + + private ParseRequest buildRequest() { + ParseRequest request = new ParseRequest(); + request.addFile(ParseFile.of("demo.pdf", "pdf".getBytes(StandardCharsets.UTF_8))); + return request; + } + + private MineruProperties defaultProperties() { + MineruProperties properties = new MineruProperties(); + properties.setBaseUrl("http://127.0.0.1:8000"); + properties.setResultTimeoutMs(50); + properties.setPollIntervalMs(1); + return properties; + } + + private int countOccurrences(String source, String token) { + int count = 0; + int index = 0; + while (source != null && token != null && !token.isEmpty() && (index = source.indexOf(token, index)) >= 0) { + count++; + index += token.length(); + } + return count; + } + + private static class RecordingClient extends MineruPdfClient { + + private ParseRequest lastParseRequest; + private ParseRequest lastSubmitRequest; + private String taskStatusValue = "completed"; + private int queryResultZipCount; + + private RecordingClient(MineruProperties properties) { + super(properties, new MineruMapper(properties)); + } + + @Override + public MineruResultPayload parse(ParseRequest request) { + this.lastParseRequest = request; + return new MineruMapper(testProperties()).toResultPayload(syncPayload()); + } + + @Override + public MineruTaskStatus submit(ParseRequest request) { + this.lastSubmitRequest = request; + MineruTaskStatus taskStatus = new MineruTaskStatus(); + taskStatus.setTaskId("task-1"); + taskStatus.setStatus("pending"); + return taskStatus; + } + + @Override + public MineruTaskStatus queryTask(String taskId) { + MineruTaskStatus taskStatus = new MineruTaskStatus(); + taskStatus.setTaskId(taskId); + taskStatus.setStatus(taskStatusValue); + taskStatus.setBackend("vlm-http-client"); + taskStatus.setVersion("3.0.9"); + return taskStatus; + } + + @Override + public byte[] 
queryResultZip(String taskId) { + queryResultZipCount++; + try { + return buildZipResult(); + } catch (IOException exception) { + throw new IllegalStateException("Failed to build test ZIP", exception); + } + } + + private static byte[] buildZipResult() throws IOException { + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + try (ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream)) { + addEntry(zipOutputStream, "demo/vlm/demo.md", "# title"); + addEntry(zipOutputStream, "demo/vlm/demo_middle.json", middleJson().toJSONString()); + addEntry(zipOutputStream, "demo/vlm/demo_content_list.json", contentList().toJSONString()); + } + return outputStream.toByteArray(); + } + + private static void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException { + zipOutputStream.putNextEntry(new ZipEntry(name)); + zipOutputStream.write(content.getBytes(StandardCharsets.UTF_8)); + zipOutputStream.closeEntry(); + } + + private static JSONObject syncPayload() { + JSONObject payload = new JSONObject(); + payload.put("backend", "vlm-http-client"); + payload.put("version", "3.0.9"); + + JSONObject file = new JSONObject(); + file.put("md_content", "# title"); + JSONObject results = new JSONObject(); + results.put("demo", file); + payload.put("results", results); + return payload; + } + + private static JSONObject middleJson() { + JSONObject middleJson = new JSONObject(); + middleJson.put("_backend", "vlm"); + middleJson.put("_version_name", "3.0.9"); + middleJson.put("pdf_info", new com.alibaba.fastjson2.JSONArray()); + return middleJson; + } + + private static com.alibaba.fastjson2.JSONArray contentList() { + com.alibaba.fastjson2.JSONArray contentList = new com.alibaba.fastjson2.JSONArray(); + JSONObject text = new JSONObject(); + text.put("type", "text"); + text.put("text", "title"); + text.put("page_idx", 0); + text.put("bbox", new com.alibaba.fastjson2.JSONArray()); + contentList.add(text); + return contentList; + } + 
+ private static MineruProperties testProperties() { + MineruProperties properties = new MineruProperties(); + properties.setBaseUrl("http://127.0.0.1:8000"); + return properties; + } + } + + private static class InspectingMultipartClient extends MineruPdfClient { + + private String lastMultipartBody; + + private InspectingMultipartClient(MineruProperties properties) { + super(properties, new MineruMapper(properties)); + } + + @Override + protected JSONObject executeJsonRequest(String path, Request request) { + try { + Buffer buffer = new Buffer(); + request.body().writeTo(buffer); + this.lastMultipartBody = buffer.readUtf8(); + } catch (IOException exception) { + throw new IllegalStateException("Failed to inspect multipart body", exception); + } + return new JSONObject(); + } + } +} diff --git a/easy-agents-document/pom.xml b/easy-agents-document/pom.xml new file mode 100644 index 0000000..6c6efc9 --- /dev/null +++ b/easy-agents-document/pom.xml @@ -0,0 +1,21 @@ + + + 4.0.0 + + + com.easyagents + easy-agents + ${revision} + + + easy-agents-document + pom + easy-agents-document + + + easy-agents-document-core + easy-agents-document-pdf + + diff --git a/easy-agents-rag/TECH-PLAN.md b/easy-agents-rag/TECH-PLAN.md index edc6e84..cc086eb 100644 --- a/easy-agents-rag/TECH-PLAN.md +++ b/easy-agents-rag/TECH-PLAN.md @@ -38,16 +38,6 @@ - 章节/问答/段落分块 - 自动推荐拆分策略 -### `easy-agents-rag-ocr` - -定位:OCR 与版面恢复能力。 - -负责内容: -- 图片/PDF OCR -- 页面版面解析 -- 标题、段落、表格等结构恢复 -- PDF 到结构化文本或 Markdown 的转换 - ### `easy-agents-rag-enhance` 定位:索引前增强能力。 @@ -84,19 +74,22 @@ - 控制器与接口 DTO - 业务库持久化 - 前端导入页面 +- OCR / PDF 解析能力 这些能力继续留在业务工程,由业务层依赖 `easy-agents-rag` 提供的能力完成编排。 +其中 OCR / PDF 解析能力改由独立的 `easy-agents-document` 能力域承接,不再归属 `easy-agents-rag`。 + ## 后续演进 后续演进顺序建议如下: 1. 完成 `rag-ingestion` 首批能力迁移并稳定对外接口 -2. 补充 `rag-ocr`,接入 OCR 与版面恢复 -3. 补充 `rag-enhance`,支持图增强、RAPTOR、索引增强 -4. 补充 `rag-retrieval`,统一查询增强与召回后处理 +2. 补充 `rag-enhance`,支持图增强、RAPTOR、索引增强 +3. 
补充 `rag-retrieval`,统一查询增强与召回后处理 整体原则: - `easy-agents-core` 保持基础抽象 - `easy-agents-rag` 聚合 RAG 领域实现 +- `easy-agents-document` 承接 OCR、版面理解与 PDF 解析等文档处理能力 - 业务工程只保留编排、持久化与产品层逻辑 diff --git a/easy-agents-rag/pom.xml b/easy-agents-rag/pom.xml index fbd265d..316ae15 100644 --- a/easy-agents-rag/pom.xml +++ b/easy-agents-rag/pom.xml @@ -17,7 +17,6 @@ easy-agents-rag-core easy-agents-rag-ingestion - easy-agents-rag-ocr easy-agents-rag-enhance easy-agents-rag-retrieval diff --git a/easy-agents-spring-boot-starter/pom.xml b/easy-agents-spring-boot-starter/pom.xml index 4a06e63..9def988 100644 --- a/easy-agents-spring-boot-starter/pom.xml +++ b/easy-agents-spring-boot-starter/pom.xml @@ -56,6 +56,11 @@ easy-agents-rag-core + + com.easyagents + easy-agents-document-pdf + + com.easyagents easy-agents-rag-ingestion diff --git a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports index bea3e22..7b154cb 100644 --- a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports +++ b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports @@ -8,3 +8,4 @@ com.easyagents.spring.boot.store.chroma.ChromaAutoConfiguration com.easyagents.spring.boot.store.elasticsearch.ElasticSearchAutoConfiguration com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration +com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration diff --git a/easy-agents-spring-boot-starter/src/test/java/com/easyagents/spring/boot/autoconfigure/StarterConditionalAutoConfigurationTest.java 
b/easy-agents-spring-boot-starter/src/test/java/com/easyagents/spring/boot/autoconfigure/StarterConditionalAutoConfigurationTest.java index 0d38fc0..16a8998 100644 --- a/easy-agents-spring-boot-starter/src/test/java/com/easyagents/spring/boot/autoconfigure/StarterConditionalAutoConfigurationTest.java +++ b/easy-agents-spring-boot-starter/src/test/java/com/easyagents/spring/boot/autoconfigure/StarterConditionalAutoConfigurationTest.java @@ -1,6 +1,9 @@ package com.easyagents.spring.boot.autoconfigure; +import com.easyagents.document.core.DocumentParseService; +import com.easyagents.document.pdf.PdfDocumentParseService; import com.easyagents.llm.ollama.OllamaChatModel; +import com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration; import com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration; import com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration; import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration; @@ -11,7 +14,12 @@ import org.springframework.boot.test.context.runner.ApplicationContextRunner; public class StarterConditionalAutoConfigurationTest { private final ApplicationContextRunner contextRunner = new ApplicationContextRunner() - .withUserConfiguration(RagIngestionAutoConfiguration.class, OllamaAutoConfiguration.class, OpenSearchAutoConfiguration.class); + .withUserConfiguration( + RagIngestionAutoConfiguration.class, + OllamaAutoConfiguration.class, + OpenSearchAutoConfiguration.class, + MineruPdfAutoConfiguration.class + ); @Test public void shouldNotCreateOptionalBeansWithoutExplicitProperties() { @@ -19,6 +27,8 @@ public class StarterConditionalAutoConfigurationTest { Assert.assertTrue(context.containsBean("ragIngestionService")); Assert.assertFalse(context.containsBean("ollamaLlm")); Assert.assertFalse(context.containsBean("openSearchVectorStore")); + Assert.assertFalse(context.containsBean("pdfDocumentParseService")); + 
Assert.assertFalse(context.containsBean("documentParseService")); }); } @@ -28,4 +38,17 @@ public class StarterConditionalAutoConfigurationTest { .withPropertyValues("easy-agents.llm.ollama.model=qwen3:8b") .run(context -> Assert.assertNotNull(context.getBean(OllamaChatModel.class))); } + + @Test + public void shouldCreateMineruDocumentBeansWhenConfigured() { + contextRunner + .withPropertyValues( + "easy-agents.document.pdf.provider=mineru", + "easy-agents.document.pdf.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api" + ) + .run(context -> { + Assert.assertNotNull(context.getBean(PdfDocumentParseService.class)); + Assert.assertNotNull(context.getBean(DocumentParseService.class)); + }); + } } diff --git a/pom.xml b/pom.xml index a957978..931f1f9 100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,7 @@ easy-agents-bom easy-agents-core + easy-agents-document easy-agents-rag easy-agents-chat easy-agents-store @@ -119,6 +120,18 @@ ${revision} + + com.easyagents + easy-agents-document-core + ${revision} + + + + com.easyagents + easy-agents-document-pdf + ${revision} + + com.easyagents easy-agents-rag-core @@ -131,12 +144,6 @@ ${revision} - - com.easyagents - easy-agents-rag-ocr - ${revision} - - com.easyagents easy-agents-rag-enhance