feat: 完成L09统一文档解析模块与MinerU PDF Provider接入

- 新增 easy-agents-document 聚合、document-core 与 document-pdf 模块

- 接入 MinerU PDF provider,支持同步解析、异步任务与 ZIP 结果映射

- 移除 easy-agents-rag-ocr 空壳并补齐 starter 自动装配
This commit is contained in:
2026-04-14 19:57:32 +08:00
parent 090eca5df5
commit aa3e90b990
34 changed files with 3280 additions and 34 deletions

View File

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor for the easy-agents-document-core module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Parent module; its version is taken from the ${revision} property. -->
<parent>
<groupId>com.easyagents</groupId>
<artifactId>easy-agents-document</artifactId>
<version>${revision}</version>
</parent>
<artifactId>easy-agents-document-core</artifactId>
<name>easy-agents-document-core</name>
<properties>
<!-- Source and target levels are both pinned to Java 8; sources are read as UTF-8. -->
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- No version is declared here; presumably it is managed by a parent dependencyManagement section (TODO confirm). -->
<dependency>
<groupId>com.easyagents</groupId>
<artifactId>easy-agents-core</artifactId>
</dependency>
</dependencies>
</project>

View File

@@ -0,0 +1,72 @@
package com.easyagents.document.core;
import com.easyagents.document.core.model.ParseRequest;
import com.easyagents.document.core.model.ParseResponse;
import com.easyagents.document.core.model.ParseTaskInfo;
import com.easyagents.document.core.model.ParseTaskStatus;
/**
 * Unified abstraction of a document parsing service.
 *
 * @author Codex
 * @since 2026-04-14
 */
public interface DocumentParseService {

    /**
     * Parses the documents synchronously and returns the result directly.
     *
     * @param request parse request
     * @return parse result
     */
    ParseResponse parse(ParseRequest request);

    /**
     * Submits a document parsing task for asynchronous processing.
     *
     * @param request parse request
     * @return task status
     */
    ParseTaskStatus submit(ParseRequest request);

    /**
     * Looks up the status of an asynchronous parsing task.
     *
     * @param taskId task ID
     * @return task status
     */
    ParseTaskStatus queryTask(String taskId);

    /**
     * Fetches the final result of an asynchronous task.
     *
     * <p>This method has "read the result" semantics: a provider may wait
     * internally until the task has finished before returning the final result.
     * It is therefore not suited to high-frequency, lightweight polling. Callers
     * that want "current status + finished result" in a single view should prefer
     * {@link #queryTaskInfo(String)}.</p>
     *
     * @param taskId task ID
     * @return parse result
     */
    ParseResponse queryResult(String taskId);

    /**
     * Aggregated query of an asynchronous task.
     *
     * <p>The task status is queried first. While the task is still being
     * processed only the current status is returned; once the status is
     * {@code completed} (compared case-insensitively) the final parse result is
     * queried as well and attached to the returned object.</p>
     *
     * <p>Note: depending on the provider this call may trigger a result download
     * or block while waiting, so it is better suited to a combined
     * "status + result" query than to high-frequency, lightweight polling.</p>
     *
     * @param taskId task ID
     * @return aggregated query result
     */
    default ParseTaskInfo queryTaskInfo(String taskId) {
        ParseTaskStatus current = queryTask(taskId);
        ParseTaskInfo aggregated = ParseTaskInfo.fromStatus(current);
        if (current == null) {
            // No status available: hand back the empty aggregate untouched.
            return aggregated;
        }
        boolean finished = "completed".equalsIgnoreCase(current.getStatus());
        if (finished) {
            aggregated.setResult(queryResult(taskId));
        }
        return aggregated;
    }
}

View File

@@ -0,0 +1,18 @@
package com.easyagents.document.core.exception;
/**
 * Unchecked exception used to signal a document parsing failure.
 *
 * @author Codex
 * @since 2026-04-14
 */
public class DocumentParseException extends RuntimeException {

    /**
     * Creates an exception that carries only a detail message.
     *
     * @param message detail message
     */
    public DocumentParseException(String message) {
        super(message);
    }

    /**
     * Creates an exception that carries a detail message and the underlying cause.
     *
     * @param message detail message
     * @param cause   original failure, kept as the exception cause
     */
    public DocumentParseException(String message, Throwable cause) {
        super(message, cause);
    }
}

View File

@@ -0,0 +1,88 @@
package com.easyagents.document.core.model;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Structured content block of a parsed document.
 *
 * <p>Plain mutable bean. The collection-valued properties are never
 * {@code null}: their setters substitute a fresh empty collection when given
 * {@code null}. Non-null collections are stored by reference, not copied.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class DocumentBlock {

    private String type;
    private Integer pageIndex;
    private String text;
    private Integer level;
    private String html;
    private String imagePath;
    private List<Double> boundingBox = new ArrayList<>();
    private Map<String, Object> metadata = new LinkedHashMap<>();

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public Integer getPageIndex() {
        return this.pageIndex;
    }

    public void setPageIndex(Integer pageIndex) {
        this.pageIndex = pageIndex;
    }

    public String getText() {
        return this.text;
    }

    public void setText(String text) {
        this.text = text;
    }

    public Integer getLevel() {
        return this.level;
    }

    public void setLevel(Integer level) {
        this.level = level;
    }

    public String getHtml() {
        return this.html;
    }

    public void setHtml(String html) {
        this.html = html;
    }

    public String getImagePath() {
        return this.imagePath;
    }

    public void setImagePath(String imagePath) {
        this.imagePath = imagePath;
    }

    public List<Double> getBoundingBox() {
        return this.boundingBox;
    }

    /**
     * Replaces the bounding box; {@code null} resets it to a new empty list.
     *
     * @param boundingBox new bounding box values, may be {@code null}
     */
    public void setBoundingBox(List<Double> boundingBox) {
        if (boundingBox == null) {
            this.boundingBox = new ArrayList<>();
        } else {
            this.boundingBox = boundingBox;
        }
    }

    public Map<String, Object> getMetadata() {
        return this.metadata;
    }

    /**
     * Replaces the metadata map; {@code null} resets it to a new empty
     * insertion-ordered map.
     *
     * @param metadata new metadata, may be {@code null}
     */
    public void setMetadata(Map<String, Object> metadata) {
        if (metadata == null) {
            this.metadata = new LinkedHashMap<>();
        } else {
            this.metadata = metadata;
        }
    }
}

View File

@@ -0,0 +1,86 @@
package com.easyagents.document.core.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Image result of a parsed document.
 *
 * <p>Plain mutable bean. The list-valued properties are never {@code null}:
 * their setters substitute a fresh empty list when given {@code null}.
 * Non-null lists are stored by reference, not copied.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class DocumentImage {

    private Integer pageIndex;
    private String name;
    private String mimeType;
    private String sourcePath;
    private String dataUrl;
    private List<Double> boundingBox = new ArrayList<>();
    private List<String> captions = new ArrayList<>();
    private List<String> footnotes = new ArrayList<>();

    public Integer getPageIndex() {
        return this.pageIndex;
    }

    public void setPageIndex(Integer pageIndex) {
        this.pageIndex = pageIndex;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getMimeType() {
        return this.mimeType;
    }

    public void setMimeType(String mimeType) {
        this.mimeType = mimeType;
    }

    public String getSourcePath() {
        return this.sourcePath;
    }

    public void setSourcePath(String sourcePath) {
        this.sourcePath = sourcePath;
    }

    public String getDataUrl() {
        return this.dataUrl;
    }

    public void setDataUrl(String dataUrl) {
        this.dataUrl = dataUrl;
    }

    public List<Double> getBoundingBox() {
        return this.boundingBox;
    }

    /**
     * Replaces the bounding box; {@code null} resets it to a new empty list.
     *
     * @param boundingBox new bounding box values, may be {@code null}
     */
    public void setBoundingBox(List<Double> boundingBox) {
        if (boundingBox == null) {
            this.boundingBox = new ArrayList<>();
        } else {
            this.boundingBox = boundingBox;
        }
    }

    public List<String> getCaptions() {
        return this.captions;
    }

    /**
     * Replaces the captions; {@code null} resets them to a new empty list.
     *
     * @param captions new captions, may be {@code null}
     */
    public void setCaptions(List<String> captions) {
        if (captions == null) {
            this.captions = new ArrayList<>();
        } else {
            this.captions = captions;
        }
    }

    public List<String> getFootnotes() {
        return this.footnotes;
    }

    /**
     * Replaces the footnotes; {@code null} resets them to a new empty list.
     *
     * @param footnotes new footnotes, may be {@code null}
     */
    public void setFootnotes(List<String> footnotes) {
        if (footnotes == null) {
            this.footnotes = new ArrayList<>();
        } else {
            this.footnotes = footnotes;
        }
    }
}

View File

@@ -0,0 +1,50 @@
package com.easyagents.document.core.model;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Page information of a parsed document.
 *
 * <p>Plain mutable bean. The metadata map is never {@code null}: its setter
 * substitutes a fresh empty map when given {@code null}.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class DocumentPage {

    private Integer pageIndex;
    private Double width;
    private Double height;
    private Map<String, Object> metadata = new LinkedHashMap<>();

    public Integer getPageIndex() {
        return this.pageIndex;
    }

    public void setPageIndex(Integer pageIndex) {
        this.pageIndex = pageIndex;
    }

    public Double getWidth() {
        return this.width;
    }

    public void setWidth(Double width) {
        this.width = width;
    }

    public Double getHeight() {
        return this.height;
    }

    public void setHeight(Double height) {
        this.height = height;
    }

    public Map<String, Object> getMetadata() {
        return this.metadata;
    }

    /**
     * Replaces the metadata map; {@code null} resets it to a new empty
     * insertion-ordered map.
     *
     * @param metadata new metadata, may be {@code null}
     */
    public void setMetadata(Map<String, Object> metadata) {
        if (metadata == null) {
            this.metadata = new LinkedHashMap<>();
        } else {
            this.metadata = metadata;
        }
    }
}

View File

@@ -0,0 +1,68 @@
package com.easyagents.document.core.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Table result of a parsed document.
 *
 * <p>Plain mutable bean. The list-valued properties are never {@code null}:
 * their setters substitute a fresh empty list when given {@code null}.
 * Non-null lists are stored by reference, not copied.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class DocumentTable {

    private Integer pageIndex;
    private List<Double> boundingBox = new ArrayList<>();
    private String html;
    private String imagePath;
    private List<String> captions = new ArrayList<>();
    private List<String> footnotes = new ArrayList<>();

    public Integer getPageIndex() {
        return this.pageIndex;
    }

    public void setPageIndex(Integer pageIndex) {
        this.pageIndex = pageIndex;
    }

    public List<Double> getBoundingBox() {
        return this.boundingBox;
    }

    /**
     * Replaces the bounding box; {@code null} resets it to a new empty list.
     *
     * @param boundingBox new bounding box values, may be {@code null}
     */
    public void setBoundingBox(List<Double> boundingBox) {
        if (boundingBox == null) {
            this.boundingBox = new ArrayList<>();
        } else {
            this.boundingBox = boundingBox;
        }
    }

    public String getHtml() {
        return this.html;
    }

    public void setHtml(String html) {
        this.html = html;
    }

    public String getImagePath() {
        return this.imagePath;
    }

    public void setImagePath(String imagePath) {
        this.imagePath = imagePath;
    }

    public List<String> getCaptions() {
        return this.captions;
    }

    /**
     * Replaces the captions; {@code null} resets them to a new empty list.
     *
     * @param captions new captions, may be {@code null}
     */
    public void setCaptions(List<String> captions) {
        if (captions == null) {
            this.captions = new ArrayList<>();
        } else {
            this.captions = captions;
        }
    }

    public List<String> getFootnotes() {
        return this.footnotes;
    }

    /**
     * Replaces the footnotes; {@code null} resets them to a new empty list.
     *
     * @param footnotes new footnotes, may be {@code null}
     */
    public void setFootnotes(List<String> footnotes) {
        if (footnotes == null) {
            this.footnotes = new ArrayList<>();
        } else {
            this.footnotes = footnotes;
        }
    }
}

View File

@@ -0,0 +1,63 @@
package com.easyagents.document.core.model;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Collection of parse artifacts.
 *
 * <p>Plain mutable bean. The two "extra" maps are never {@code null}: their
 * setters substitute a fresh empty insertion-ordered map when given
 * {@code null}. Non-null maps are stored by reference, not copied.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class ParseArtifacts {

    private Object middleJson;
    private Object contentList;
    private Object modelOutput;
    private Map<String, Object> extraJsonArtifacts = new LinkedHashMap<>();
    private Map<String, byte[]> extraBinaryArtifacts = new LinkedHashMap<>();

    public Object getMiddleJson() {
        return this.middleJson;
    }

    public void setMiddleJson(Object middleJson) {
        this.middleJson = middleJson;
    }

    public Object getContentList() {
        return this.contentList;
    }

    public void setContentList(Object contentList) {
        this.contentList = contentList;
    }

    public Object getModelOutput() {
        return this.modelOutput;
    }

    public void setModelOutput(Object modelOutput) {
        this.modelOutput = modelOutput;
    }

    public Map<String, Object> getExtraJsonArtifacts() {
        return this.extraJsonArtifacts;
    }

    /**
     * Replaces the extra JSON artifacts; {@code null} resets them to a new empty map.
     *
     * @param extraJsonArtifacts new artifacts, may be {@code null}
     */
    public void setExtraJsonArtifacts(Map<String, Object> extraJsonArtifacts) {
        if (extraJsonArtifacts == null) {
            this.extraJsonArtifacts = new LinkedHashMap<>();
        } else {
            this.extraJsonArtifacts = extraJsonArtifacts;
        }
    }

    public Map<String, byte[]> getExtraBinaryArtifacts() {
        return this.extraBinaryArtifacts;
    }

    /**
     * Replaces the extra binary artifacts; {@code null} resets them to a new empty map.
     *
     * @param extraBinaryArtifacts new artifacts, may be {@code null}
     */
    public void setExtraBinaryArtifacts(Map<String, byte[]> extraBinaryArtifacts) {
        if (extraBinaryArtifacts == null) {
            this.extraBinaryArtifacts = new LinkedHashMap<>();
        } else {
            this.extraBinaryArtifacts = extraBinaryArtifacts;
        }
    }
}

View File

@@ -0,0 +1,77 @@
package com.easyagents.document.core.model;
import java.util.Arrays;
/**
 * File to be parsed.
 *
 * <p>Plain mutable bean. The content array is stored and returned by
 * reference; no copy is made.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class ParseFile {

    private String fileName;
    private byte[] content;
    private String contentType;

    /**
     * Creates a file object without a MIME type.
     *
     * @param fileName file name
     * @param content  file content
     * @return file object whose content type is {@code null}
     */
    public static ParseFile of(String fileName, byte[] content) {
        return of(fileName, content, null);
    }

    /**
     * Creates a file object.
     *
     * @param fileName    file name
     * @param content     file content
     * @param contentType MIME type
     * @return file object
     */
    public static ParseFile of(String fileName, byte[] content, String contentType) {
        ParseFile created = new ParseFile();
        created.fileName = fileName;
        created.content = content;
        created.contentType = contentType;
        return created;
    }

    public String getFileName() {
        return this.fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public byte[] getContent() {
        return this.content;
    }

    public void setContent(byte[] content) {
        this.content = content;
    }

    public String getContentType() {
        return this.contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    /**
     * Renders a compact description that omits the raw bytes: the content is
     * represented only by its length (0 when absent) and by
     * {@link Arrays#hashCode(byte[])} of the array.
     *
     * @return textual description of this file
     */
    @Override
    public String toString() {
        int length = 0;
        if (content != null) {
            length = content.length;
        }
        StringBuilder text = new StringBuilder("ParseFile{");
        text.append("fileName='").append(fileName).append('\'');
        text.append(", contentLength=").append(length);
        text.append(", contentType='").append(contentType).append('\'');
        text.append(", checksum=").append(Arrays.hashCode(content));
        text.append('}');
        return text.toString();
    }
}

View File

@@ -0,0 +1,144 @@
package com.easyagents.document.core.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Unified parse request.
 *
 * <p>Plain mutable bean. A new instance starts with: parse method
 * {@code "auto"}, formula and table handling enabled, start page index
 * {@code 0}, end page index {@code 99999}, and every {@code return*} flag
 * enabled except {@code returnModelOutput}. The list-valued properties are
 * never {@code null}: their setters substitute a fresh empty list when given
 * {@code null}.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class ParseRequest {

    private List<ParseFile> files = new ArrayList<>();
    private String backend;
    private String parseMethod = "auto";
    private List<String> languages = new ArrayList<>();
    private Boolean formulaEnabled = true;
    private Boolean tableEnabled = true;
    private Integer startPageIndex = 0;
    private Integer endPageIndex = 99999;
    private Boolean returnMarkdown = true;
    private Boolean returnMiddleJson = true;
    private Boolean returnContentList = true;
    private Boolean returnModelOutput = false;
    private Boolean returnImages = true;

    /**
     * Appends a file to be parsed; a {@code null} file is silently ignored.
     *
     * @param parseFile file
     * @return this request, to allow chaining
     */
    public ParseRequest addFile(ParseFile parseFile) {
        if (parseFile == null) {
            return this;
        }
        this.files.add(parseFile);
        return this;
    }

    public List<ParseFile> getFiles() {
        return this.files;
    }

    /**
     * Replaces the file list; {@code null} resets it to a new empty list.
     *
     * @param files new files, may be {@code null}
     */
    public void setFiles(List<ParseFile> files) {
        if (files == null) {
            this.files = new ArrayList<>();
        } else {
            this.files = files;
        }
    }

    public String getBackend() {
        return this.backend;
    }

    public void setBackend(String backend) {
        this.backend = backend;
    }

    public String getParseMethod() {
        return this.parseMethod;
    }

    public void setParseMethod(String parseMethod) {
        this.parseMethod = parseMethod;
    }

    public List<String> getLanguages() {
        return this.languages;
    }

    /**
     * Replaces the language list; {@code null} resets it to a new empty list.
     *
     * @param languages new languages, may be {@code null}
     */
    public void setLanguages(List<String> languages) {
        if (languages == null) {
            this.languages = new ArrayList<>();
        } else {
            this.languages = languages;
        }
    }

    public Boolean getFormulaEnabled() {
        return this.formulaEnabled;
    }

    public void setFormulaEnabled(Boolean formulaEnabled) {
        this.formulaEnabled = formulaEnabled;
    }

    public Boolean getTableEnabled() {
        return this.tableEnabled;
    }

    public void setTableEnabled(Boolean tableEnabled) {
        this.tableEnabled = tableEnabled;
    }

    public Integer getStartPageIndex() {
        return this.startPageIndex;
    }

    public void setStartPageIndex(Integer startPageIndex) {
        this.startPageIndex = startPageIndex;
    }

    public Integer getEndPageIndex() {
        return this.endPageIndex;
    }

    public void setEndPageIndex(Integer endPageIndex) {
        this.endPageIndex = endPageIndex;
    }

    public Boolean getReturnMarkdown() {
        return this.returnMarkdown;
    }

    public void setReturnMarkdown(Boolean returnMarkdown) {
        this.returnMarkdown = returnMarkdown;
    }

    public Boolean getReturnMiddleJson() {
        return this.returnMiddleJson;
    }

    public void setReturnMiddleJson(Boolean returnMiddleJson) {
        this.returnMiddleJson = returnMiddleJson;
    }

    public Boolean getReturnContentList() {
        return this.returnContentList;
    }

    public void setReturnContentList(Boolean returnContentList) {
        this.returnContentList = returnContentList;
    }

    public Boolean getReturnModelOutput() {
        return this.returnModelOutput;
    }

    public void setReturnModelOutput(Boolean returnModelOutput) {
        this.returnModelOutput = returnModelOutput;
    }

    public Boolean getReturnImages() {
        return this.returnImages;
    }

    public void setReturnImages(Boolean returnImages) {
        this.returnImages = returnImages;
    }
}

View File

@@ -0,0 +1,41 @@
package com.easyagents.document.core.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Batch parse response.
 *
 * <p>Plain mutable bean. The result list is never {@code null}: its setter
 * substitutes a fresh empty list when given {@code null}.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class ParseResponse {

    private String backend;
    private String version;
    private List<ParseResult> results = new ArrayList<>();

    public String getBackend() {
        return this.backend;
    }

    public void setBackend(String backend) {
        this.backend = backend;
    }

    public String getVersion() {
        return this.version;
    }

    public void setVersion(String version) {
        this.version = version;
    }

    public List<ParseResult> getResults() {
        return this.results;
    }

    /**
     * Replaces the result list; {@code null} resets it to a new empty list.
     *
     * @param results new results, may be {@code null}
     */
    public void setResults(List<ParseResult> results) {
        if (results == null) {
            this.results = new ArrayList<>();
        } else {
            this.results = results;
        }
    }
}

View File

@@ -0,0 +1,106 @@
package com.easyagents.document.core.model;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Parse result of a single file.
 *
 * <p>Plain mutable bean. The collection-valued properties and the artifacts
 * holder are never {@code null}: their setters substitute a fresh empty
 * instance when given {@code null}. Non-null values are stored by reference,
 * not copied.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class ParseResult {

    private String fileName;
    private String plainText;
    private String markdown;
    private List<DocumentPage> pages = new ArrayList<>();
    private List<DocumentBlock> blocks = new ArrayList<>();
    private List<DocumentTable> tables = new ArrayList<>();
    private List<DocumentImage> images = new ArrayList<>();
    private List<String> warnings = new ArrayList<>();
    private Map<String, Object> metadata = new LinkedHashMap<>();
    private ParseArtifacts artifacts = new ParseArtifacts();

    public String getFileName() {
        return this.fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public String getPlainText() {
        return this.plainText;
    }

    public void setPlainText(String plainText) {
        this.plainText = plainText;
    }

    public String getMarkdown() {
        return this.markdown;
    }

    public void setMarkdown(String markdown) {
        this.markdown = markdown;
    }

    public List<DocumentPage> getPages() {
        return this.pages;
    }

    /**
     * Replaces the pages; {@code null} resets them to a new empty list.
     *
     * @param pages new pages, may be {@code null}
     */
    public void setPages(List<DocumentPage> pages) {
        if (pages == null) {
            this.pages = new ArrayList<>();
        } else {
            this.pages = pages;
        }
    }

    public List<DocumentBlock> getBlocks() {
        return this.blocks;
    }

    /**
     * Replaces the blocks; {@code null} resets them to a new empty list.
     *
     * @param blocks new blocks, may be {@code null}
     */
    public void setBlocks(List<DocumentBlock> blocks) {
        if (blocks == null) {
            this.blocks = new ArrayList<>();
        } else {
            this.blocks = blocks;
        }
    }

    public List<DocumentTable> getTables() {
        return this.tables;
    }

    /**
     * Replaces the tables; {@code null} resets them to a new empty list.
     *
     * @param tables new tables, may be {@code null}
     */
    public void setTables(List<DocumentTable> tables) {
        if (tables == null) {
            this.tables = new ArrayList<>();
        } else {
            this.tables = tables;
        }
    }

    public List<DocumentImage> getImages() {
        return this.images;
    }

    /**
     * Replaces the images; {@code null} resets them to a new empty list.
     *
     * @param images new images, may be {@code null}
     */
    public void setImages(List<DocumentImage> images) {
        if (images == null) {
            this.images = new ArrayList<>();
        } else {
            this.images = images;
        }
    }

    public List<String> getWarnings() {
        return this.warnings;
    }

    /**
     * Replaces the warnings; {@code null} resets them to a new empty list.
     *
     * @param warnings new warnings, may be {@code null}
     */
    public void setWarnings(List<String> warnings) {
        if (warnings == null) {
            this.warnings = new ArrayList<>();
        } else {
            this.warnings = warnings;
        }
    }

    public Map<String, Object> getMetadata() {
        return this.metadata;
    }

    /**
     * Replaces the metadata map; {@code null} resets it to a new empty
     * insertion-ordered map.
     *
     * @param metadata new metadata, may be {@code null}
     */
    public void setMetadata(Map<String, Object> metadata) {
        if (metadata == null) {
            this.metadata = new LinkedHashMap<>();
        } else {
            this.metadata = metadata;
        }
    }

    public ParseArtifacts getArtifacts() {
        return this.artifacts;
    }

    /**
     * Replaces the artifacts holder; {@code null} resets it to a new empty holder.
     *
     * @param artifacts new artifacts, may be {@code null}
     */
    public void setArtifacts(ParseArtifacts artifacts) {
        if (artifacts == null) {
            this.artifacts = new ParseArtifacts();
        } else {
            this.artifacts = artifacts;
        }
    }
}

View File

@@ -0,0 +1,58 @@
package com.easyagents.document.core.model;
/**
 * Aggregated query result of an asynchronous task.
 *
 * <p>Builds on the task status fields and optionally carries the final parse
 * result. While the task has not finished only status information is returned;
 * once it has finished the result content can be returned alongside it.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class ParseTaskInfo extends ParseTaskStatus {

    private ParseResponse result;

    /**
     * Creates an aggregated query result from a task status.
     *
     * <p>Every status field is copied through its setter; the file name list is
     * passed on as the same list instance returned by the source status. A
     * {@code null} status yields an instance with nothing copied.</p>
     *
     * @param status task status, may be {@code null}
     * @return aggregated query result, never {@code null}
     */
    public static ParseTaskInfo fromStatus(ParseTaskStatus status) {
        ParseTaskInfo aggregated = new ParseTaskInfo();
        if (status != null) {
            aggregated.setTaskId(status.getTaskId());
            aggregated.setStatus(status.getStatus());
            aggregated.setBackend(status.getBackend());
            aggregated.setFileNames(status.getFileNames());
            aggregated.setCreatedAt(status.getCreatedAt());
            aggregated.setStartedAt(status.getStartedAt());
            aggregated.setCompletedAt(status.getCompletedAt());
            aggregated.setError(status.getError());
            aggregated.setStatusUrl(status.getStatusUrl());
            aggregated.setResultUrl(status.getResultUrl());
            aggregated.setQueuedAhead(status.getQueuedAhead());
        }
        return aggregated;
    }

    /**
     * Returns the final parse result.
     *
     * @return parse result; may be {@code null} while the task has not finished
     */
    public ParseResponse getResult() {
        return this.result;
    }

    /**
     * Sets the final parse result.
     *
     * @param result parse result
     */
    public void setResult(ParseResponse result) {
        this.result = result;
    }
}

View File

@@ -0,0 +1,113 @@
package com.easyagents.document.core.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Status of an asynchronous task.
 *
 * <p>Plain mutable bean. The file name list is never {@code null}: its setter
 * substitutes a fresh empty list when given {@code null}. Timestamps are held
 * as strings exactly as supplied.</p>
 *
 * @author Codex
 * @since 2026-04-14
 */
public class ParseTaskStatus {

    private String taskId;
    private String status;
    private String backend;
    private List<String> fileNames = new ArrayList<>();
    private String createdAt;
    private String startedAt;
    private String completedAt;
    private String error;
    private String statusUrl;
    private String resultUrl;
    private Integer queuedAhead;

    public String getTaskId() {
        return this.taskId;
    }

    public void setTaskId(String taskId) {
        this.taskId = taskId;
    }

    public String getStatus() {
        return this.status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBackend() {
        return this.backend;
    }

    public void setBackend(String backend) {
        this.backend = backend;
    }

    public List<String> getFileNames() {
        return this.fileNames;
    }

    /**
     * Replaces the file names; {@code null} resets them to a new empty list.
     *
     * @param fileNames new file names, may be {@code null}
     */
    public void setFileNames(List<String> fileNames) {
        if (fileNames == null) {
            this.fileNames = new ArrayList<>();
        } else {
            this.fileNames = fileNames;
        }
    }

    public String getCreatedAt() {
        return this.createdAt;
    }

    public void setCreatedAt(String createdAt) {
        this.createdAt = createdAt;
    }

    public String getStartedAt() {
        return this.startedAt;
    }

    public void setStartedAt(String startedAt) {
        this.startedAt = startedAt;
    }

    public String getCompletedAt() {
        return this.completedAt;
    }

    public void setCompletedAt(String completedAt) {
        this.completedAt = completedAt;
    }

    public String getError() {
        return this.error;
    }

    public void setError(String error) {
        this.error = error;
    }

    public String getStatusUrl() {
        return this.statusUrl;
    }

    public void setStatusUrl(String statusUrl) {
        this.statusUrl = statusUrl;
    }

    public String getResultUrl() {
        return this.resultUrl;
    }

    public void setResultUrl(String resultUrl) {
        this.resultUrl = resultUrl;
    }

    public Integer getQueuedAhead() {
        return this.queuedAhead;
    }

    public void setQueuedAhead(Integer queuedAhead) {
        this.queuedAhead = queuedAhead;
    }
}