From a41b50959e7585ccfcd762ba7914f7cb15c378b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=AD=90=E9=BB=98?= <925456043@qq.com> Date: Tue, 14 Apr 2026 19:57:32 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=AE=8C=E6=88=90S02=E6=A1=A5=E6=8E=A5?= =?UTF-8?q?=E8=83=BD=E5=8A=9B=E5=B9=B6=E6=8E=A5=E9=80=9AM09=E5=B7=A5?= =?UTF-8?q?=E4=BD=9C=E6=B5=81=E6=96=87=E6=A1=A3=E8=A7=A3=E6=9E=90=E9=97=AD?= =?UTF-8?q?=E7=8E=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增统一文档解析桥接子域,封装 easy-agents 文档解析门面 - 支持工作流开始节点文件上传与素材选择的单文件对象输入 - DocNode 改为文档解析节点,PDF 走统一解析,非 PDF 保持默认读取 --- easyflow-modules/easyflow-module-ai/pom.xml | 4 + .../DocumentParseBridgeException.java | 81 +++++ .../model/DocumentParseArtifacts.java | 63 ++++ .../document/model/DocumentParseScenario.java | 28 ++ .../document/model/DocumentParseTaskInfo.java | 33 +++ .../model/DocumentParseTaskStatus.java | 113 +++++++ .../document/model/DocumentParsedResult.java | 120 ++++++++ .../ai/document/model/DocumentSourceRef.java | 106 +++++++ .../service/DocumentParseBridgeService.java | 67 +++++ .../impl/DocumentParseBridgeServiceImpl.java | 170 +++++++++++ .../support/DocumentParseRequestFactory.java | 71 +++++ .../support/DocumentParseResultMapper.java | 128 ++++++++ .../support/DocumentSourceLoader.java | 146 +++++++++ .../support/LoadedDocumentSource.java | 49 +++ .../java/tech/easyflow/ai/node/DocNode.java | 27 +- .../ai/node/DocNodeFileContentExtractor.java | 170 +++++++++++ .../DocumentParseBridgeServiceImplTest.java | 190 ++++++++++++ .../DocumentParseRequestFactoryTest.java | 57 ++++ .../DocumentParseResultMapperTest.java | 82 ++++++ .../support/DocumentSourceLoaderTest.java | 133 +++++++++ .../node/DocNodeFileContentExtractorTest.java | 278 ++++++++++++++++++ .../src/main/resources/application.yml | 13 +- .../src/locales/langs/en-US/aiWorkflow.json | 6 +- .../src/locales/langs/zh-CN/aiWorkflow.json | 6 +- .../workflow/components/WorkflowFileInput.vue | 161 
++++++++++ .../workflow/components/WorkflowFormItem.vue | 20 +- .../__tests__/workflowFileValue.test.ts | 52 ++++ .../workflow/components/workflowFileValue.ts | 109 +++++++ .../ai/workflow/customNode/documentNode.ts | 7 +- pom.xml | 5 + 30 files changed, 2475 insertions(+), 20 deletions(-) create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/exception/DocumentParseBridgeException.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseArtifacts.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseScenario.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskInfo.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskStatus.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParsedResult.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentSourceRef.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/DocumentParseBridgeService.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseRequestFactory.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseResultMapper.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentSourceLoader.java create mode 100644 
easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/LoadedDocumentSource.java create mode 100644 easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java create mode 100644 easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java create mode 100644 easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseRequestFactoryTest.java create mode 100644 easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseResultMapperTest.java create mode 100644 easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentSourceLoaderTest.java create mode 100644 easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java create mode 100644 easyflow-ui-admin/app/src/views/ai/workflow/components/WorkflowFileInput.vue create mode 100644 easyflow-ui-admin/app/src/views/ai/workflow/components/__tests__/workflowFileValue.test.ts create mode 100644 easyflow-ui-admin/app/src/views/ai/workflow/components/workflowFileValue.ts diff --git a/easyflow-modules/easyflow-module-ai/pom.xml b/easyflow-modules/easyflow-module-ai/pom.xml index c177e1d..c164ae9 100644 --- a/easyflow-modules/easyflow-module-ai/pom.xml +++ b/easyflow-modules/easyflow-module-ai/pom.xml @@ -41,6 +41,10 @@ com.easyagents easy-agents-support + + com.easyagents + easy-agents-document-core + com.easyagents easy-agents-rag-retrieval diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/exception/DocumentParseBridgeException.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/exception/DocumentParseBridgeException.java new file mode 100644 index 0000000..2108d6f --- /dev/null +++ 
b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/exception/DocumentParseBridgeException.java @@ -0,0 +1,81 @@ +package tech.easyflow.ai.document.exception; + +/** + * 文档解析桥接层异常。 + * + *

桥接层负责把底层文档服务异常和文件加载异常转换为稳定语义, + * 供上层业务按场景进行处理。

+ * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentParseBridgeException extends RuntimeException { + + private final String code; + + public DocumentParseBridgeException(String code, String message) { + super(message); + this.code = code; + } + + public DocumentParseBridgeException(String code, String message, Throwable cause) { + super(message, cause); + this.code = code; + } + + /** + * 获取稳定错误码。 + * + * @return 错误码 + */ + public String getCode() { + return code; + } + + public static DocumentParseBridgeException serviceNotEnabled() { + return new DocumentParseBridgeException( + "service_not_enabled", + "统一文档解析服务未启用,请先配置 easy-agents.document.pdf.provider" + ); + } + + public static DocumentParseBridgeException unsupportedSource(String message) { + return new DocumentParseBridgeException("unsupported_source", message); + } + + public static DocumentParseBridgeException sourceLoadFailed(String message, Throwable cause) { + return new DocumentParseBridgeException("source_load_failed", message, cause); + } + + public static DocumentParseBridgeException requestBuildFailed(String message, Throwable cause) { + return new DocumentParseBridgeException("request_build_failed", message, cause); + } + + public static DocumentParseBridgeException requestBuildFailed(String message) { + return new DocumentParseBridgeException("request_build_failed", message); + } + + public static DocumentParseBridgeException parseFailed(String message, Throwable cause) { + return new DocumentParseBridgeException("parse_failed", message, cause); + } + + public static DocumentParseBridgeException parseFailed(String message) { + return new DocumentParseBridgeException("parse_failed", message); + } + + public static DocumentParseBridgeException taskFailed(String message, Throwable cause) { + return new DocumentParseBridgeException("task_failed", message, cause); + } + + public static DocumentParseBridgeException taskFailed(String message) { + return new 
DocumentParseBridgeException("task_failed", message); + } + + public static DocumentParseBridgeException resultFetchFailed(String message, Throwable cause) { + return new DocumentParseBridgeException("result_fetch_failed", message, cause); + } + + public static DocumentParseBridgeException resultFetchFailed(String message) { + return new DocumentParseBridgeException("result_fetch_failed", message); + } +} diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseArtifacts.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseArtifacts.java new file mode 100644 index 0000000..269e258 --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseArtifacts.java @@ -0,0 +1,63 @@ +package tech.easyflow.ai.document.model; + +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * 文档解析工件。 + * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentParseArtifacts { + + private Object middleJson; + private Object contentList; + private Object modelOutput; + private Map extraJsonArtifacts = new LinkedHashMap(); + private Map extraBinaryArtifacts = new LinkedHashMap(); + + public Object getMiddleJson() { + return middleJson; + } + + public void setMiddleJson(Object middleJson) { + this.middleJson = middleJson; + } + + public Object getContentList() { + return contentList; + } + + public void setContentList(Object contentList) { + this.contentList = contentList; + } + + public Object getModelOutput() { + return modelOutput; + } + + public void setModelOutput(Object modelOutput) { + this.modelOutput = modelOutput; + } + + public Map getExtraJsonArtifacts() { + return extraJsonArtifacts; + } + + public void setExtraJsonArtifacts(Map extraJsonArtifacts) { + this.extraJsonArtifacts = extraJsonArtifacts == null + ? 
new LinkedHashMap() + : extraJsonArtifacts; + } + + public Map getExtraBinaryArtifacts() { + return extraBinaryArtifacts; + } + + public void setExtraBinaryArtifacts(Map extraBinaryArtifacts) { + this.extraBinaryArtifacts = extraBinaryArtifacts == null + ? new LinkedHashMap() + : extraBinaryArtifacts; + } +} diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseScenario.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseScenario.java new file mode 100644 index 0000000..447b13d --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseScenario.java @@ -0,0 +1,28 @@ +package tech.easyflow.ai.document.model; + +/** + * 文档解析场景预设。 + * + *

场景由 easyflow 业务层传入,桥接层负责将场景映射为底层解析请求参数, + * 避免业务模块直接感知多个布尔开关。

+ * + * @author Codex + * @since 2026-04-14 + */ +public enum DocumentParseScenario { + + /** + * 工作流文本提取场景,仅要求尽快返回可直接消费的文本结果。 + */ + WORKFLOW_TEXT, + + /** + * 知识库导入场景,需要保留更多结构化工件供后续分块分析使用。 + */ + KNOWLEDGE_IMPORT, + + /** + * 尽可能返回完整工件,供后续高级消费场景使用。 + */ + FULL_ARTIFACTS +} diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskInfo.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskInfo.java new file mode 100644 index 0000000..a2b163c --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskInfo.java @@ -0,0 +1,33 @@ +package tech.easyflow.ai.document.model; + +/** + * 任务聚合查询结果。 + * + *

该对象在任务状态基础上按需附带最终解析结果。 + * 当任务尚未完成时仅返回状态信息;当任务已完成时可同时返回标准化结果。

+ * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentParseTaskInfo extends DocumentParseTaskStatus { + + private DocumentParsedResult result; + + /** + * 获取标准化解析结果。 + * + * @return 解析结果;任务未完成时可能为空 + */ + public DocumentParsedResult getResult() { + return result; + } + + /** + * 设置标准化解析结果。 + * + * @param result 解析结果 + */ + public void setResult(DocumentParsedResult result) { + this.result = result; + } +} diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskStatus.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskStatus.java new file mode 100644 index 0000000..580dc95 --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskStatus.java @@ -0,0 +1,113 @@ +package tech.easyflow.ai.document.model; + +import java.util.ArrayList; +import java.util.List; + +/** + * 标准化异步解析任务状态。 + * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentParseTaskStatus { + + private String taskId; + private String status; + private String backend; + private List fileNames = new ArrayList(); + private String createdAt; + private String startedAt; + private String completedAt; + private String error; + private String statusUrl; + private String resultUrl; + private Integer queuedAhead; + + public String getTaskId() { + return taskId; + } + + public void setTaskId(String taskId) { + this.taskId = taskId; + } + + public String getStatus() { + return status; + } + + public void setStatus(String status) { + this.status = status; + } + + public String getBackend() { + return backend; + } + + public void setBackend(String backend) { + this.backend = backend; + } + + public List getFileNames() { + return fileNames; + } + + public void setFileNames(List fileNames) { + this.fileNames = fileNames == null ? 
new ArrayList() : fileNames; + } + + public String getCreatedAt() { + return createdAt; + } + + public void setCreatedAt(String createdAt) { + this.createdAt = createdAt; + } + + public String getStartedAt() { + return startedAt; + } + + public void setStartedAt(String startedAt) { + this.startedAt = startedAt; + } + + public String getCompletedAt() { + return completedAt; + } + + public void setCompletedAt(String completedAt) { + this.completedAt = completedAt; + } + + public String getError() { + return error; + } + + public void setError(String error) { + this.error = error; + } + + public String getStatusUrl() { + return statusUrl; + } + + public void setStatusUrl(String statusUrl) { + this.statusUrl = statusUrl; + } + + public String getResultUrl() { + return resultUrl; + } + + public void setResultUrl(String resultUrl) { + this.resultUrl = resultUrl; + } + + public Integer getQueuedAhead() { + return queuedAhead; + } + + public void setQueuedAhead(Integer queuedAhead) { + this.queuedAhead = queuedAhead; + } +} diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParsedResult.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParsedResult.java new file mode 100644 index 0000000..0f5ed6e --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParsedResult.java @@ -0,0 +1,120 @@ +package tech.easyflow.ai.document.model; + +import com.easyagents.document.core.model.DocumentBlock; +import com.easyagents.document.core.model.DocumentImage; +import com.easyagents.document.core.model.DocumentPage; +import com.easyagents.document.core.model.DocumentTable; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * 标准化单文档解析结果。 + * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentParsedResult { + + private String fileName; + private String 
preferredText; + private String markdown; + private String plainText; + private List pages = new ArrayList(); + private List blocks = new ArrayList(); + private List tables = new ArrayList(); + private List images = new ArrayList(); + private List warnings = new ArrayList(); + private Map metadata = new LinkedHashMap(); + private DocumentParseArtifacts artifacts = new DocumentParseArtifacts(); + + public String getFileName() { + return fileName; + } + + public void setFileName(String fileName) { + this.fileName = fileName; + } + + public String getPreferredText() { + return preferredText; + } + + public void setPreferredText(String preferredText) { + this.preferredText = preferredText; + } + + public String getMarkdown() { + return markdown; + } + + public void setMarkdown(String markdown) { + this.markdown = markdown; + } + + public String getPlainText() { + return plainText; + } + + public void setPlainText(String plainText) { + this.plainText = plainText; + } + + public List getPages() { + return pages; + } + + public void setPages(List pages) { + this.pages = pages == null ? new ArrayList() : pages; + } + + public List getBlocks() { + return blocks; + } + + public void setBlocks(List blocks) { + this.blocks = blocks == null ? new ArrayList() : blocks; + } + + public List getTables() { + return tables; + } + + public void setTables(List tables) { + this.tables = tables == null ? new ArrayList() : tables; + } + + public List getImages() { + return images; + } + + public void setImages(List images) { + this.images = images == null ? new ArrayList() : images; + } + + public List getWarnings() { + return warnings; + } + + public void setWarnings(List warnings) { + this.warnings = warnings == null ? new ArrayList() : warnings; + } + + public Map getMetadata() { + return metadata; + } + + public void setMetadata(Map metadata) { + this.metadata = metadata == null ? 
new LinkedHashMap() : metadata; + } + + public DocumentParseArtifacts getArtifacts() { + return artifacts; + } + + public void setArtifacts(DocumentParseArtifacts artifacts) { + this.artifacts = artifacts == null ? new DocumentParseArtifacts() : artifacts; + } +} diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentSourceRef.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentSourceRef.java new file mode 100644 index 0000000..f83dede --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentSourceRef.java @@ -0,0 +1,106 @@ +package tech.easyflow.ai.document.model; + +/** + * 统一文档源引用。 + * + *

该模型用于屏蔽业务模块和底层解析框架的文件输入差异,业务方只需要描述文件来自哪里, + * 具体由桥接层负责加载字节内容并转成统一解析请求。

+ * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentSourceRef { + + private String fileName; + private String filePath; + private String contentType; + private Long size; + private String url; + private byte[] contentBytes; + + /** + * 创建基于文件存储路径的文档源。 + * + * @param filePath 存储路径 + * @return 文档源 + */ + public static DocumentSourceRef ofPath(String filePath) { + DocumentSourceRef sourceRef = new DocumentSourceRef(); + sourceRef.setFilePath(filePath); + return sourceRef; + } + + /** + * 创建基于内存字节的文档源。 + * + * @param fileName 文件名 + * @param contentBytes 文件字节 + * @return 文档源 + */ + public static DocumentSourceRef ofBytes(String fileName, byte[] contentBytes) { + DocumentSourceRef sourceRef = new DocumentSourceRef(); + sourceRef.setFileName(fileName); + sourceRef.setContentBytes(contentBytes); + return sourceRef; + } + + /** + * 创建基于 URL 的文档源。 + * + * @param url 文件 URL + * @return 文档源 + */ + public static DocumentSourceRef ofUrl(String url) { + DocumentSourceRef sourceRef = new DocumentSourceRef(); + sourceRef.setUrl(url); + return sourceRef; + } + + public String getFileName() { + return fileName; + } + + public void setFileName(String fileName) { + this.fileName = fileName; + } + + public String getFilePath() { + return filePath; + } + + public void setFilePath(String filePath) { + this.filePath = filePath; + } + + public String getContentType() { + return contentType; + } + + public void setContentType(String contentType) { + this.contentType = contentType; + } + + public Long getSize() { + return size; + } + + public void setSize(Long size) { + this.size = size; + } + + public String getUrl() { + return url; + } + + public void setUrl(String url) { + this.url = url; + } + + public byte[] getContentBytes() { + return contentBytes; + } + + public void setContentBytes(byte[] contentBytes) { + this.contentBytes = contentBytes; + } +} diff --git 
a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/DocumentParseBridgeService.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/DocumentParseBridgeService.java new file mode 100644 index 0000000..88af40a --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/DocumentParseBridgeService.java @@ -0,0 +1,67 @@ +package tech.easyflow.ai.document.service; + +import tech.easyflow.ai.document.model.DocumentParseScenario; +import tech.easyflow.ai.document.model.DocumentSourceRef; +import tech.easyflow.ai.document.model.DocumentParseTaskInfo; +import tech.easyflow.ai.document.model.DocumentParseTaskStatus; +import tech.easyflow.ai.document.model.DocumentParsedResult; + +/** + * 统一文档解析桥接门面。 + * + *

业务模块通过该门面使用文档解析能力,而不是直接依赖 easy-agents 的原始请求和结果模型。

+ * + * @author Codex + * @since 2026-04-14 + */ +public interface DocumentParseBridgeService { + + /** + * 同步解析单文档。 + * + * @param source 文档源 + * @param scenario 解析场景 + * @return 标准化解析结果 + */ + DocumentParsedResult parse(DocumentSourceRef source, DocumentParseScenario scenario); + + /** + * 异步提交单文档解析任务。 + * + * @param source 文档源 + * @param scenario 解析场景 + * @return 异步任务状态 + */ + DocumentParseTaskStatus submit(DocumentSourceRef source, DocumentParseScenario scenario); + + /** + * 查询异步任务状态。 + * + * @param taskId 任务 ID + * @return 异步任务状态 + */ + DocumentParseTaskStatus queryTask(String taskId); + + /** + * 获取异步任务最终结果。 + * + *

该方法面向“结果读取”语义,底层 provider 可能在内部等待任务完成后再返回最终结果, + * 因此不适合直接作为轻量状态轮询接口;如果业务需要统一查看“当前状态 + 已完成结果”, + * 应优先使用 {@link #queryTaskInfo(String)}。

+ * + * @param taskId 任务 ID + * @return 标准化解析结果 + */ + DocumentParsedResult queryResult(String taskId); + + /** + * 聚合查询异步任务信息。 + * + *

当任务仍在处理中时仅返回状态;当任务已完成时会附带标准化结果。 + * 该方法适合用于页面或业务侧统一读取“当前状态 + 可用结果”。

+ * + * @param taskId 任务 ID + * @return 聚合任务信息 + */ + DocumentParseTaskInfo queryTaskInfo(String taskId); +} diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java new file mode 100644 index 0000000..c9503ac --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java @@ -0,0 +1,170 @@ +package tech.easyflow.ai.document.service.impl; + +import com.easyagents.document.core.DocumentParseService; +import com.easyagents.document.core.model.ParseResponse; +import com.easyagents.document.core.model.ParseResult; +import com.easyagents.document.core.model.ParseTaskInfo; +import com.easyagents.document.core.model.ParseTaskStatus; +import org.springframework.lang.Nullable; +import org.springframework.stereotype.Service; +import org.springframework.util.StringUtils; +import tech.easyflow.ai.document.exception.DocumentParseBridgeException; +import tech.easyflow.ai.document.model.DocumentParseScenario; +import tech.easyflow.ai.document.model.DocumentSourceRef; +import tech.easyflow.ai.document.model.DocumentParseTaskInfo; +import tech.easyflow.ai.document.model.DocumentParseTaskStatus; +import tech.easyflow.ai.document.model.DocumentParsedResult; +import tech.easyflow.ai.document.service.DocumentParseBridgeService; +import tech.easyflow.ai.document.support.DocumentSourceLoader; +import tech.easyflow.ai.document.support.DocumentParseRequestFactory; +import tech.easyflow.ai.document.support.DocumentParseResultMapper; +import tech.easyflow.ai.document.support.LoadedDocumentSource; +import tech.easyflow.ai.utils.DocUtil; + +/** + * 统一文档解析桥接门面默认实现。 + * + * @author Codex + * @since 2026-04-14 + */ +@Service +public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeService { + + 
@Nullable + private final DocumentParseService documentParseService; + private final DocumentSourceLoader documentSourceLoader; + private final DocumentParseRequestFactory parseRequestFactory; + private final DocumentParseResultMapper parseResultMapper; + + public DocumentParseBridgeServiceImpl(@Nullable DocumentParseService documentParseService, + DocumentSourceLoader documentSourceLoader, + DocumentParseRequestFactory parseRequestFactory, + DocumentParseResultMapper parseResultMapper) { + this.documentParseService = documentParseService; + this.documentSourceLoader = documentSourceLoader; + this.parseRequestFactory = parseRequestFactory; + this.parseResultMapper = parseResultMapper; + } + + /** + * {@inheritDoc} + */ + @Override + public DocumentParsedResult parse(DocumentSourceRef source, DocumentParseScenario scenario) { + try { + LoadedDocumentSource loadedSource = preparePdfSource(source); + ParseResponse response = requireService().parse(parseRequestFactory.build(loadedSource, scenario)); + return parseResultMapper.map(extractSingleResult(response, false)); + } catch (DocumentParseBridgeException e) { + throw e; + } catch (Exception e) { + throw DocumentParseBridgeException.parseFailed("同步文档解析失败", e); + } + } + + /** + * {@inheritDoc} + */ + @Override + public DocumentParseTaskStatus submit(DocumentSourceRef source, DocumentParseScenario scenario) { + try { + LoadedDocumentSource loadedSource = preparePdfSource(source); + ParseTaskStatus taskStatus = requireService().submit(parseRequestFactory.build(loadedSource, scenario)); + return parseResultMapper.map(taskStatus); + } catch (DocumentParseBridgeException e) { + throw e; + } catch (Exception e) { + throw DocumentParseBridgeException.taskFailed("提交异步文档解析任务失败", e); + } + } + + /** + * {@inheritDoc} + */ + @Override + public DocumentParseTaskStatus queryTask(String taskId) { + if (!StringUtils.hasText(taskId)) { + throw DocumentParseBridgeException.taskFailed("taskId 不能为空"); + } + try { + return 
parseResultMapper.map(requireService().queryTask(taskId)); + } catch (DocumentParseBridgeException e) { + throw e; + } catch (Exception e) { + throw DocumentParseBridgeException.taskFailed("查询异步文档解析任务状态失败", e); + } + } + + /** + * {@inheritDoc} + */ + @Override + public DocumentParsedResult queryResult(String taskId) { + if (!StringUtils.hasText(taskId)) { + throw DocumentParseBridgeException.resultFetchFailed("taskId 不能为空"); + } + try { + ParseResponse response = requireService().queryResult(taskId); + return parseResultMapper.map(extractSingleResult(response, true)); + } catch (DocumentParseBridgeException e) { + throw e; + } catch (Exception e) { + throw DocumentParseBridgeException.resultFetchFailed("获取异步文档解析结果失败", e); + } + } + + /** + * {@inheritDoc} + */ + @Override + public DocumentParseTaskInfo queryTaskInfo(String taskId) { + if (!StringUtils.hasText(taskId)) { + throw DocumentParseBridgeException.taskFailed("taskId 不能为空"); + } + try { + ParseTaskInfo taskInfo = requireService().queryTaskInfo(taskId); + return parseResultMapper.map(taskInfo); + } catch (DocumentParseBridgeException e) { + throw e; + } catch (Exception e) { + throw DocumentParseBridgeException.taskFailed("聚合查询异步文档解析任务信息失败", e); + } + } + + private DocumentParseService requireService() { + if (documentParseService == null) { + throw DocumentParseBridgeException.serviceNotEnabled(); + } + return documentParseService; + } + + private LoadedDocumentSource preparePdfSource(DocumentSourceRef source) { + LoadedDocumentSource loadedSource = documentSourceLoader.load(source); + if (!isPdf(loadedSource)) { + throw DocumentParseBridgeException.unsupportedSource("统一文档解析桥接首版仅支持 PDF 文件"); + } + return loadedSource; + } + + private boolean isPdf(LoadedDocumentSource loadedSource) { + String contentType = loadedSource.getContentType(); + if (StringUtils.hasText(contentType) && contentType.toLowerCase().contains("pdf")) { + return true; + } + String fileName = loadedSource.getFileName(); + if 
(!StringUtils.hasText(fileName) || !fileName.contains(".")) { + return false; + } + return "pdf".equals(DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName))); + } + + private ParseResult extractSingleResult(ParseResponse response, boolean resultFetchPhase) { + if (response == null || response.getResults() == null || response.getResults().isEmpty()) { + if (resultFetchPhase) { + throw DocumentParseBridgeException.resultFetchFailed("异步文档解析结果为空"); + } + throw DocumentParseBridgeException.parseFailed("同步文档解析结果为空"); + } + return response.getResults().get(0); + } +} diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseRequestFactory.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseRequestFactory.java new file mode 100644 index 0000000..0b49319 --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseRequestFactory.java @@ -0,0 +1,71 @@ +package tech.easyflow.ai.document.support; + +import com.easyagents.document.core.model.ParseFile; +import com.easyagents.document.core.model.ParseRequest; +import org.springframework.stereotype.Component; +import tech.easyflow.ai.document.exception.DocumentParseBridgeException; +import tech.easyflow.ai.document.model.DocumentParseScenario; + +/** + * easy-agents 解析请求工厂。 + * + *

负责把 easyflow 业务场景预设映射为底层统一解析请求。

+ * + * @author Codex + * @since 2026-04-14 + */ +@Component +public class DocumentParseRequestFactory { + + /** + * 构建统一解析请求。 + * + * @param source 已加载文档源 + * @param scenario 解析场景 + * @return 统一解析请求 + */ + public ParseRequest build(LoadedDocumentSource source, DocumentParseScenario scenario) { + if (source == null || source.getContentBytes() == null || source.getContentBytes().length == 0) { + throw DocumentParseBridgeException.requestBuildFailed("文档源内容为空,无法构建解析请求"); + } + if (scenario == null) { + throw DocumentParseBridgeException.requestBuildFailed("解析场景不能为空"); + } + ParseRequest request = new ParseRequest(); + // 保持为空,交由 easy-agents provider 按环境配置回填默认值。 + request.setParseMethod(null); + request.setFormulaEnabled(null); + request.setTableEnabled(null); + request.addFile(ParseFile.of(source.getFileName(), source.getContentBytes(), source.getContentType())); + applyScenario(request, scenario); + return request; + } + + private void applyScenario(ParseRequest request, DocumentParseScenario scenario) { + switch (scenario) { + case WORKFLOW_TEXT: + request.setReturnMarkdown(Boolean.TRUE); + request.setReturnMiddleJson(Boolean.FALSE); + request.setReturnContentList(Boolean.FALSE); + request.setReturnModelOutput(Boolean.FALSE); + request.setReturnImages(Boolean.FALSE); + break; + case KNOWLEDGE_IMPORT: + request.setReturnMarkdown(Boolean.TRUE); + request.setReturnMiddleJson(Boolean.TRUE); + request.setReturnContentList(Boolean.TRUE); + request.setReturnModelOutput(Boolean.FALSE); + request.setReturnImages(Boolean.TRUE); + break; + case FULL_ARTIFACTS: + request.setReturnMarkdown(Boolean.TRUE); + request.setReturnMiddleJson(Boolean.TRUE); + request.setReturnContentList(Boolean.TRUE); + request.setReturnModelOutput(Boolean.TRUE); + request.setReturnImages(Boolean.TRUE); + break; + default: + throw DocumentParseBridgeException.requestBuildFailed("不支持的文档解析场景: " + scenario); + } + } +} diff --git 
a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseResultMapper.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseResultMapper.java new file mode 100644 index 0000000..d1b4eae --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseResultMapper.java @@ -0,0 +1,128 @@ +package tech.easyflow.ai.document.support; + +import com.easyagents.document.core.model.ParseArtifacts; +import com.easyagents.document.core.model.ParseResult; +import com.easyagents.document.core.model.ParseTaskInfo; +import com.easyagents.document.core.model.ParseTaskStatus; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; +import tech.easyflow.ai.document.model.DocumentParseArtifacts; +import tech.easyflow.ai.document.model.DocumentParseTaskInfo; +import tech.easyflow.ai.document.model.DocumentParseTaskStatus; +import tech.easyflow.ai.document.model.DocumentParsedResult; + +/** + * easy-agents 结果映射器。 + * + *

负责把底层解析结果转换为 easyflow 侧稳定 DTO,并统一 preferredText 规则。

+ *
+ * @author Codex
+ * @since 2026-04-14
+ */
+@Component
+public class DocumentParseResultMapper {
+
+    /**
+     * Maps a single-file parse result onto the easyflow DTO.
+     *
+     * @param parseResult underlying result; may be null
+     * @return mapped result, or an empty DTO when {@code parseResult} is null
+     */
+    public DocumentParsedResult map(ParseResult parseResult) {
+        DocumentParsedResult document = new DocumentParsedResult();
+        if (parseResult == null) {
+            return document;
+        }
+        document.setFileName(parseResult.getFileName());
+        document.setMarkdown(parseResult.getMarkdown());
+        document.setPlainText(parseResult.getPlainText());
+        document.setPreferredText(resolvePreferredText(parseResult));
+        document.setPages(parseResult.getPages());
+        document.setBlocks(parseResult.getBlocks());
+        document.setTables(parseResult.getTables());
+        document.setImages(parseResult.getImages());
+        document.setWarnings(parseResult.getWarnings());
+        document.setMetadata(parseResult.getMetadata());
+        document.setArtifacts(mapArtifacts(parseResult.getArtifacts()));
+        return document;
+    }
+
+    /**
+     * Maps an asynchronous task status.
+     *
+     * @param taskStatus underlying task status; may be null
+     * @return mapped status, or an empty DTO when {@code taskStatus} is null
+     */
+    public DocumentParseTaskStatus map(ParseTaskStatus taskStatus) {
+        DocumentParseTaskStatus status = new DocumentParseTaskStatus();
+        if (taskStatus != null) {
+            // Single source of truth for the field copy, shared with the task-info mapping.
+            fillTaskStatus(status, taskStatus);
+        }
+        return status;
+    }
+
+    /**
+     * Maps the aggregated task query result (status plus optional parse result).
+     *
+     * @param taskInfo underlying aggregated task info; may be null
+     * @return mapped task info, or an empty DTO when {@code taskInfo} is null
+     */
+    public DocumentParseTaskInfo map(ParseTaskInfo taskInfo) {
+        DocumentParseTaskInfo mapped = new DocumentParseTaskInfo();
+        if (taskInfo == null) {
+            return mapped;
+        }
+        fillTaskStatus(mapped, taskInfo);
+        // Only the first entry is mapped; any further results in the response are ignored here.
+        if (taskInfo.getResult() != null
+                && taskInfo.getResult().getResults() != null
+                && !taskInfo.getResult().getResults().isEmpty()) {
+            mapped.setResult(map(taskInfo.getResult().getResults().get(0)));
+        }
+        return mapped;
+    }
+
+    /** Copies every status field from the underlying status onto the easyflow status object. */
+    private void fillTaskStatus(DocumentParseTaskStatus status, ParseTaskStatus taskStatus) {
+        status.setTaskId(taskStatus.getTaskId());
+        status.setStatus(taskStatus.getStatus());
+        status.setBackend(taskStatus.getBackend());
+        status.setFileNames(taskStatus.getFileNames());
+        status.setCreatedAt(taskStatus.getCreatedAt());
+        status.setStartedAt(taskStatus.getStartedAt());
+        status.setCompletedAt(taskStatus.getCompletedAt());
+        status.setError(taskStatus.getError());
+        status.setStatusUrl(taskStatus.getStatusUrl());
+        status.setResultUrl(taskStatus.getResultUrl());
+        status.setQueuedAhead(taskStatus.getQueuedAhead());
+    }
+
+    /** Preferred text rule: markdown when it has text, otherwise the plain text (which may be null). */
+    private String resolvePreferredText(ParseResult parseResult) {
+        if (StringUtils.hasText(parseResult.getMarkdown())) {
+            return parseResult.getMarkdown();
+        }
+        return parseResult.getPlainText();
+    }
+
+    /** Maps the raw artifacts; returns an empty (never null) artifacts DTO when the input is null. */
+    private DocumentParseArtifacts mapArtifacts(ParseArtifacts artifacts) {
+        DocumentParseArtifacts mappedArtifacts = new DocumentParseArtifacts();
+        if (artifacts == null) {
+            return mappedArtifacts;
+        }
+        mappedArtifacts.setMiddleJson(artifacts.getMiddleJson());
+        mappedArtifacts.setContentList(artifacts.getContentList());
+        mappedArtifacts.setModelOutput(artifacts.getModelOutput());
+        mappedArtifacts.setExtraJsonArtifacts(artifacts.getExtraJsonArtifacts());
+        mappedArtifacts.setExtraBinaryArtifacts(artifacts.getExtraBinaryArtifacts());
+        return mappedArtifacts;
+    }
+}
diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentSourceLoader.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentSourceLoader.java new file mode 100644 index 0000000..48f2ecb
--- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentSourceLoader.java @@ -0,0 +1,146 @@ +package tech.easyflow.ai.document.support; + +import cn.hutool.http.HttpUtil; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.stereotype.Component; +import org.springframework.util.StringUtils; +import tech.easyflow.ai.document.exception.DocumentParseBridgeException; +import tech.easyflow.ai.document.model.DocumentSourceRef; +import tech.easyflow.common.filestorage.FileStorageService; +import tech.easyflow.common.filestorage.utils.PathGeneratorUtil; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URLConnection; + +/** + * 文档源加载器。 + * + *

负责把不同来源的文件引用统一转换为内存字节和标准文件元数据。

+ *
+ * @author Codex
+ * @since 2026-04-14
+ */
+@Component
+public class DocumentSourceLoader {
+
+    private final FileStorageService fileStorageService;
+
+    public DocumentSourceLoader(@Qualifier("default") FileStorageService fileStorageService) {
+        this.fileStorageService = fileStorageService;
+    }
+
+    /**
+     * Loads a document source into memory.
+     *
+     * <p>Resolution order: inline bytes, then {@code filePath} (downloaded when it is an
+     * http/https URL, otherwise read from file storage), then {@code url}.
+     *
+     * @param sourceRef easyflow document source
+     * @return loaded document with file name, content type, size and bytes
+     */
+    public LoadedDocumentSource load(DocumentSourceRef sourceRef) {
+        if (sourceRef == null) {
+            throw DocumentParseBridgeException.unsupportedSource("文档源不能为空");
+        }
+        if (hasContentBytes(sourceRef)) {
+            String fileName = resolveFileName(sourceRef);
+            byte[] contentBytes = sourceRef.getContentBytes();
+            return buildLoadedSource(
+                    fileName,
+                    resolveContentType(sourceRef, fileName),
+                    resolveSize(sourceRef, contentBytes.length),
+                    contentBytes
+            );
+        }
+        String filePath = sourceRef.getFilePath();
+        if (StringUtils.hasText(filePath)) {
+            return isRemoteUrl(filePath)
+                    ? loadFromRemoteValue(sourceRef, filePath)
+                    : loadFromFilePath(sourceRef);
+        }
+        if (StringUtils.hasText(sourceRef.getUrl())) {
+            return loadFromUrl(sourceRef);
+        }
+        throw DocumentParseBridgeException.unsupportedSource("文档源缺少 filePath、url 或 contentBytes");
+    }
+
+    /** Reads the whole file from file storage; I/O failures are wrapped as a source-load failure. */
+    private LoadedDocumentSource loadFromFilePath(DocumentSourceRef sourceRef) {
+        String fileName = resolveFileName(sourceRef);
+        try (InputStream inputStream = fileStorageService.readStream(sourceRef.getFilePath())) {
+            if (inputStream == null) {
+                // Surface a missing stream through the same wrapped failure instead of a bare NPE.
+                throw new IOException("file storage returned no stream");
+            }
+            byte[] contentBytes = inputStream.readAllBytes();
+            // Fallback size is the number of bytes actually read, matching the bytes/remote branches.
+            return buildLoadedSource(
+                    fileName,
+                    resolveContentType(sourceRef, fileName),
+                    resolveSize(sourceRef, contentBytes.length),
+                    contentBytes
+            );
+        } catch (IOException e) {
+            throw DocumentParseBridgeException.sourceLoadFailed(
+                    "读取文档存储文件失败: " + sourceRef.getFilePath(),
+                    e
+            );
+        }
+    }
+
+    private LoadedDocumentSource loadFromUrl(DocumentSourceRef sourceRef) {
+        return loadFromRemoteValue(sourceRef, sourceRef.getUrl());
+    }
+
+    /**
+     * Downloads the document from an http/https URL.
+     *
+     * <p>NOTE(review): the URL comes from the source reference and is fetched as-is, with no host
+     * allow-list or size cap visible here; confirm callers restrict it (SSRF / memory exhaustion).
+     */
+    private LoadedDocumentSource loadFromRemoteValue(DocumentSourceRef sourceRef, String remoteUrl) {
+        String fileName = resolveFileName(sourceRef);
+        try {
+            byte[] contentBytes = HttpUtil.downloadBytes(remoteUrl);
+            return buildLoadedSource(
+                    fileName,
+                    resolveContentType(sourceRef, fileName),
+                    resolveSize(sourceRef, contentBytes.length),
+                    contentBytes
+            );
+        } catch (Exception e) {
+            throw DocumentParseBridgeException.sourceLoadFailed(
+                    "下载文档 URL 失败: " + remoteUrl,
+                    e
+            );
+        }
+    }
+
+    private LoadedDocumentSource buildLoadedSource(String fileName, String contentType, Long size, byte[] contentBytes) {
+        LoadedDocumentSource loadedSource = new LoadedDocumentSource();
+        loadedSource.setFileName(fileName);
+        loadedSource.setContentType(contentType);
+        loadedSource.setSize(size);
+        loadedSource.setContentBytes(contentBytes);
+        return loadedSource;
+    }
+
+    /**
+     * Resolves the file name: explicit name first, then the name derived from {@code filePath},
+     * then the name derived from {@code url}. URL-derived names have the query string and
+     * fragment removed, so "demo.pdf?token=x" resolves to "demo.pdf".
+     */
+    private String resolveFileName(DocumentSourceRef sourceRef) {
+        if (StringUtils.hasText(sourceRef.getFileName())) {
+            return PathGeneratorUtil.getPureFileName(sourceRef.getFileName());
+        }
+        String filePath = sourceRef.getFilePath();
+        if (StringUtils.hasText(filePath)) {
+            // A remote filePath is downloaded like a URL, so its name must be cleaned like one too.
+            return PathGeneratorUtil.getPureFileName(
+                    isRemoteUrl(filePath) ? stripQueryAndFragment(filePath) : filePath);
+        }
+        if (StringUtils.hasText(sourceRef.getUrl())) {
+            return PathGeneratorUtil.getPureFileName(stripQueryAndFragment(sourceRef.getUrl()));
+        }
+        throw DocumentParseBridgeException.unsupportedSource("文档源缺少可用文件名");
+    }
+
+    /** Cuts the value at the first '?' or '#', whichever comes first. */
+    private String stripQueryAndFragment(String value) {
+        int end = value.length();
+        int queryIndex = value.indexOf('?');
+        if (queryIndex >= 0) {
+            end = queryIndex;
+        }
+        int fragmentIndex = value.indexOf('#');
+        if (fragmentIndex >= 0 && fragmentIndex < end) {
+            end = fragmentIndex;
+        }
+        return value.substring(0, end);
+    }
+
+    /** Explicit content type wins; otherwise it is guessed from the file name (may be null). */
+    private String resolveContentType(DocumentSourceRef sourceRef, String fileName) {
+        if (StringUtils.hasText(sourceRef.getContentType())) {
+            return sourceRef.getContentType();
+        }
+        return URLConnection.guessContentTypeFromName(fileName);
+    }
+
+    /** Declared size wins; otherwise the supplied fallback is used. */
+    private Long resolveSize(DocumentSourceRef sourceRef, long fallbackSize) {
+        return sourceRef.getSize() != null ? sourceRef.getSize() : fallbackSize;
+    }
+
+    private boolean hasContentBytes(DocumentSourceRef sourceRef) {
+        return sourceRef.getContentBytes() != null && sourceRef.getContentBytes().length > 0;
+    }
+
+    /** True for http/https values; the scheme is matched case-insensitively. */
+    private boolean isRemoteUrl(String value) {
+        return value.regionMatches(true, 0, "http://", 0, 7)
+                || value.regionMatches(true, 0, "https://", 0, 8);
+    }
+}
diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/LoadedDocumentSource.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/LoadedDocumentSource.java new file mode 100644 index 0000000..ee81be7 --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/LoadedDocumentSource.java @@ -0,0 +1,49 @@ +package tech.easyflow.ai.document.support; + +/** + * 桥接层内部已加载文档源。 + * + *

该对象只在桥接层内部流转,用于承接已解析出的文件名、类型和字节内容。

+ *
+ * @author Codex
+ * @since 2026-04-14
+ */
+public class LoadedDocumentSource {
+
+    // File name of the loaded document.
+    private String fileName;
+    // Content type of the loaded document; may be null.
+    private String contentType;
+    // Size of the document as recorded by the loader.
+    private Long size;
+    // Raw document bytes; stored and returned by reference (no defensive copy).
+    private byte[] contentBytes;
+
+    public String getFileName() {
+        return fileName;
+    }
+
+    public void setFileName(String fileName) {
+        this.fileName = fileName;
+    }
+
+    public String getContentType() {
+        return contentType;
+    }
+
+    public void setContentType(String contentType) {
+        this.contentType = contentType;
+    }
+
+    public Long getSize() {
+        return size;
+    }
+
+    public void setSize(Long size) {
+        this.size = size;
+    }
+
+    public byte[] getContentBytes() {
+        return contentBytes;
+    }
+
+    public void setContentBytes(byte[] contentBytes) {
+        this.contentBytes = contentBytes;
+    }
+}
diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java index 7f157cc..8ed4403 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java @@ -4,28 +4,35 @@ import com.easyagents.core.util.StringUtil; import com.easyagents.flow.core.chain.Chain; import com.easyagents.flow.core.chain.Parameter; import com.easyagents.flow.core.node.BaseNode; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import tech.easyflow.ai.utils.DocUtil; import tech.easyflow.common.util.SpringContextUtil; -import java.io.ByteArrayInputStream; import java.util.HashMap; import java.util.List; import java.util.Map; +/** + * 工作流文件内容提取节点。 + * + *

节点输入为统一文件对象,PDF 交给统一文档解析桥接服务, + * 其他类型继续走默认文档读取器。

+ * + * @author Codex + * @since 2026-04-14 + */ public class DocNode extends BaseNode { - private static final Logger log = LoggerFactory.getLogger(DocNode.class); - + /** + * 执行文件内容提取。 + * + * @param chain 当前流程链 + * @return 节点输出 + */ @Override public Map execute(Chain chain) { Map map = chain.getState().resolveParameters(this); Map res = new HashMap<>(); - String url = map.get("fileUrl").toString(); - byte[] bytes = DocUtil.downloadFile(url); - ReaderManager manager = SpringContextUtil.getBean(ReaderManager.class); - String docContent = manager.getReader().read(DocUtil.getFileNameByUrl(url), new ByteArrayInputStream(bytes)); + DocNodeFileContentExtractor extractor = SpringContextUtil.getBean(DocNodeFileContentExtractor.class); + String docContent = extractor.extract(map.get("file")); String key = "content"; List outputDefs = getOutputDefs(); diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java new file mode 100644 index 0000000..b38ee88 --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java @@ -0,0 +1,170 @@ +package tech.easyflow.ai.node; + +import cn.hutool.http.HttpUtil; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.stereotype.Component; +import tech.easyflow.ai.document.model.DocumentParseScenario; +import tech.easyflow.ai.document.model.DocumentParsedResult; +import tech.easyflow.ai.document.model.DocumentSourceRef; +import tech.easyflow.ai.document.service.DocumentParseBridgeService; +import tech.easyflow.ai.utils.DocUtil; +import tech.easyflow.common.filestorage.FileStorageService; +import tech.easyflow.common.util.StringUtil; +import tech.easyflow.common.web.exceptions.BusinessException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; + 
+/** + * {@link DocNode} 文件内容提取器。 + * + *

负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型选择 + * 统一文档解析桥接服务或默认读取器。

+ *
+ * @author Codex
+ * @since 2026-04-14
+ */
+@Component
+public class DocNodeFileContentExtractor {
+
+    private final DocumentParseBridgeService documentParseBridgeService;
+    private final FileStorageService fileStorageService;
+    private final ReaderManager readerManager;
+
+    /**
+     * Creates the file content extractor.
+     *
+     * @param documentParseBridgeService unified document parse bridge service
+     * @param fileStorageService         file storage service
+     * @param readerManager              default reader manager
+     */
+    public DocNodeFileContentExtractor(DocumentParseBridgeService documentParseBridgeService,
+                                       @Qualifier("default") FileStorageService fileStorageService,
+                                       ReaderManager readerManager) {
+        this.documentParseBridgeService = documentParseBridgeService;
+        this.fileStorageService = fileStorageService;
+        this.readerManager = readerManager;
+    }
+
+    /**
+     * Extracts the text content of a workflow file value.
+     *
+     * <p>PDF files go through the document parse bridge; everything else is read by the default reader.
+     *
+     * @param fileValue runtime file value: a {@link DocumentSourceRef} or a map with
+     *                  fileName / filePath / contentType / url / size entries
+     * @return text that downstream nodes can consume directly
+     */
+    public String extract(Object fileValue) {
+        DocumentSourceRef sourceRef = toDocumentSourceRef(fileValue);
+        validateSourceRef(sourceRef);
+        if (isPdf(sourceRef)) {
+            return extractPdfContent(sourceRef);
+        }
+        return extractDefaultContent(sourceRef);
+    }
+
+    /**
+     * Converts the runtime file value into a document source reference.
+     *
+     * @param fileValue runtime file value
+     * @return document source; a {@link DocumentSourceRef} input is returned unchanged
+     */
+    DocumentSourceRef toDocumentSourceRef(Object fileValue) {
+        if (fileValue instanceof DocumentSourceRef sourceRef) {
+            return sourceRef;
+        }
+        if (!(fileValue instanceof Map<?, ?> fileMap)) {
+            throw new BusinessException("文件输入格式不正确,必须为文件对象");
+        }
+        DocumentSourceRef sourceRef = new DocumentSourceRef();
+        sourceRef.setFileName(asText(fileMap.get("fileName")));
+        sourceRef.setFilePath(asText(fileMap.get("filePath")));
+        sourceRef.setContentType(asText(fileMap.get("contentType")));
+        sourceRef.setUrl(asText(fileMap.get("url")));
+        sourceRef.setSize(asLong(fileMap.get("size")));
+        return sourceRef;
+    }
+
+    /** Requires both fileName and filePath to be present. */
+    private void validateSourceRef(DocumentSourceRef sourceRef) {
+        if (sourceRef == null) {
+            throw new BusinessException("文件输入不能为空");
+        }
+        if (!StringUtil.hasText(sourceRef.getFileName())) {
+            throw new BusinessException("文件输入缺少 fileName");
+        }
+        if (!StringUtil.hasText(sourceRef.getFilePath())) {
+            throw new BusinessException("文件输入缺少 filePath");
+        }
+    }
+
+    /** PDF when the content type mentions "pdf" or the normalized file suffix is "pdf". */
+    private boolean isPdf(DocumentSourceRef sourceRef) {
+        String contentType = sourceRef.getContentType();
+        if (StringUtil.hasText(contentType)
+                && contentType.toLowerCase(java.util.Locale.ROOT).contains("pdf")) {
+            return true;
+        }
+        String fileName = sourceRef.getFileName();
+        if (!StringUtil.hasText(fileName) || !fileName.contains(".")) {
+            return false;
+        }
+        return "pdf".equals(DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName)));
+    }
+
+    /** Parses the PDF via the bridge and returns the first non-blank of preferred text, markdown, plain text. */
+    private String extractPdfContent(DocumentSourceRef sourceRef) {
+        DocumentParsedResult parsedResult = documentParseBridgeService.parse(sourceRef, DocumentParseScenario.WORKFLOW_TEXT);
+        if (parsedResult != null) {
+            if (StringUtil.hasText(parsedResult.getPreferredText())) {
+                return parsedResult.getPreferredText();
+            }
+            if (StringUtil.hasText(parsedResult.getMarkdown())) {
+                return parsedResult.getMarkdown();
+            }
+            if (StringUtil.hasText(parsedResult.getPlainText())) {
+                return parsedResult.getPlainText();
+            }
+        }
+        throw new BusinessException("PDF 文档解析结果为空");
+    }
+
+    /** Reads non-PDF content with the default reader. */
+    private String extractDefaultContent(DocumentSourceRef sourceRef) {
+        try (InputStream inputStream = openInputStream(sourceRef)) {
+            return readerManager.getReader().read(sourceRef.getFileName(), inputStream);
+        } catch (IOException e) {
+            throw new RuntimeException("读取文件内容失败: " + sourceRef.getFilePath(), e);
+        }
+    }
+
+    /**
+     * Opens the content stream: a remote filePath is downloaded, a storage filePath is read
+     * from file storage, and url is the last fallback.
+     */
+    private InputStream openInputStream(DocumentSourceRef sourceRef) throws IOException {
+        String filePath = sourceRef.getFilePath();
+        if (StringUtil.hasText(filePath)) {
+            if (isRemoteUrl(filePath)) {
+                return new java.io.ByteArrayInputStream(HttpUtil.downloadBytes(filePath));
+            }
+            return fileStorageService.readStream(filePath);
+        }
+        // Not reachable through extract() today (validation demands filePath); kept as a fallback.
+        if (StringUtil.hasText(sourceRef.getUrl())) {
+            return new java.io.ByteArrayInputStream(HttpUtil.downloadBytes(sourceRef.getUrl()));
+        }
+        throw new IOException("文件输入缺少可读取路径");
+    }
+
+    /** True for http/https values; the scheme is matched case-insensitively. */
+    private boolean isRemoteUrl(String value) {
+        return value.regionMatches(true, 0, "http://", 0, 7)
+                || value.regionMatches(true, 0, "https://", 0, 8);
+    }
+
+    private String asText(Object value) {
+        return value == null ? null : String.valueOf(value).trim();
+    }
+
+    /**
+     * Best-effort conversion of the optional size entry.
+     *
+     * @return the numeric value, or null when it is absent, blank, of another type, or not a valid long
+     */
+    private Long asLong(Object value) {
+        if (value == null) {
+            return null;
+        }
+        if (value instanceof Number number) {
+            return number.longValue();
+        }
+        if (value instanceof String text && StringUtil.hasText(text)) {
+            try {
+                return Long.parseLong(text.trim());
+            } catch (NumberFormatException e) {
+                // Size is optional metadata: a malformed value must not abort the extraction.
+                return null;
+            }
+        }
+        return null;
+    }
+}
diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java new file mode 100644 index 0000000..7854901 --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java @@ -0,0 +1,190 @@ +package tech.easyflow.ai.document.service.impl; + +import com.easyagents.document.core.DocumentParseService; +import com.easyagents.document.core.model.ParseRequest; +import com.easyagents.document.core.model.ParseResponse; +import com.easyagents.document.core.model.ParseResult; +import com.easyagents.document.core.model.ParseTaskInfo; +import com.easyagents.document.core.model.ParseTaskStatus; +import org.junit.Assert; +import org.junit.Test; +import tech.easyflow.ai.document.exception.DocumentParseBridgeException; +import tech.easyflow.ai.document.model.DocumentParseScenario; +import tech.easyflow.ai.document.model.DocumentSourceRef; +import tech.easyflow.ai.document.model.DocumentParseTaskInfo; +import tech.easyflow.ai.document.model.DocumentParseTaskStatus; +import tech.easyflow.ai.document.model.DocumentParsedResult; +import
tech.easyflow.ai.document.support.DocumentSourceLoader;
+import tech.easyflow.ai.document.support.DocumentParseRequestFactory;
+import tech.easyflow.ai.document.support.DocumentParseResultMapper;
+import tech.easyflow.common.filestorage.FileStorageService;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+
+/**
+ * Unit tests for {@link DocumentParseBridgeServiceImpl}.
+ *
+ * @author Codex
+ * @since 2026-04-14
+ */
+public class DocumentParseBridgeServiceImplTest {
+
+    /**
+     * Synchronous parse passes the request through and returns the normalized result.
+     */
+    @Test
+    public void shouldParseSuccessfully() {
+        FakeDocumentParseService parseService = new FakeDocumentParseService();
+        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService);
+
+        DocumentParsedResult document = bridgeService.parse(buildSource(), DocumentParseScenario.WORKFLOW_TEXT);
+
+        Assert.assertEquals("# demo", document.getPreferredText());
+        Assert.assertFalse(parseService.lastParseRequest.getReturnMiddleJson());
+        Assert.assertFalse(parseService.lastParseRequest.getReturnImages());
+    }
+
+    /**
+     * Async submit, status query and result query all work end to end.
+     */
+    @Test
+    public void shouldSupportAsyncFlow() {
+        FakeDocumentParseService parseService = new FakeDocumentParseService();
+        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService);
+
+        DocumentParseTaskStatus taskStatus = bridgeService.submit(buildSource(), DocumentParseScenario.KNOWLEDGE_IMPORT);
+        DocumentParseTaskStatus queriedStatus = bridgeService.queryTask("task-1");
+        DocumentParsedResult queriedResult = bridgeService.queryResult("task-1");
+
+        Assert.assertEquals("task-1", taskStatus.getTaskId());
+        Assert.assertEquals("running", queriedStatus.getStatus());
+        Assert.assertEquals("# demo", queriedResult.getPreferredText());
+    }
+
+    /**
+     * The aggregated query carries the normalized result once the task is completed.
+     */
+    @Test
+    public void shouldQueryTaskInfoSuccessfully() {
+        FakeDocumentParseService parseService = new FakeDocumentParseService();
+        parseService.taskStatusValue = "completed";
+        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService);
+
+        DocumentParseTaskInfo taskInfo = bridgeService.queryTaskInfo("task-1");
+
+        Assert.assertEquals("completed", taskInfo.getStatus());
+        Assert.assertNotNull(taskInfo.getResult());
+        Assert.assertEquals("# demo", taskInfo.getResult().getPreferredText());
+    }
+
+    /**
+     * A missing underlying service yields the stable "service_not_enabled" error code.
+     */
+    @Test
+    public void shouldThrowWhenServiceDisabled() {
+        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null);
+
+        try {
+            bridgeService.parse(buildSource(), DocumentParseScenario.WORKFLOW_TEXT);
+            Assert.fail("expected DocumentParseBridgeException");
+        } catch (DocumentParseBridgeException e) {
+            Assert.assertEquals("service_not_enabled", e.getCode());
+        }
+    }
+
+    // Wires the bridge with real collaborators and an in-memory storage stub.
+    private DocumentParseBridgeServiceImpl buildBridgeService(DocumentParseService parseService) {
+        return new DocumentParseBridgeServiceImpl(
+                parseService,
+                new DocumentSourceLoader(new InMemoryFileStorageService()),
+                new DocumentParseRequestFactory(),
+                new DocumentParseResultMapper()
+        );
+    }
+
+    // Inline-bytes source, so the storage stub is not read by these tests.
+    private DocumentSourceRef buildSource() {
+        DocumentSourceRef sourceRef = DocumentSourceRef.ofBytes("demo.pdf", "pdf-data".getBytes(StandardCharsets.UTF_8));
+        sourceRef.setContentType("application/pdf");
+        sourceRef.setSize(8L);
+        return sourceRef;
+    }
+
+    // Storage stub that serves the fixed payload "pdf-data" for any path.
+    private static class InMemoryFileStorageService implements FileStorageService {
+
+        @Override
+        public String save(org.springframework.web.multipart.MultipartFile file) {
+            return null;
+        }
+
+        @Override
+        public void delete(String path) {
+        }
+
+        @Override
+        public InputStream readStream(String path) {
+            return new ByteArrayInputStream("pdf-data".getBytes(StandardCharsets.UTF_8));
+        }
+
+        @Override
+        public long getFileSize(String path) {
+            return 8L;
+        }
+    }
+
+    // Parse-service fake: records the last request and returns canned responses.
+    private static class FakeDocumentParseService implements DocumentParseService {
+
+        private ParseRequest lastParseRequest;
+        private String taskStatusValue = "running";
+
+        @Override
+        public ParseResponse parse(ParseRequest request) {
+            this.lastParseRequest = request;
+            return buildResponse();
+        }
+
+        @Override
+        public ParseTaskStatus submit(ParseRequest request) {
+            this.lastParseRequest = request;
+            ParseTaskStatus status = new ParseTaskStatus();
+            status.setTaskId("task-1");
+            status.setStatus("submitted");
+            status.setFileNames(Collections.singletonList("demo.pdf"));
+            return status;
+        }
+
+        @Override
+        public ParseTaskStatus queryTask(String taskId) {
+            ParseTaskStatus status = new ParseTaskStatus();
+            status.setTaskId(taskId);
+            status.setStatus(taskStatusValue);
+            status.setFileNames(Collections.singletonList("demo.pdf"));
+            return status;
+        }
+
+        @Override
+        public ParseResponse queryResult(String taskId) {
+            return buildResponse();
+        }
+
+        @Override
+        public ParseTaskInfo queryTaskInfo(String taskId) {
+            ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(queryTask(taskId));
+            // The result is attached only when the configured status is "completed".
+            if ("completed".equals(taskStatusValue)) {
+                taskInfo.setResult(buildResponse());
+            }
+            return taskInfo;
+        }
+
+        private ParseResponse buildResponse() {
+            ParseResult result = new ParseResult();
+            result.setFileName("demo.pdf");
+            result.setMarkdown("# demo");
+            result.setPlainText("demo");
+            ParseResponse response = new ParseResponse();
+            response.setResults(Collections.singletonList(result));
+            return response;
+        }
+    }
+}
diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseRequestFactoryTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseRequestFactoryTest.java new file mode 100644 index 0000000..8e42d82 --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseRequestFactoryTest.java @@ -0,0 +1,57 @@ +package tech.easyflow.ai.document.support; + +import com.easyagents.document.core.model.ParseRequest; +import org.junit.Assert;
+import org.junit.Test;
+import tech.easyflow.ai.document.model.DocumentParseScenario;
+
+/**
+ * Unit tests for {@link DocumentParseRequestFactory}.
+ *
+ * @author Codex
+ * @since 2026-04-14
+ */
+public class DocumentParseRequestFactoryTest {
+
+    /**
+     * The workflow-text scenario requests only the minimal text result.
+     */
+    @Test
+    public void shouldBuildWorkflowTextScenarioRequest() {
+        DocumentParseRequestFactory factory = new DocumentParseRequestFactory();
+
+        ParseRequest request = factory.build(buildSource(), DocumentParseScenario.WORKFLOW_TEXT);
+
+        // Provider-level options stay null so the provider can apply its own defaults.
+        Assert.assertNull(request.getParseMethod());
+        Assert.assertNull(request.getFormulaEnabled());
+        Assert.assertNull(request.getTableEnabled());
+        Assert.assertTrue(request.getReturnMarkdown());
+        Assert.assertFalse(request.getReturnMiddleJson());
+        Assert.assertFalse(request.getReturnContentList());
+        Assert.assertFalse(request.getReturnImages());
+    }
+
+    /**
+     * The knowledge-import scenario keeps the structured artifacts.
+     */
+    @Test
+    public void shouldBuildKnowledgeImportScenarioRequest() {
+        DocumentParseRequestFactory factory = new DocumentParseRequestFactory();
+
+        ParseRequest request = factory.build(buildSource(), DocumentParseScenario.KNOWLEDGE_IMPORT);
+
+        Assert.assertTrue(request.getReturnMarkdown());
+        Assert.assertTrue(request.getReturnMiddleJson());
+        Assert.assertTrue(request.getReturnContentList());
+        Assert.assertTrue(request.getReturnImages());
+    }
+
+    // Minimal loaded source with a non-empty payload.
+    private LoadedDocumentSource buildSource() {
+        LoadedDocumentSource source = new LoadedDocumentSource();
+        source.setFileName("demo.pdf");
+        source.setContentType("application/pdf");
+        source.setContentBytes("pdf-data".getBytes());
+        source.setSize(8L);
+        return source;
+    }
+}
diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseResultMapperTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseResultMapperTest.java new file mode 100644 index 0000000..7fcf9ed --- /dev/null +++
b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseResultMapperTest.java
@@ -0,0 +1,82 @@
+package tech.easyflow.ai.document.support;
+
+import com.easyagents.document.core.model.ParseArtifacts;
+import com.easyagents.document.core.model.ParseResult;
+import com.easyagents.document.core.model.ParseResponse;
+import com.easyagents.document.core.model.ParseTaskInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import tech.easyflow.ai.document.model.DocumentParseTaskInfo;
+import tech.easyflow.ai.document.model.DocumentParsedResult;
+
+import java.util.Collections;
+
+/**
+ * Unit tests for {@link DocumentParseResultMapper}.
+ *
+ * @author Codex
+ * @since 2026-04-14
+ */
+public class DocumentParseResultMapperTest {
+
+    /**
+     * preferredText uses markdown when both markdown and plain text are present.
+     */
+    @Test
+    public void shouldPreferMarkdown() {
+        DocumentParseResultMapper mapper = new DocumentParseResultMapper();
+        ParseResult result = new ParseResult();
+        result.setFileName("demo.pdf");
+        result.setMarkdown("# title");
+        result.setPlainText("plain");
+
+        DocumentParsedResult mapped = mapper.map(result);
+
+        Assert.assertEquals("# title", mapped.getPreferredText());
+    }
+
+    /**
+     * Structured artifacts are copied onto the mapped result.
+     */
+    @Test
+    public void shouldMapArtifacts() {
+        DocumentParseResultMapper mapper = new DocumentParseResultMapper();
+        ParseResult result = new ParseResult();
+        ParseArtifacts artifacts = new ParseArtifacts();
+        artifacts.setMiddleJson(Collections.singletonMap("page", 1));
+        artifacts.setContentList(Collections.singletonList("block"));
+        artifacts.setModelOutput(Collections.singletonMap("raw", "ok"));
+        result.setArtifacts(artifacts);
+
+        DocumentParsedResult mapped = mapper.map(result);
+
+        Assert.assertNotNull(mapped.getArtifacts());
+        Assert.assertEquals(Collections.singletonMap("page", 1), mapped.getArtifacts().getMiddleJson());
+        Assert.assertEquals(Collections.singletonList("block"), mapped.getArtifacts().getContentList());
+        Assert.assertEquals(Collections.singletonMap("raw", "ok"), mapped.getArtifacts().getModelOutput());
+    }
+
+    /**
+     * The aggregated task info is mapped together with its first parse result.
+     */
+    @Test
+    public void shouldMapTaskInfo() {
+        DocumentParseResultMapper mapper = new DocumentParseResultMapper();
+        ParseTaskInfo taskInfo = new ParseTaskInfo();
+        taskInfo.setTaskId("task-1");
+        taskInfo.setStatus("completed");
+
+        ParseResult result = new ParseResult();
+        result.setFileName("demo.pdf");
+        result.setMarkdown("# title");
+        ParseResponse response = new ParseResponse();
+        response.setResults(Collections.singletonList(result));
+        taskInfo.setResult(response);
+
+        DocumentParseTaskInfo mapped = mapper.map(taskInfo);
+
+        Assert.assertEquals("task-1", mapped.getTaskId());
+        Assert.assertNotNull(mapped.getResult());
+        Assert.assertEquals("# title", mapped.getResult().getPreferredText());
+    }
+}
diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentSourceLoaderTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentSourceLoaderTest.java new file mode 100644 index 0000000..85983ca --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentSourceLoaderTest.java @@ -0,0 +1,133 @@ +package tech.easyflow.ai.document.support; + +import org.junit.Assert; +import org.junit.Test; +import tech.easyflow.ai.document.exception.DocumentParseBridgeException; +import tech.easyflow.ai.document.model.DocumentSourceRef; +import tech.easyflow.common.filestorage.FileStorageService; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.InetSocketAddress; +import java.nio.charset.StandardCharsets; +import com.sun.net.httpserver.HttpServer; + +/** + * {@link DocumentSourceLoader} 单元测试。 + * + * @author Codex + * @since 2026-04-14 + */ +public class DocumentSourceLoaderTest { + + /** + * 验证可从 filePath 正常读取文件内容。 + * + * @throws IOException
测试读取异常 + */ + @Test + public void shouldLoadContentFromFilePath() throws IOException { + DocumentSourceLoader loader = new DocumentSourceLoader(new FakeFileStorageService("demo-pdf".getBytes(StandardCharsets.UTF_8))); + DocumentSourceRef sourceRef = DocumentSourceRef.ofPath("/attachment/test/demo.pdf"); + + LoadedDocumentSource loadedSource = loader.load(sourceRef); + + Assert.assertEquals("demo.pdf", loadedSource.getFileName()); + Assert.assertEquals(8L, loadedSource.getSize().longValue()); + Assert.assertArrayEquals("demo-pdf".getBytes(StandardCharsets.UTF_8), loadedSource.getContentBytes()); + } + + /** + * 验证缺少有效来源时抛出明确异常。 + */ + @Test + public void shouldThrowWhenSourceMissing() { + DocumentSourceLoader loader = new DocumentSourceLoader(new FakeFileStorageService(new byte[0])); + + try { + loader.load(new DocumentSourceRef()); + Assert.fail("expected DocumentParseBridgeException"); + } catch (DocumentParseBridgeException e) { + Assert.assertEquals("unsupported_source", e.getCode()); + } + } + + /** + * 验证 filePath 为远端 URL 时不会误走存储读取。 + */ + @Test + public void shouldPreferRemoteDownloadWhenFilePathIsRemoteUrl() throws IOException { + DocumentSourceLoader loader = new DocumentSourceLoader(new FailingFileStorageService()); + HttpServer server = HttpServer.create(new InetSocketAddress(0), 0); + byte[] body = "demo-pdf".getBytes(StandardCharsets.UTF_8); + server.createContext("/demo.pdf", exchange -> { + exchange.sendResponseHeaders(200, body.length); + exchange.getResponseBody().write(body); + exchange.close(); + }); + server.start(); + try { + DocumentSourceRef sourceRef = new DocumentSourceRef(); + sourceRef.setFileName("demo.pdf"); + sourceRef.setFilePath("http://127.0.0.1:" + server.getAddress().getPort() + "/demo.pdf"); + + LoadedDocumentSource loadedSource = loader.load(sourceRef); + + Assert.assertEquals("demo.pdf", loadedSource.getFileName()); + Assert.assertArrayEquals(body, loadedSource.getContentBytes()); + } finally { + server.stop(0); + } + } + + 
/** + * Storage stub that serves the byte array given at construction time from {@code readStream} and reports its length as the file size; {@code save} returns null and {@code delete} does nothing. + */ + private static class FakeFileStorageService implements FileStorageService { + + private final byte[] content; + + private FakeFileStorageService(byte[] content) { + this.content = content; + } + + @Override + public String save(org.springframework.web.multipart.MultipartFile file) { + return null; + } + + @Override + public void delete(String path) { + } + + @Override + public InputStream readStream(String path) { + return new ByteArrayInputStream(content); + } + + @Override + public long getFileSize(String path) { + return content.length; + } + } + + /** + * Storage stub whose {@code readStream} always throws an {@link IOException}, so a test fails if the loader tries to read through storage. + */ + private static class FailingFileStorageService implements FileStorageService { + + @Override + public String save(org.springframework.web.multipart.MultipartFile file) { + return null; + } + + @Override + public void delete(String path) { + } + + @Override + public InputStream readStream(String path) throws IOException { + throw new IOException("should not read remote url from storage"); + } + + @Override + public long getFileSize(String path) { + return 0L; + } + } +} diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java new file mode 100644 index 0000000..b1b2e25 --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java @@ -0,0 +1,278 @@ +package tech.easyflow.ai.node; + +import org.junit.Assert; +import org.junit.Test; +import tech.easyflow.ai.document.model.DocumentParseTaskInfo; +import tech.easyflow.ai.document.model.DocumentParseTaskStatus; +import tech.easyflow.ai.document.model.DocumentParsedResult; +import tech.easyflow.ai.document.model.DocumentSourceRef; +import tech.easyflow.ai.document.service.DocumentParseBridgeService; +import tech.easyflow.common.filestorage.FileStorageService; +import tech.easyflow.common.web.exceptions.BusinessException; + +import 
java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.InetSocketAddress; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import com.sun.net.httpserver.HttpServer; + +/** + * Unit tests for {@link DocNodeFileContentExtractor}. + * + * @author Codex + * @since 2026-04-14 + */ +public class DocNodeFileContentExtractorTest { + + /** + * Verifies that PDF files are routed through the unified document parse bridge service. + */ + @Test + public void shouldUseDocumentBridgeForPdf() { + RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); + DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( + bridgeService, + new FakeFileStorageService(), + new FakeReaderManager("ignored") + ); + + String content = extractor.extract(buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf")); + + Assert.assertEquals("# parsed", content); + Assert.assertNotNull(bridgeService.lastSource); + Assert.assertEquals("demo.pdf", bridgeService.lastSource.getFileName()); + } + + /** + * Verifies that non-PDF files still go through the default reader and never reach the bridge service. + */ + @Test + public void shouldUseDefaultReaderForNonPdf() { + RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); + DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( + bridgeService, + new FakeFileStorageService(), + new FakeReaderManager("plain text") + ); + + String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + + Assert.assertEquals("plain text", content); + Assert.assertNull(bridgeService.lastSource); + } + + /** + * Verifies that a file value without a filePath is rejected with an explicit exception. + */ + @Test + public void shouldRejectMissingFilePath() { + DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( + new RecordingDocumentParseBridgeService(), + new FakeFileStorageService(), + new FakeReaderManager("plain text") + ); + + try { + 
extractor.extract(buildFileValue("demo.pdf", null, "application/pdf")); + Assert.fail("expected BusinessException"); + } catch (BusinessException e) { + Assert.assertEquals("文件输入缺少 filePath", e.getMessage()); + } + } + + /** + * Verifies that an empty PDF parse result fails with an exception instead of falling back to the legacy PDF reading path. + */ + @Test + public void shouldFailWhenPdfParseResultIsEmpty() { + RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); + bridgeService.response.setPreferredText(null); + bridgeService.response.setMarkdown(null); + bridgeService.response.setPlainText(null); + DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( + bridgeService, + new FakeFileStorageService(), + new FakeReaderManager("pdf fallback") + ); + + try { + extractor.extract(buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf")); + Assert.fail("expected BusinessException"); + } catch (BusinessException e) { + Assert.assertEquals("PDF 文档解析结果为空", e.getMessage()); + } + } + + /** + * Verifies that a non-PDF file referenced by a remote resource URL is read from that URL and not mistakenly read from local storage. + */ + @Test + public void shouldReadRemoteUrlForNonPdf() { + RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); + HttpServer server; + try { + server = HttpServer.create(new InetSocketAddress(0), 0); + } catch (IOException e) { + throw new RuntimeException(e); + } + byte[] body = "remote text".getBytes(StandardCharsets.UTF_8); + server.createContext("/demo.docx", exchange -> { + exchange.sendResponseHeaders(200, body.length); + exchange.getResponseBody().write(body); + exchange.close(); + }); + server.start(); + try { + DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( + bridgeService, + new FailingFileStorageService(), + new ReadingReaderManager() + ); + + String content = extractor.extract(buildFileValue( + "demo.docx", + "http://127.0.0.1:" + server.getAddress().getPort() + "/demo.docx", + "" + )); + + Assert.assertEquals("remote text", content); + Assert.assertNull(bridgeService.lastSource); + } finally { 
+ server.stop(0); + } + } + + /** + * Builds the map-shaped file input handed to the extractor: fileName, filePath and contentType as given, a fixed size of 16, and url set to the same value as filePath. + */ + private Map buildFileValue(String fileName, String filePath, String contentType) { + Map value = new HashMap(); + value.put("fileName", fileName); + value.put("filePath", filePath); + value.put("contentType", contentType); + value.put("size", 16L); + value.put("url", filePath); + return value; + } + + /** + * Bridge service stub that records the source passed to {@code parse} in {@code lastSource} and returns a canned result (preferred text and markdown "# parsed", plain text "parsed"); the task-related methods return empty placeholder objects or the same canned result. + */ + private static class RecordingDocumentParseBridgeService implements DocumentParseBridgeService { + + private final DocumentParsedResult response = new DocumentParsedResult(); + private DocumentSourceRef lastSource; + + private RecordingDocumentParseBridgeService() { + response.setPreferredText("# parsed"); + response.setMarkdown("# parsed"); + response.setPlainText("parsed"); + } + + @Override + public DocumentParsedResult parse(DocumentSourceRef source, tech.easyflow.ai.document.model.DocumentParseScenario scenario) { + this.lastSource = source; + return response; + } + + @Override + public DocumentParseTaskStatus submit(DocumentSourceRef source, tech.easyflow.ai.document.model.DocumentParseScenario scenario) { + return new DocumentParseTaskStatus(); + } + + @Override + public DocumentParseTaskStatus queryTask(String taskId) { + return new DocumentParseTaskStatus(); + } + + @Override + public DocumentParsedResult queryResult(String taskId) { + return response; + } + + @Override + public DocumentParseTaskInfo queryTaskInfo(String taskId) { + return new DocumentParseTaskInfo(); + } + } + + /** + * Storage stub whose {@code readStream} always serves the three bytes of "doc" and whose {@code getFileSize} reports 3; both {@code save} overloads return null and {@code delete} does nothing. + */ + private static class FakeFileStorageService implements FileStorageService { + + @Override + public String save(org.springframework.web.multipart.MultipartFile file) { + return null; + } + + @Override + public void delete(String path) { + } + + @Override + public String save(File file, String prePath) { + return null; + } + + @Override + public InputStream readStream(String path) throws IOException { + return new ByteArrayInputStream("doc".getBytes(StandardCharsets.UTF_8)); + } + + @Override + public long getFileSize(String path) { + return 3L; + } + } + 
+ /** + * Reader manager stub whose reader ignores the file name and input stream and returns the content given at construction time. + */ + private static class FakeReaderManager extends ReaderManager { + + private final String content; + + private FakeReaderManager(String content) { + this.content = content; + } + + @Override + public ReadDocService getReader() { + return (fileName, is) -> content; + } + } + + /** + * Reader manager stub whose reader returns the whole input stream decoded as UTF-8, rethrowing any {@link IOException} as a {@link RuntimeException}. + */ + private static class ReadingReaderManager extends ReaderManager { + + @Override + public ReadDocService getReader() { + return (fileName, is) -> { + try { + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } catch (IOException e) { + throw new RuntimeException(e); + } + }; + } + } + + /** + * Storage stub whose {@code readStream} always throws an {@link IOException}, so a test fails if the extractor tries to read through storage. + */ + private static class FailingFileStorageService implements FileStorageService { + + @Override + public String save(org.springframework.web.multipart.MultipartFile file) { + return null; + } + + @Override + public void delete(String path) { + } + + @Override + public String save(File file, String prePath) { + return null; + } + + @Override + public InputStream readStream(String path) throws IOException { + throw new IOException("should not read remote url from storage"); + } + + @Override + public long getFileSize(String path) { + return 0L; + } + } +} diff --git a/easyflow-starter/easyflow-starter-all/src/main/resources/application.yml b/easyflow-starter/easyflow-starter-all/src/main/resources/application.yml index 0efc39c..226a191 100644 --- a/easyflow-starter/easyflow-starter-all/src/main/resources/application.yml +++ b/easyflow-starter/easyflow-starter-all/src/main/resources/application.yml @@ -105,7 +105,7 @@ easyflow: analytical-db: # 是否启用分析数据库 enabled: true - url: jdbc:clickhouse://127.0.0.1:38123/easyflow?jdbc_ignore_unsupported_values=true&socket_timeout=30000&compress=false&ssl=false + url: jdbc:clickhouse://114.66.22.180:38123/easyflow_dev?jdbc_ignore_unsupported_values=true&socket_timeout=30000&compress=false&ssl=false username: easyflow password: 123456 driver-class-name: com.clickhouse.jdbc.ClickHouseDriver @@ -163,6 +163,17 @@ dromara: domain: http://127.0.0.1:39000/easyflow/ 
base-path: attachment +# easy-agents 文档解析统一配置 +easy-agents: + document: + pdf: + provider: mineru + mineru: + # 统一文档解析桥接层直接复用 easy-agents 的 provider 配置,不在 easyflow 再复制一套配置体系 + base-url: https://hub.wust.edu.cn/modelServer/mineru-api + default-lang-list: + - ch + # 自定义节点相关配置 node: # 文件内容提取节点,默认使用简单文档读取器,可自行实现 ReadDocService diff --git a/easyflow-ui-admin/app/src/locales/langs/en-US/aiWorkflow.json b/easyflow-ui-admin/app/src/locales/langs/en-US/aiWorkflow.json index 0af5fdd..0cd47b7 100644 --- a/easyflow-ui-admin/app/src/locales/langs/en-US/aiWorkflow.json +++ b/easyflow-ui-admin/app/src/locales/langs/en-US/aiWorkflow.json @@ -26,7 +26,8 @@ "result": "Result", "confirm": "For contents to be confirmed, please confirm first!", "completed": "Chain has been completed, please start a new one.", - "fileContentExtraction": "FileContentExtraction", + "fileContentExtraction": "Document Parsing", + "documentInput": "FileInput", "documentAddress": "DocumentAddress", "parsedText": "ParsedText", "resourceSync": "ResourceSync", @@ -94,7 +95,8 @@ "stageSave": "Save Check", "stagePreExecute": "Pre-execute Check", "descriptions": { - "fileContentExtraction": "Extract text content from PDF or Word documents, etc", + "fileContentExtraction": "Parse content from PDF, Word, and other document files", + "documentInput": "Upload a file or choose one from resources", "documentAddress": "Document URL address", "parsedText": "Parsed text content", "resourceSync": "Download resource files and save to system resource library", diff --git a/easyflow-ui-admin/app/src/locales/langs/zh-CN/aiWorkflow.json b/easyflow-ui-admin/app/src/locales/langs/zh-CN/aiWorkflow.json index 2a94968..4e098f1 100644 --- a/easyflow-ui-admin/app/src/locales/langs/zh-CN/aiWorkflow.json +++ b/easyflow-ui-admin/app/src/locales/langs/zh-CN/aiWorkflow.json @@ -26,7 +26,8 @@ "result": "执行结果", "confirm": "有待确认的内容,请先确认!", "completed": "流程已执行完毕,请重新发起。", - "fileContentExtraction": "文件内容提取", + "fileContentExtraction": "文档解析", + 
"documentInput": "文件输入", "documentAddress": "文档地址", "parsedText": "解析后的文本", "resourceSync": "素材同步", @@ -94,7 +95,8 @@ "stageSave": "保存校验", "stagePreExecute": "预执行校验", "descriptions": { - "fileContentExtraction": "提取 PDF 或者 Word 等文件中的文字内容", + "fileContentExtraction": "解析 PDF 或 Word 等文档内容", + "documentInput": "上传文件或从素材库中选择文件", "documentAddress": "文档的url地址", "parsedText": "解析后的文本内容", "resourceSync": "下载素材文件并保存到系统素材库", diff --git a/easyflow-ui-admin/app/src/views/ai/workflow/components/WorkflowFileInput.vue b/easyflow-ui-admin/app/src/views/ai/workflow/components/WorkflowFileInput.vue new file mode 100644 index 0000000..b2311b7 --- /dev/null +++ b/easyflow-ui-admin/app/src/views/ai/workflow/components/WorkflowFileInput.vue @@ -0,0 +1,161 @@ + + + + + diff --git a/easyflow-ui-admin/app/src/views/ai/workflow/components/WorkflowFormItem.vue b/easyflow-ui-admin/app/src/views/ai/workflow/components/WorkflowFormItem.vue index 7249d44..dac9e1c 100644 --- a/easyflow-ui-admin/app/src/views/ai/workflow/components/WorkflowFormItem.vue +++ b/easyflow-ui-admin/app/src/views/ai/workflow/components/WorkflowFormItem.vue @@ -10,6 +10,7 @@ import { import { $t } from '#/locales'; import ChooseResource from '#/views/ai/resource/ChooseResource.vue'; +import WorkflowFileInput from '#/views/ai/workflow/components/WorkflowFileInput.vue'; const props = defineProps({ parameters: { @@ -27,10 +28,19 @@ const props = defineProps({ }); const emit = defineEmits(['update:runParams']); function getContentType(item: any) { - return item.contentType || 'text'; + if (item.contentType) { + return item.contentType; + } + if (String(item.dataType || '').toLowerCase() === 'file') { + return 'file'; + } + return 'text'; } function isResource(contentType: any) { - return ['audio', 'file', 'image', 'video'].includes(contentType); + return ['audio', 'image', 'video'].includes(contentType); +} +function isFileContentType(contentType: any) { + return contentType === 'file'; } function getCheckboxOptions(item: 
any) { if (item.enums) { @@ -105,6 +115,12 @@ function choose(data: any, propName: string) { :placeholder="item.formPlaceholder" /> +