feat: 支持工作流文档节点解析 Office 新格式

- DocNode 对 pdf/docx/pptx/xlsx 统一走桥接同步解析
- 修复 DOCX 误回退到 PDF 解析服务的问题并补齐回归测试
2026-04-18 19:52:45 +08:00
parent 4130381658
commit 8546d927bc
6 changed files with 211 additions and 32 deletions
--- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java
+++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java
@@ -222,7 +222,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
            case PDF:
                return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF");
            case DOCX:
-                return requireSpecificService(defaultDocumentParseService, pdfDocumentParseService, "DOCX");
+                return requireSpecificService(defaultDocumentParseService, null, "DOCX");
            case PPTX:
                return requireSpecificService(pptxDocumentParseService, null, "PPTX");
            case XLSX:
--- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java
+++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java
@@ -5,9 +5,24 @@ import tech.easyflow.ai.utils.DocUtil;
 import java.io.InputStream;
 /**
 * 默认文档读取服务。
 *
 * <p>仅在统一文档解析桥接不支持当前文件类型时，作为工作流文档解析节点的回退读取器。</p>
 *
 * @author Codex
 * @since 2026-04-18
 */
@Component("defaultReader")
 public class DefaultReadService implements ReadDocService {
    /**
     * 读取默认支持的文档内容。
     *
     * @param fileName 文件名
     * @param is 文件输入流
     * @return 读取出的文本内容
     */
    @Override
    public String read(String fileName, InputStream is) {
        String suffix = DocUtil.getSuffix(fileName);
--- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java
+++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java
@@ -13,7 +13,7 @@ import java.util.Map;
 /**
 * 工作流文件内容提取节点。
 *
- * <p>节点输入为统一文件对象，PDF 交给统一文档解析桥接服务，
+ * <p>节点输入为统一文件对象，桥接支持的文档类型优先交给统一文档解析服务，
 * 其他类型继续走默认文档读取器。</p>
 *
 * @author Codex
--- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java
+++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java
@@ -7,7 +7,7 @@ import tech.easyflow.ai.document.model.DocumentParseScenario;
 import tech.easyflow.ai.document.model.DocumentParsedResult;
 import tech.easyflow.ai.document.model.DocumentSourceRef;
 import tech.easyflow.ai.document.service.DocumentParseBridgeService;
-import tech.easyflow.ai.utils.DocUtil;
+import tech.easyflow.ai.document.support.DocumentParseSourceType;
 import tech.easyflow.common.filestorage.FileStorageService;
 import tech.easyflow.common.util.StringUtil;
 import tech.easyflow.common.web.exceptions.BusinessException;
@@ -19,8 +19,8 @@ import java.util.Map;
 /**
 * {@link DocNode} 文件内容提取器。
 *
- * <p>负责把工作流运行态中的文件对象转换为统一文档源，并根据文件类型选择
+ * <p>负责把工作流运行态中的文件对象转换为统一文档源，并根据文件类型优先选择
- * 统一文档解析桥接服务或默认读取器。</p>
+ * 统一文档解析桥接服务；仅在桥接不支持时回退到默认读取器。</p>
 *
 * @author Codex
 * @since 2026-04-14
@@ -56,8 +56,8 @@ public class DocNodeFileContentExtractor {
    public String extract(Object fileValue) {
        DocumentSourceRef sourceRef = toDocumentSourceRef(fileValue);
        validateSourceRef(sourceRef);
-        if (isPdf(sourceRef)) {
+        if (shouldUseDocumentBridge(sourceRef)) {
-            return extractPdfContent(sourceRef);
+            return extractBridgeContent(sourceRef);
        }
        return extractDefaultContent(sourceRef);
    }
@@ -96,19 +96,24 @@ public class DocNodeFileContentExtractor {
        }
    }
-    private boolean isPdf(DocumentSourceRef sourceRef) {
+    /**
-        if (StringUtil.hasText(sourceRef.getContentType())
+     * 判断当前文件类型是否应优先走统一文档解析桥接。
-            && sourceRef.getContentType().toLowerCase().contains("pdf")) {
+     *
-            return true;
+     * @param sourceRef 文档源
-        }
+     * @return 是否走桥接
-        String fileName = sourceRef.getFileName();
+     */
-        if (!StringUtil.hasText(fileName) || !fileName.contains(".")) {
+    private boolean shouldUseDocumentBridge(DocumentSourceRef sourceRef) {
-            return false;
+        return DocumentParseSourceType.resolve(sourceRef.getFileName(), sourceRef.getContentType())
-        }
+            != DocumentParseSourceType.UNSUPPORTED;
-        return "pdf".equals(DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName)));
    }
-    private String extractPdfContent(DocumentSourceRef sourceRef) {
+    /**
     * 通过统一文档解析桥接提取主文本结果。
     *
     * @param sourceRef 文档源
     * @return 桥接提取出的主文本
     */
    private String extractBridgeContent(DocumentSourceRef sourceRef) {
        DocumentParsedResult parsedResult = documentParseBridgeService.parse(sourceRef, DocumentParseScenario.WORKFLOW_TEXT);
        String preferredText = parsedResult == null ? null : parsedResult.getPreferredText();
        if (StringUtil.hasText(preferredText)) {
@@ -120,7 +125,7 @@ public class DocNodeFileContentExtractor {
        if (parsedResult != null && StringUtil.hasText(parsedResult.getPlainText())) {
            return parsedResult.getPlainText();
        }
-        throw new BusinessException("PDF 文档解析结果为空");
+        throw new BusinessException("文档解析结果为空");
    }
    private String extractDefaultContent(DocumentSourceRef sourceRef) {
--- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java
+++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java
@@ -101,7 +101,7 @@ public class DocumentParseBridgeServiceImplTest {
    @Test
    public void shouldRoutePptxToDedicatedService() {
        FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService();
-        FakePdfDocumentParseService defaultService = new FakePdfDocumentParseService();
+        FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService);
        DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx",
@@ -112,6 +112,61 @@ public class DocumentParseBridgeServiceImplTest {
        Assert.assertEquals(0, defaultService.parseCallCount);
    }
    /**
     * Verifies that a DOCX source is routed to the default bridge service instead of
     * being sent to the PDF service by mistake.
     */
    @Test
    public void shouldRouteDocxToDefaultBridgeService() {
        FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService();
        FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
        // Supply both a PDF fake and a default fake so a wrong routing decision is observable.
        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(pdfService, null, null, defaultService);
        DocumentParsedResult result = bridgeService.parse(buildSource("demo.docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT);
        // "# docx" is the markdown value that FakeDefaultDocumentParseService sets on its result.
        Assert.assertEquals("# docx", result.getPreferredText());
        // Exactly one call must reach the default fake and none may reach the PDF fake.
        Assert.assertEquals(1, defaultService.parseCallCount);
        Assert.assertEquals(0, pdfService.parseCallCount);
    }
    /**
     * Verifies that parsing a DOCX source fails explicitly when no default bridge service
     * is supplied, rather than falling back to the PDF service.
     */
    @Test
    public void shouldThrowWhenDocxBridgeServiceDisabled() {
        FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService();
        // Only the PDF fake is supplied; every other service argument is null.
        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(pdfService, null, null, null);
        try {
            bridgeService.parse(buildSource("demo.docx",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT);
            Assert.fail("expected DocumentParseBridgeException");
        } catch (DocumentParseBridgeException e) {
            // The failure must carry the "service_not_enabled" code and mention DOCX in its message.
            Assert.assertEquals("service_not_enabled", e.getCode());
            Assert.assertTrue(e.getMessage().contains("DOCX"));
        }
        // The PDF fake must never have been asked to parse the DOCX source.
        Assert.assertEquals(0, pdfService.parseCallCount);
    }
    /**
     * Verifies that parsing an XLSX source fails explicitly when no dedicated XLSX bridge
     * service is supplied.
     */
    @Test
    public void shouldThrowWhenXlsxBridgeServiceDisabled() {
        FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
        // The XLSX service argument (third) is null; only the default fake is supplied.
        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, null, null, defaultService);
        try {
            bridgeService.parse(buildSource("table.xlsx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), DocumentParseScenario.WORKFLOW_TEXT);
            Assert.fail("expected DocumentParseBridgeException");
        } catch (DocumentParseBridgeException e) {
            // The failure must carry the "service_not_enabled" code and mention XLSX in its message.
            Assert.assertEquals("service_not_enabled", e.getCode());
            Assert.assertTrue(e.getMessage().contains("XLSX"));
        }
        // The default fake must not have been used as a fallback for XLSX.
        Assert.assertEquals(0, defaultService.parseCallCount);
    }
    private DocumentParseBridgeServiceImpl buildBridgeService(PdfDocumentParseService pdfDocumentParseService,
                                                              PptxDocumentParseService pptxDocumentParseService,
                                                              XlsxDocumentParseService xlsxDocumentParseService,
@@ -217,6 +272,45 @@ public class DocumentParseBridgeServiceImplTest {
        }
    }
    /**
     * Test double for a {@code DocumentParseService<ParseRequest>}.
     *
     * <p>Counts {@code parse} calls, remembers the last request, and always answers with a
     * single fixed result. Every other operation throws
     * {@link UnsupportedOperationException}.</p>
     */
    private static class FakeDefaultDocumentParseService implements DocumentParseService<ParseRequest> {
        // Most recent request passed to parse(); stays null until parse() is called.
        private ParseRequest lastParseRequest;
        // Number of parse() invocations so far.
        private int parseCallCount;
        /**
         * Records the call and returns a response holding one fixed result.
         *
         * @param request parse request; stored in {@code lastParseRequest}
         * @return response whose only result has file name "demo.docx", markdown "# docx"
         *         and plain text "docx"
         */
        @Override
        public ParseResponse parse(ParseRequest request) {
            parseCallCount++;
            lastParseRequest = request;
            ParseResult result = new ParseResult();
            result.setFileName("demo.docx");
            result.setMarkdown("# docx");
            result.setPlainText("docx");
            ParseResponse response = new ParseResponse();
            response.setResults(Collections.singletonList(result));
            return response;
        }
        /** Not supported by this fake; always throws. */
        @Override
        public ParseTaskStatus submit(ParseRequest request) {
            throw new UnsupportedOperationException();
        }
        /** Not supported by this fake; always throws. */
        @Override
        public ParseTaskStatus queryTask(String taskId) {
            throw new UnsupportedOperationException();
        }
        /** Not supported by this fake; always throws. */
        @Override
        public ParseResponse queryResult(String taskId) {
            throw new UnsupportedOperationException();
        }
        /** Not supported by this fake; always throws. */
        @Override
        public ParseTaskInfo queryTaskInfo(String taskId) {
            throw new UnsupportedOperationException();
        }
    }
    private static class FakePptxDocumentParseService implements PptxDocumentParseService {
        private int parseCallCount;
--- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java
+++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java
@@ -48,10 +48,75 @@ public class DocNodeFileContentExtractorTest {
    }
    /**
-     * 验证非 PDF 文件会继续走默认读取器。
+     * 验证 DOCX 文件会走统一文档解析桥接服务。
     */
    @Test
-    public void shouldUseDefaultReaderForNonPdf() {
+    public void shouldUseDocumentBridgeForDocx() {
        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
        DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
            bridgeService,
            new FakeFileStorageService(),
            new FakeReaderManager("ignored")
        );
        String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
        Assert.assertEquals("# parsed", content);
        Assert.assertNotNull(bridgeService.lastSource);
        Assert.assertEquals("demo.docx", bridgeService.lastSource.getFileName());
    }
    /**
     * Verifies that a PPTX file is extracted through the unified document parse bridge service.
     */
    @Test
    public void shouldUseDocumentBridgeForPptx() {
        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
        // NOTE(review): "ignored" is presumably the default reader's content, which must not
        // appear in the result when the bridge is used - confirm against FakeReaderManager.
        DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
            bridgeService,
            new FakeFileStorageService(),
            new FakeReaderManager("ignored")
        );
        String content = extractor.extract(buildFileValue(
            "slides.pptx",
            "/files/slides.pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        ));
        // NOTE(review): "# parsed" is presumably the text returned by the recording bridge,
        // which is defined outside this view - confirm.
        Assert.assertEquals("# parsed", content);
        // The bridge must have received a source carrying the original file name.
        Assert.assertNotNull(bridgeService.lastSource);
        Assert.assertEquals("slides.pptx", bridgeService.lastSource.getFileName());
    }
    /**
     * Verifies that an XLSX file is extracted through the unified document parse bridge service.
     */
    @Test
    public void shouldUseDocumentBridgeForXlsx() {
        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
        // NOTE(review): "ignored" is presumably the default reader's content, which must not
        // appear in the result when the bridge is used - confirm against FakeReaderManager.
        DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
            bridgeService,
            new FakeFileStorageService(),
            new FakeReaderManager("ignored")
        );
        String content = extractor.extract(buildFileValue(
            "table.xlsx",
            "/files/table.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ));
        // NOTE(review): "# parsed" is presumably the text returned by the recording bridge,
        // which is defined outside this view - confirm.
        Assert.assertEquals("# parsed", content);
        // The bridge must have received a source carrying the original file name.
        Assert.assertNotNull(bridgeService.lastSource);
        Assert.assertEquals("table.xlsx", bridgeService.lastSource.getFileName());
    }
    /**
     * 验证非桥接类型文件会继续走默认读取器。
     */
    @Test
    public void shouldUseDefaultReaderForUnsupportedType() {
        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
        DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
            bridgeService,
@@ -59,7 +124,7 @@ public class DocNodeFileContentExtractorTest {
            new FakeReaderManager("plain text")
        );
-        String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+        String content = extractor.extract(buildFileValue("note.txt", "/files/note.txt", "text/plain"));
        Assert.assertEquals("plain text", content);
        Assert.assertNull(bridgeService.lastSource);
@@ -85,10 +150,10 @@ public class DocNodeFileContentExtractorTest {
    }
    /**
-     * 验证解析结果为空时不会回退旧 PDF 读取链路。
+     * 验证桥接解析结果为空时不会回退旧读取链路。
     */
    @Test
-    public void shouldFailWhenPdfParseResultIsEmpty() {
+    public void shouldFailWhenBridgeParseResultIsEmpty() {
        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
        bridgeService.response.setPreferredText(null);
        bridgeService.response.setMarkdown(null);
@@ -103,15 +168,15 @@ public class DocNodeFileContentExtractorTest {
            extractor.extract(buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf"));
            Assert.fail("expected BusinessException");
        } catch (BusinessException e) {
-            Assert.assertEquals("PDF 文档解析结果为空", e.getMessage());
+            Assert.assertEquals("文档解析结果为空", e.getMessage());
        }
    }
    /**
-     * 验证远端素材 URL 的非 PDF 文件不会误走本地存储读取。
+     * 验证远端素材 URL 的非桥接文件不会误走本地存储读取。
     */
    @Test
-    public void shouldReadRemoteUrlForNonPdf() {
+    public void shouldReadRemoteUrlForUnsupportedType() {
        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
        HttpServer server;
        try {
@@ -120,7 +185,7 @@ public class DocNodeFileContentExtractorTest {
            throw new RuntimeException(e);
        }
        byte[] body = "remote text".getBytes(StandardCharsets.UTF_8);
-        server.createContext("/demo.docx", exchange -> {
+        server.createContext("/note.txt", exchange -> {
            exchange.sendResponseHeaders(200, body.length);
            exchange.getResponseBody().write(body);
            exchange.close();
@@ -134,9 +199,9 @@ public class DocNodeFileContentExtractorTest {
            );
            String content = extractor.extract(buildFileValue(
-                "demo.docx",
+                "note.txt",
-                "http://127.0.0.1:" + server.getAddress().getPort() + "/demo.docx",
+                "http://127.0.0.1:" + server.getAddress().getPort() + "/note.txt",
-                ""
+                "text/plain"
            ));
            Assert.assertEquals("remote text", content);