feat: 支持工作流文档节点解析 Office 新格式

- DocNode 对 pdf/docx/pptx/xlsx 统一走桥接同步解析
- 修复 DOCX 误回退到 PDF 解析服务的问题并补齐回归测试
2026-04-18 19:52:45 +08:00
parent 4130381658
commit 8546d927bc
6 changed files with 211 additions and 32 deletions
--- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java
+++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java
@@ -222,7 +222,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
            case PDF:
                return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF");
            case DOCX:
-                return requireSpecificService(defaultDocumentParseService, pdfDocumentParseService, "DOCX");
+                return requireSpecificService(defaultDocumentParseService, null, "DOCX");
            case PPTX:
                return requireSpecificService(pptxDocumentParseService, null, "PPTX");
            case XLSX:
--- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java
+++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java
@@ -5,9 +5,24 @@ import tech.easyflow.ai.utils.DocUtil;

 import java.io.InputStream;

+/**
+ * 默认文档读取服务。
+ *
+ * <p>仅在统一文档解析桥接不支持当前文件类型时，作为工作流文档解析节点的回退读取器。</p>
+ *
+ * @author Codex
+ * @since 2026-04-18
+ */
@Component("defaultReader")
 public class DefaultReadService implements ReadDocService {

+    /**
+     * 读取默认支持的文档内容。
+     *
+     * @param fileName 文件名
+     * @param is 文件输入流
+     * @return 读取出的文本内容
+     */
    @Override
    public String read(String fileName, InputStream is) {
        String suffix = DocUtil.getSuffix(fileName);
--- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java
+++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java
@@ -13,7 +13,7 @@ import java.util.Map;
 /**
 * 工作流文件内容提取节点。
 *
- * <p>节点输入为统一文件对象，PDF 交给统一文档解析桥接服务，
+ * <p>节点输入为统一文件对象，桥接支持的文档类型优先交给统一文档解析服务，
 * 其他类型继续走默认文档读取器。</p>
 *
 * @author Codex
--- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java
+++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java
@@ -7,7 +7,7 @@ import tech.easyflow.ai.document.model.DocumentParseScenario;
 import tech.easyflow.ai.document.model.DocumentParsedResult;
 import tech.easyflow.ai.document.model.DocumentSourceRef;
 import tech.easyflow.ai.document.service.DocumentParseBridgeService;
-import tech.easyflow.ai.utils.DocUtil;
+import tech.easyflow.ai.document.support.DocumentParseSourceType;
 import tech.easyflow.common.filestorage.FileStorageService;
 import tech.easyflow.common.util.StringUtil;
 import tech.easyflow.common.web.exceptions.BusinessException;
@@ -19,8 +19,8 @@ import java.util.Map;
 /**
 * {@link DocNode} 文件内容提取器。
 *
- * <p>负责把工作流运行态中的文件对象转换为统一文档源，并根据文件类型选择
- * 统一文档解析桥接服务或默认读取器。</p>
+ * <p>负责把工作流运行态中的文件对象转换为统一文档源，并根据文件类型优先选择
+ * 统一文档解析桥接服务；仅在桥接不支持时回退到默认读取器。</p>
 *
 * @author Codex
 * @since 2026-04-14
@@ -56,8 +56,8 @@ public class DocNodeFileContentExtractor {
    public String extract(Object fileValue) {
        DocumentSourceRef sourceRef = toDocumentSourceRef(fileValue);
        validateSourceRef(sourceRef);
-        if (isPdf(sourceRef)) {
-            return extractPdfContent(sourceRef);
+        if (shouldUseDocumentBridge(sourceRef)) {
+            return extractBridgeContent(sourceRef);
        }
        return extractDefaultContent(sourceRef);
    }
@@ -96,19 +96,24 @@ public class DocNodeFileContentExtractor {
        }
    }

-    private boolean isPdf(DocumentSourceRef sourceRef) {
-        if (StringUtil.hasText(sourceRef.getContentType())
-            && sourceRef.getContentType().toLowerCase().contains("pdf")) {
-            return true;
-        }
-        String fileName = sourceRef.getFileName();
-        if (!StringUtil.hasText(fileName) || !fileName.contains(".")) {
-            return false;
-        }
-        return "pdf".equals(DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName)));
+    /**
+     * 判断当前文件类型是否应优先走统一文档解析桥接。
+     *
+     * @param sourceRef 文档源
+     * @return 是否走桥接
+     */
+    private boolean shouldUseDocumentBridge(DocumentSourceRef sourceRef) {
+        return DocumentParseSourceType.resolve(sourceRef.getFileName(), sourceRef.getContentType())
+            != DocumentParseSourceType.UNSUPPORTED;
    }

-    private String extractPdfContent(DocumentSourceRef sourceRef) {
+    /**
+     * 通过统一文档解析桥接提取主文本结果。
+     *
+     * @param sourceRef 文档源
+     * @return 桥接提取出的主文本
+     */
+    private String extractBridgeContent(DocumentSourceRef sourceRef) {
        DocumentParsedResult parsedResult = documentParseBridgeService.parse(sourceRef, DocumentParseScenario.WORKFLOW_TEXT);
        String preferredText = parsedResult == null ? null : parsedResult.getPreferredText();
        if (StringUtil.hasText(preferredText)) {
@@ -120,7 +125,7 @@ public class DocNodeFileContentExtractor {
        if (parsedResult != null && StringUtil.hasText(parsedResult.getPlainText())) {
            return parsedResult.getPlainText();
        }
-        throw new BusinessException("PDF 文档解析结果为空");
+        throw new BusinessException("文档解析结果为空");
    }

    private String extractDefaultContent(DocumentSourceRef sourceRef) {
--- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java
+++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java
@@ -101,7 +101,7 @@ public class DocumentParseBridgeServiceImplTest {
    @Test
    public void shouldRoutePptxToDedicatedService() {
        FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService();
-        FakePdfDocumentParseService defaultService = new FakePdfDocumentParseService();
+        FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService);

        DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx",
@@ -112,6 +112,61 @@ public class DocumentParseBridgeServiceImplTest {
        Assert.assertEquals(0, defaultService.parseCallCount);
    }

+    /**
+     * 验证 DOCX 会路由到默认桥接服务，而不是误走 PDF 服务。
+     */
+    @Test
+    public void shouldRouteDocxToDefaultBridgeService() {
+        FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService();
+        FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
+        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(pdfService, null, null, defaultService);
+
+        DocumentParsedResult result = bridgeService.parse(buildSource("demo.docx",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT);
+
+        Assert.assertEquals("# docx", result.getPreferredText());
+        Assert.assertEquals(1, defaultService.parseCallCount);
+        Assert.assertEquals(0, pdfService.parseCallCount);
+    }
+
+    /**
+     * 验证 DOCX 在缺少默认桥接服务时会明确失败，而不是退到 PDF 服务。
+     */
+    @Test
+    public void shouldThrowWhenDocxBridgeServiceDisabled() {
+        FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService();
+        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(pdfService, null, null, null);
+
+        try {
+            bridgeService.parse(buildSource("demo.docx",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT);
+            Assert.fail("expected DocumentParseBridgeException");
+        } catch (DocumentParseBridgeException e) {
+            Assert.assertEquals("service_not_enabled", e.getCode());
+            Assert.assertTrue(e.getMessage().contains("DOCX"));
+        }
+        Assert.assertEquals(0, pdfService.parseCallCount);
+    }
+
+    /**
+     * 验证 XLSX 在缺少专用桥接服务时会明确失败。
+     */
+    @Test
+    public void shouldThrowWhenXlsxBridgeServiceDisabled() {
+        FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
+        DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, null, null, defaultService);
+
+        try {
+            bridgeService.parse(buildSource("table.xlsx",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), DocumentParseScenario.WORKFLOW_TEXT);
+            Assert.fail("expected DocumentParseBridgeException");
+        } catch (DocumentParseBridgeException e) {
+            Assert.assertEquals("service_not_enabled", e.getCode());
+            Assert.assertTrue(e.getMessage().contains("XLSX"));
+        }
+        Assert.assertEquals(0, defaultService.parseCallCount);
+    }
+
    private DocumentParseBridgeServiceImpl buildBridgeService(PdfDocumentParseService pdfDocumentParseService,
                                                              PptxDocumentParseService pptxDocumentParseService,
                                                              XlsxDocumentParseService xlsxDocumentParseService,
@@ -217,6 +272,45 @@ public class DocumentParseBridgeServiceImplTest {
        }
    }

+    private static class FakeDefaultDocumentParseService implements DocumentParseService<ParseRequest> {
+
+        private ParseRequest lastParseRequest;
+        private int parseCallCount;
+
+        @Override
+        public ParseResponse parse(ParseRequest request) {
+            parseCallCount++;
+            lastParseRequest = request;
+            ParseResult result = new ParseResult();
+            result.setFileName("demo.docx");
+            result.setMarkdown("# docx");
+            result.setPlainText("docx");
+            ParseResponse response = new ParseResponse();
+            response.setResults(Collections.singletonList(result));
+            return response;
+        }
+
+        @Override
+        public ParseTaskStatus submit(ParseRequest request) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public ParseTaskStatus queryTask(String taskId) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public ParseResponse queryResult(String taskId) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public ParseTaskInfo queryTaskInfo(String taskId) {
+            throw new UnsupportedOperationException();
+        }
+    }
+
    private static class FakePptxDocumentParseService implements PptxDocumentParseService {

        private int parseCallCount;
--- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java
+++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java
@@ -48,10 +48,75 @@ public class DocNodeFileContentExtractorTest {
    }

    /**
-     * 验证非 PDF 文件会继续走默认读取器。
+     * 验证 DOCX 文件会走统一文档解析桥接服务。
     */
    @Test
-    public void shouldUseDefaultReaderForNonPdf() {
+    public void shouldUseDocumentBridgeForDocx() {
+        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
+        DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
+            bridgeService,
+            new FakeFileStorageService(),
+            new FakeReaderManager("ignored")
+        );
+
+        String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+
+        Assert.assertEquals("# parsed", content);
+        Assert.assertNotNull(bridgeService.lastSource);
+        Assert.assertEquals("demo.docx", bridgeService.lastSource.getFileName());
+    }
+
+    /**
+     * 验证 PPTX 文件会走统一文档解析桥接服务。
+     */
+    @Test
+    public void shouldUseDocumentBridgeForPptx() {
+        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
+        DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
+            bridgeService,
+            new FakeFileStorageService(),
+            new FakeReaderManager("ignored")
+        );
+
+        String content = extractor.extract(buildFileValue(
+            "slides.pptx",
+            "/files/slides.pptx",
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+        ));
+
+        Assert.assertEquals("# parsed", content);
+        Assert.assertNotNull(bridgeService.lastSource);
+        Assert.assertEquals("slides.pptx", bridgeService.lastSource.getFileName());
+    }
+
+    /**
+     * 验证 XLSX 文件会走统一文档解析桥接服务。
+     */
+    @Test
+    public void shouldUseDocumentBridgeForXlsx() {
+        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
+        DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
+            bridgeService,
+            new FakeFileStorageService(),
+            new FakeReaderManager("ignored")
+        );
+
+        String content = extractor.extract(buildFileValue(
+            "table.xlsx",
+            "/files/table.xlsx",
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+        ));
+
+        Assert.assertEquals("# parsed", content);
+        Assert.assertNotNull(bridgeService.lastSource);
+        Assert.assertEquals("table.xlsx", bridgeService.lastSource.getFileName());
+    }
+
+    /**
+     * 验证非桥接类型文件会继续走默认读取器。
+     */
+    @Test
+    public void shouldUseDefaultReaderForUnsupportedType() {
        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
        DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
            bridgeService,
@@ -59,7 +124,7 @@ public class DocNodeFileContentExtractorTest {
            new FakeReaderManager("plain text")
        );

-        String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+        String content = extractor.extract(buildFileValue("note.txt", "/files/note.txt", "text/plain"));

        Assert.assertEquals("plain text", content);
        Assert.assertNull(bridgeService.lastSource);
@@ -85,10 +150,10 @@ public class DocNodeFileContentExtractorTest {
    }

    /**
-     * 验证解析结果为空时不会回退旧 PDF 读取链路。
+     * 验证桥接解析结果为空时不会回退旧读取链路。
     */
    @Test
-    public void shouldFailWhenPdfParseResultIsEmpty() {
+    public void shouldFailWhenBridgeParseResultIsEmpty() {
        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
        bridgeService.response.setPreferredText(null);
        bridgeService.response.setMarkdown(null);
@@ -103,15 +168,15 @@ public class DocNodeFileContentExtractorTest {
            extractor.extract(buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf"));
            Assert.fail("expected BusinessException");
        } catch (BusinessException e) {
-            Assert.assertEquals("PDF 文档解析结果为空", e.getMessage());
+            Assert.assertEquals("文档解析结果为空", e.getMessage());
        }
    }

    /**
-     * 验证远端素材 URL 的非 PDF 文件不会误走本地存储读取。
+     * 验证远端素材 URL 的非桥接文件不会误走本地存储读取。
     */
    @Test
-    public void shouldReadRemoteUrlForNonPdf() {
+    public void shouldReadRemoteUrlForUnsupportedType() {
        RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
        HttpServer server;
        try {
@@ -120,7 +185,7 @@ public class DocNodeFileContentExtractorTest {
            throw new RuntimeException(e);
        }
        byte[] body = "remote text".getBytes(StandardCharsets.UTF_8);
-        server.createContext("/demo.docx", exchange -> {
+        server.createContext("/note.txt", exchange -> {
            exchange.sendResponseHeaders(200, body.length);
            exchange.getResponseBody().write(body);
            exchange.close();
@@ -134,9 +199,9 @@ public class DocNodeFileContentExtractorTest {
            );

            String content = extractor.extract(buildFileValue(
-                "demo.docx",
-                "http://127.0.0.1:" + server.getAddress().getPort() + "/demo.docx",
-                ""
+                "note.txt",
+                "http://127.0.0.1:" + server.getAddress().getPort() + "/note.txt",
+                "text/plain"
            ));

            Assert.assertEquals("remote text", content);