diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java index 1f63a32..d0dc9f0 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java @@ -222,7 +222,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic case PDF: return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF"); case DOCX: - return requireSpecificService(defaultDocumentParseService, pdfDocumentParseService, "DOCX"); + return requireSpecificService(defaultDocumentParseService, null, "DOCX"); case PPTX: return requireSpecificService(pptxDocumentParseService, null, "PPTX"); case XLSX: diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java index 726baa7..91d6965 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java @@ -5,9 +5,24 @@ import tech.easyflow.ai.utils.DocUtil; import java.io.InputStream; +/** + * 默认文档读取服务。 + * + *

<p>仅在统一文档解析桥接不支持当前文件类型时,作为工作流文档解析节点的回退读取器。</p>

+ * + * @author Codex + * @since 2026-04-18 + */ @Component("defaultReader") public class DefaultReadService implements ReadDocService { + /** + * 读取默认支持的文档内容。 + * + * @param fileName 文件名 + * @param is 文件输入流 + * @return 读取出的文本内容 + */ @Override public String read(String fileName, InputStream is) { String suffix = DocUtil.getSuffix(fileName); diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java index 8ed4403..62f5bd1 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java @@ -13,7 +13,7 @@ import java.util.Map; /** * 工作流文件内容提取节点。 * - *

<p>节点输入为统一文件对象,PDF 交给统一文档解析桥接服务, + *

<p>节点输入为统一文件对象,桥接支持的文档类型优先交给统一文档解析服务, * 其他类型继续走默认文档读取器。</p>

* * @author Codex diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java index b38ee88..53de2e6 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java @@ -7,7 +7,7 @@ import tech.easyflow.ai.document.model.DocumentParseScenario; import tech.easyflow.ai.document.model.DocumentParsedResult; import tech.easyflow.ai.document.model.DocumentSourceRef; import tech.easyflow.ai.document.service.DocumentParseBridgeService; -import tech.easyflow.ai.utils.DocUtil; +import tech.easyflow.ai.document.support.DocumentParseSourceType; import tech.easyflow.common.filestorage.FileStorageService; import tech.easyflow.common.util.StringUtil; import tech.easyflow.common.web.exceptions.BusinessException; @@ -19,8 +19,8 @@ import java.util.Map; /** * {@link DocNode} 文件内容提取器。 * - *

<p>负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型选择 - * 统一文档解析桥接服务或默认读取器。</p>

+ *

<p>负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型优先选择 + * 统一文档解析桥接服务;仅在桥接不支持时回退到默认读取器。</p>

* * @author Codex * @since 2026-04-14 @@ -56,8 +56,8 @@ public class DocNodeFileContentExtractor { public String extract(Object fileValue) { DocumentSourceRef sourceRef = toDocumentSourceRef(fileValue); validateSourceRef(sourceRef); - if (isPdf(sourceRef)) { - return extractPdfContent(sourceRef); + if (shouldUseDocumentBridge(sourceRef)) { + return extractBridgeContent(sourceRef); } return extractDefaultContent(sourceRef); } @@ -96,19 +96,24 @@ public class DocNodeFileContentExtractor { } } - private boolean isPdf(DocumentSourceRef sourceRef) { - if (StringUtil.hasText(sourceRef.getContentType()) - && sourceRef.getContentType().toLowerCase().contains("pdf")) { - return true; - } - String fileName = sourceRef.getFileName(); - if (!StringUtil.hasText(fileName) || !fileName.contains(".")) { - return false; - } - return "pdf".equals(DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName))); + /** + * 判断当前文件类型是否应优先走统一文档解析桥接。 + * + * @param sourceRef 文档源 + * @return 是否走桥接 + */ + private boolean shouldUseDocumentBridge(DocumentSourceRef sourceRef) { + return DocumentParseSourceType.resolve(sourceRef.getFileName(), sourceRef.getContentType()) + != DocumentParseSourceType.UNSUPPORTED; } - private String extractPdfContent(DocumentSourceRef sourceRef) { + /** + * 通过统一文档解析桥接提取主文本结果。 + * + * @param sourceRef 文档源 + * @return 桥接提取出的主文本 + */ + private String extractBridgeContent(DocumentSourceRef sourceRef) { DocumentParsedResult parsedResult = documentParseBridgeService.parse(sourceRef, DocumentParseScenario.WORKFLOW_TEXT); String preferredText = parsedResult == null ? 
null : parsedResult.getPreferredText(); if (StringUtil.hasText(preferredText)) { @@ -120,7 +125,7 @@ public class DocNodeFileContentExtractor { if (parsedResult != null && StringUtil.hasText(parsedResult.getPlainText())) { return parsedResult.getPlainText(); } - throw new BusinessException("PDF 文档解析结果为空"); + throw new BusinessException("文档解析结果为空"); } private String extractDefaultContent(DocumentSourceRef sourceRef) { diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java index 1074d4a..548785b 100644 --- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java @@ -101,7 +101,7 @@ public class DocumentParseBridgeServiceImplTest { @Test public void shouldRoutePptxToDedicatedService() { FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService(); - FakePdfDocumentParseService defaultService = new FakePdfDocumentParseService(); + FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService(); DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService); DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx", @@ -112,6 +112,61 @@ public class DocumentParseBridgeServiceImplTest { Assert.assertEquals(0, defaultService.parseCallCount); } + /** + * 验证 DOCX 会路由到默认桥接服务,而不是误走 PDF 服务。 + */ + @Test + public void shouldRouteDocxToDefaultBridgeService() { + FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService(); + FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService(); + DocumentParseBridgeServiceImpl 
bridgeService = buildBridgeService(pdfService, null, null, defaultService); + + DocumentParsedResult result = bridgeService.parse(buildSource("demo.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT); + + Assert.assertEquals("# docx", result.getPreferredText()); + Assert.assertEquals(1, defaultService.parseCallCount); + Assert.assertEquals(0, pdfService.parseCallCount); + } + + /** + * 验证 DOCX 在缺少默认桥接服务时会明确失败,而不是退到 PDF 服务。 + */ + @Test + public void shouldThrowWhenDocxBridgeServiceDisabled() { + FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService(); + DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(pdfService, null, null, null); + + try { + bridgeService.parse(buildSource("demo.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT); + Assert.fail("expected DocumentParseBridgeException"); + } catch (DocumentParseBridgeException e) { + Assert.assertEquals("service_not_enabled", e.getCode()); + Assert.assertTrue(e.getMessage().contains("DOCX")); + } + Assert.assertEquals(0, pdfService.parseCallCount); + } + + /** + * 验证 XLSX 在缺少专用桥接服务时会明确失败。 + */ + @Test + public void shouldThrowWhenXlsxBridgeServiceDisabled() { + FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService(); + DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, null, null, defaultService); + + try { + bridgeService.parse(buildSource("table.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), DocumentParseScenario.WORKFLOW_TEXT); + Assert.fail("expected DocumentParseBridgeException"); + } catch (DocumentParseBridgeException e) { + Assert.assertEquals("service_not_enabled", e.getCode()); + Assert.assertTrue(e.getMessage().contains("XLSX")); + } + Assert.assertEquals(0, defaultService.parseCallCount); + } + private DocumentParseBridgeServiceImpl 
buildBridgeService(PdfDocumentParseService pdfDocumentParseService, PptxDocumentParseService pptxDocumentParseService, XlsxDocumentParseService xlsxDocumentParseService, @@ -217,6 +272,45 @@ public class DocumentParseBridgeServiceImplTest { } } + private static class FakeDefaultDocumentParseService implements DocumentParseService { + + private ParseRequest lastParseRequest; + private int parseCallCount; + + @Override + public ParseResponse parse(ParseRequest request) { + parseCallCount++; + lastParseRequest = request; + ParseResult result = new ParseResult(); + result.setFileName("demo.docx"); + result.setMarkdown("# docx"); + result.setPlainText("docx"); + ParseResponse response = new ParseResponse(); + response.setResults(Collections.singletonList(result)); + return response; + } + + @Override + public ParseTaskStatus submit(ParseRequest request) { + throw new UnsupportedOperationException(); + } + + @Override + public ParseTaskStatus queryTask(String taskId) { + throw new UnsupportedOperationException(); + } + + @Override + public ParseResponse queryResult(String taskId) { + throw new UnsupportedOperationException(); + } + + @Override + public ParseTaskInfo queryTaskInfo(String taskId) { + throw new UnsupportedOperationException(); + } + } + private static class FakePptxDocumentParseService implements PptxDocumentParseService { private int parseCallCount; diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java index b1b2e25..2814a48 100644 --- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/node/DocNodeFileContentExtractorTest.java @@ -48,10 +48,75 @@ public class DocNodeFileContentExtractorTest { } /** - * 验证非 PDF 文件会继续走默认读取器。 + * 验证 DOCX 文件会走统一文档解析桥接服务。 
*/ @Test - public void shouldUseDefaultReaderForNonPdf() { + public void shouldUseDocumentBridgeForDocx() { + RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); + DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( + bridgeService, + new FakeFileStorageService(), + new FakeReaderManager("ignored") + ); + + String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + + Assert.assertEquals("# parsed", content); + Assert.assertNotNull(bridgeService.lastSource); + Assert.assertEquals("demo.docx", bridgeService.lastSource.getFileName()); + } + + /** + * 验证 PPTX 文件会走统一文档解析桥接服务。 + */ + @Test + public void shouldUseDocumentBridgeForPptx() { + RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); + DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( + bridgeService, + new FakeFileStorageService(), + new FakeReaderManager("ignored") + ); + + String content = extractor.extract(buildFileValue( + "slides.pptx", + "/files/slides.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation" + )); + + Assert.assertEquals("# parsed", content); + Assert.assertNotNull(bridgeService.lastSource); + Assert.assertEquals("slides.pptx", bridgeService.lastSource.getFileName()); + } + + /** + * 验证 XLSX 文件会走统一文档解析桥接服务。 + */ + @Test + public void shouldUseDocumentBridgeForXlsx() { + RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); + DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( + bridgeService, + new FakeFileStorageService(), + new FakeReaderManager("ignored") + ); + + String content = extractor.extract(buildFileValue( + "table.xlsx", + "/files/table.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + )); + + Assert.assertEquals("# parsed", 
content); + Assert.assertNotNull(bridgeService.lastSource); + Assert.assertEquals("table.xlsx", bridgeService.lastSource.getFileName()); + } + + /** + * 验证非桥接类型文件会继续走默认读取器。 + */ + @Test + public void shouldUseDefaultReaderForUnsupportedType() { RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( bridgeService, @@ -59,7 +124,7 @@ public class DocNodeFileContentExtractorTest { new FakeReaderManager("plain text") ); - String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + String content = extractor.extract(buildFileValue("note.txt", "/files/note.txt", "text/plain")); Assert.assertEquals("plain text", content); Assert.assertNull(bridgeService.lastSource); @@ -85,10 +150,10 @@ public class DocNodeFileContentExtractorTest { } /** - * 验证解析结果为空时不会回退旧 PDF 读取链路。 + * 验证桥接解析结果为空时不会回退旧读取链路。 */ @Test - public void shouldFailWhenPdfParseResultIsEmpty() { + public void shouldFailWhenBridgeParseResultIsEmpty() { RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); bridgeService.response.setPreferredText(null); bridgeService.response.setMarkdown(null); @@ -103,15 +168,15 @@ public class DocNodeFileContentExtractorTest { extractor.extract(buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf")); Assert.fail("expected BusinessException"); } catch (BusinessException e) { - Assert.assertEquals("PDF 文档解析结果为空", e.getMessage()); + Assert.assertEquals("文档解析结果为空", e.getMessage()); } } /** - * 验证远端素材 URL 的非 PDF 文件不会误走本地存储读取。 + * 验证远端素材 URL 的非桥接文件不会误走本地存储读取。 */ @Test - public void shouldReadRemoteUrlForNonPdf() { + public void shouldReadRemoteUrlForUnsupportedType() { RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); HttpServer server; try { @@ -120,7 +185,7 @@ public class 
DocNodeFileContentExtractorTest { throw new RuntimeException(e); } byte[] body = "remote text".getBytes(StandardCharsets.UTF_8); - server.createContext("/demo.docx", exchange -> { + server.createContext("/note.txt", exchange -> { exchange.sendResponseHeaders(200, body.length); exchange.getResponseBody().write(body); exchange.close(); @@ -134,9 +199,9 @@ public class DocNodeFileContentExtractorTest { ); String content = extractor.extract(buildFileValue( - "demo.docx", - "http://127.0.0.1:" + server.getAddress().getPort() + "/demo.docx", - "" + "note.txt", + "http://127.0.0.1:" + server.getAddress().getPort() + "/note.txt", + "text/plain" )); Assert.assertEquals("remote text", content);