diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java index 1f63a32..d0dc9f0 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java @@ -222,7 +222,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic case PDF: return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF"); case DOCX: - return requireSpecificService(defaultDocumentParseService, pdfDocumentParseService, "DOCX"); + return requireSpecificService(defaultDocumentParseService, null, "DOCX"); case PPTX: return requireSpecificService(pptxDocumentParseService, null, "PPTX"); case XLSX: diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java index 726baa7..91d6965 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DefaultReadService.java @@ -5,9 +5,24 @@ import tech.easyflow.ai.utils.DocUtil; import java.io.InputStream; +/** + * 默认文档读取服务。 + * + *
<p>仅在统一文档解析桥接不支持当前文件类型时,作为工作流文档解析节点的回退读取器。</p>
+ * + * @author Codex + * @since 2026-04-18 + */ @Component("defaultReader") public class DefaultReadService implements ReadDocService { + /** + * 读取默认支持的文档内容。 + * + * @param fileName 文件名 + * @param is 文件输入流 + * @return 读取出的文本内容 + */ @Override public String read(String fileName, InputStream is) { String suffix = DocUtil.getSuffix(fileName); diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java index 8ed4403..62f5bd1 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNode.java @@ -13,7 +13,7 @@ import java.util.Map; /** * 工作流文件内容提取节点。 * - *节点输入为统一文件对象,PDF 交给统一文档解析桥接服务, + *
<p>节点输入为统一文件对象,桥接支持的文档类型优先交给统一文档解析服务, * 其他类型继续走默认文档读取器。</p>
* * @author Codex diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java index b38ee88..53de2e6 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/node/DocNodeFileContentExtractor.java @@ -7,7 +7,7 @@ import tech.easyflow.ai.document.model.DocumentParseScenario; import tech.easyflow.ai.document.model.DocumentParsedResult; import tech.easyflow.ai.document.model.DocumentSourceRef; import tech.easyflow.ai.document.service.DocumentParseBridgeService; -import tech.easyflow.ai.utils.DocUtil; +import tech.easyflow.ai.document.support.DocumentParseSourceType; import tech.easyflow.common.filestorage.FileStorageService; import tech.easyflow.common.util.StringUtil; import tech.easyflow.common.web.exceptions.BusinessException; @@ -19,8 +19,8 @@ import java.util.Map; /** * {@link DocNode} 文件内容提取器。 * - *负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型选择 - * 统一文档解析桥接服务或默认读取器。
+ * <p>负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型优先选择 + * 统一文档解析桥接服务;仅在桥接不支持时回退到默认读取器。</p>
* * @author Codex * @since 2026-04-14 @@ -56,8 +56,8 @@ public class DocNodeFileContentExtractor { public String extract(Object fileValue) { DocumentSourceRef sourceRef = toDocumentSourceRef(fileValue); validateSourceRef(sourceRef); - if (isPdf(sourceRef)) { - return extractPdfContent(sourceRef); + if (shouldUseDocumentBridge(sourceRef)) { + return extractBridgeContent(sourceRef); } return extractDefaultContent(sourceRef); } @@ -96,19 +96,24 @@ public class DocNodeFileContentExtractor { } } - private boolean isPdf(DocumentSourceRef sourceRef) { - if (StringUtil.hasText(sourceRef.getContentType()) - && sourceRef.getContentType().toLowerCase().contains("pdf")) { - return true; - } - String fileName = sourceRef.getFileName(); - if (!StringUtil.hasText(fileName) || !fileName.contains(".")) { - return false; - } - return "pdf".equals(DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName))); + /** + * 判断当前文件类型是否应优先走统一文档解析桥接。 + * + * @param sourceRef 文档源 + * @return 是否走桥接 + */ + private boolean shouldUseDocumentBridge(DocumentSourceRef sourceRef) { + return DocumentParseSourceType.resolve(sourceRef.getFileName(), sourceRef.getContentType()) + != DocumentParseSourceType.UNSUPPORTED; } - private String extractPdfContent(DocumentSourceRef sourceRef) { + /** + * 通过统一文档解析桥接提取主文本结果。 + * + * @param sourceRef 文档源 + * @return 桥接提取出的主文本 + */ + private String extractBridgeContent(DocumentSourceRef sourceRef) { DocumentParsedResult parsedResult = documentParseBridgeService.parse(sourceRef, DocumentParseScenario.WORKFLOW_TEXT); String preferredText = parsedResult == null ? 
null : parsedResult.getPreferredText(); if (StringUtil.hasText(preferredText)) { @@ -120,7 +125,7 @@ public class DocNodeFileContentExtractor { if (parsedResult != null && StringUtil.hasText(parsedResult.getPlainText())) { return parsedResult.getPlainText(); } - throw new BusinessException("PDF 文档解析结果为空"); + throw new BusinessException("文档解析结果为空"); } private String extractDefaultContent(DocumentSourceRef sourceRef) { diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java index 1074d4a..548785b 100644 --- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java @@ -101,7 +101,7 @@ public class DocumentParseBridgeServiceImplTest { @Test public void shouldRoutePptxToDedicatedService() { FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService(); - FakePdfDocumentParseService defaultService = new FakePdfDocumentParseService(); + FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService(); DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService); DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx", @@ -112,6 +112,61 @@ public class DocumentParseBridgeServiceImplTest { Assert.assertEquals(0, defaultService.parseCallCount); } + /** + * 验证 DOCX 会路由到默认桥接服务,而不是误走 PDF 服务。 + */ + @Test + public void shouldRouteDocxToDefaultBridgeService() { + FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService(); + FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService(); + DocumentParseBridgeServiceImpl 
bridgeService = buildBridgeService(pdfService, null, null, defaultService); + + DocumentParsedResult result = bridgeService.parse(buildSource("demo.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT); + + Assert.assertEquals("# docx", result.getPreferredText()); + Assert.assertEquals(1, defaultService.parseCallCount); + Assert.assertEquals(0, pdfService.parseCallCount); + } + + /** + * 验证 DOCX 在缺少默认桥接服务时会明确失败,而不是退到 PDF 服务。 + */ + @Test + public void shouldThrowWhenDocxBridgeServiceDisabled() { + FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService(); + DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(pdfService, null, null, null); + + try { + bridgeService.parse(buildSource("demo.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT); + Assert.fail("expected DocumentParseBridgeException"); + } catch (DocumentParseBridgeException e) { + Assert.assertEquals("service_not_enabled", e.getCode()); + Assert.assertTrue(e.getMessage().contains("DOCX")); + } + Assert.assertEquals(0, pdfService.parseCallCount); + } + + /** + * 验证 XLSX 在缺少专用桥接服务时会明确失败。 + */ + @Test + public void shouldThrowWhenXlsxBridgeServiceDisabled() { + FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService(); + DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, null, null, defaultService); + + try { + bridgeService.parse(buildSource("table.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), DocumentParseScenario.WORKFLOW_TEXT); + Assert.fail("expected DocumentParseBridgeException"); + } catch (DocumentParseBridgeException e) { + Assert.assertEquals("service_not_enabled", e.getCode()); + Assert.assertTrue(e.getMessage().contains("XLSX")); + } + Assert.assertEquals(0, defaultService.parseCallCount); + } + private DocumentParseBridgeServiceImpl 
buildBridgeService(PdfDocumentParseService pdfDocumentParseService, PptxDocumentParseService pptxDocumentParseService, XlsxDocumentParseService xlsxDocumentParseService, @@ -217,6 +272,45 @@ public class DocumentParseBridgeServiceImplTest { } } + private static class FakeDefaultDocumentParseService implements DocumentParseService