feat: 支持工作流文档节点解析 Office 新格式

- DocNode 对 pdf/docx/pptx/xlsx 统一走桥接同步解析

- 修复 DOCX 误回退到 PDF 解析服务的问题并补齐回归测试
This commit is contained in:
2026-04-18 19:52:45 +08:00
parent 4130381658
commit 8546d927bc
6 changed files with 211 additions and 32 deletions

View File

@@ -222,7 +222,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
case PDF: case PDF:
return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF"); return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF");
case DOCX: case DOCX:
return requireSpecificService(defaultDocumentParseService, pdfDocumentParseService, "DOCX"); return requireSpecificService(defaultDocumentParseService, null, "DOCX");
case PPTX: case PPTX:
return requireSpecificService(pptxDocumentParseService, null, "PPTX"); return requireSpecificService(pptxDocumentParseService, null, "PPTX");
case XLSX: case XLSX:

View File

@@ -5,9 +5,24 @@ import tech.easyflow.ai.utils.DocUtil;
import java.io.InputStream; import java.io.InputStream;
/**
* 默认文档读取服务。
*
* <p>仅在统一文档解析桥接不支持当前文件类型时,作为工作流文档解析节点的回退读取器。</p>
*
* @author Codex
* @since 2026-04-18
*/
@Component("defaultReader") @Component("defaultReader")
public class DefaultReadService implements ReadDocService { public class DefaultReadService implements ReadDocService {
/**
* 读取默认支持的文档内容。
*
* @param fileName 文件名
* @param is 文件输入流
* @return 读取出的文本内容
*/
@Override @Override
public String read(String fileName, InputStream is) { public String read(String fileName, InputStream is) {
String suffix = DocUtil.getSuffix(fileName); String suffix = DocUtil.getSuffix(fileName);

View File

@@ -13,7 +13,7 @@ import java.util.Map;
/** /**
* 工作流文件内容提取节点。 * 工作流文件内容提取节点。
* *
* <p>节点输入为统一文件对象,PDF 交给统一文档解析桥接服务, * <p>节点输入为统一文件对象,桥接支持的文档类型优先交给统一文档解析服务,
* 其他类型继续走默认文档读取器。</p> * 其他类型继续走默认文档读取器。</p>
* *
* @author Codex * @author Codex

View File

@@ -7,7 +7,7 @@ import tech.easyflow.ai.document.model.DocumentParseScenario;
import tech.easyflow.ai.document.model.DocumentParsedResult; import tech.easyflow.ai.document.model.DocumentParsedResult;
import tech.easyflow.ai.document.model.DocumentSourceRef; import tech.easyflow.ai.document.model.DocumentSourceRef;
import tech.easyflow.ai.document.service.DocumentParseBridgeService; import tech.easyflow.ai.document.service.DocumentParseBridgeService;
import tech.easyflow.ai.utils.DocUtil; import tech.easyflow.ai.document.support.DocumentParseSourceType;
import tech.easyflow.common.filestorage.FileStorageService; import tech.easyflow.common.filestorage.FileStorageService;
import tech.easyflow.common.util.StringUtil; import tech.easyflow.common.util.StringUtil;
import tech.easyflow.common.web.exceptions.BusinessException; import tech.easyflow.common.web.exceptions.BusinessException;
@@ -19,8 +19,8 @@ import java.util.Map;
/** /**
* {@link DocNode} 文件内容提取器。 * {@link DocNode} 文件内容提取器。
* *
* <p>负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型选择 * <p>负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型优先选择
* 统一文档解析桥接服务默认读取器。</p> * 统一文档解析桥接服务;仅在桥接不支持时回退到默认读取器。</p>
* *
* @author Codex * @author Codex
* @since 2026-04-14 * @since 2026-04-14
@@ -56,8 +56,8 @@ public class DocNodeFileContentExtractor {
public String extract(Object fileValue) { public String extract(Object fileValue) {
DocumentSourceRef sourceRef = toDocumentSourceRef(fileValue); DocumentSourceRef sourceRef = toDocumentSourceRef(fileValue);
validateSourceRef(sourceRef); validateSourceRef(sourceRef);
if (isPdf(sourceRef)) { if (shouldUseDocumentBridge(sourceRef)) {
return extractPdfContent(sourceRef); return extractBridgeContent(sourceRef);
} }
return extractDefaultContent(sourceRef); return extractDefaultContent(sourceRef);
} }
@@ -96,19 +96,24 @@ public class DocNodeFileContentExtractor {
} }
} }
private boolean isPdf(DocumentSourceRef sourceRef) { /**
if (StringUtil.hasText(sourceRef.getContentType()) * 判断当前文件类型是否应优先走统一文档解析桥接。
&& sourceRef.getContentType().toLowerCase().contains("pdf")) { *
return true; * @param sourceRef 文档源
} * @return 是否走桥接
String fileName = sourceRef.getFileName(); */
if (!StringUtil.hasText(fileName) || !fileName.contains(".")) { private boolean shouldUseDocumentBridge(DocumentSourceRef sourceRef) {
return false; return DocumentParseSourceType.resolve(sourceRef.getFileName(), sourceRef.getContentType())
} != DocumentParseSourceType.UNSUPPORTED;
return "pdf".equals(DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName)));
} }
private String extractPdfContent(DocumentSourceRef sourceRef) { /**
* 通过统一文档解析桥接提取主文本结果。
*
* @param sourceRef 文档源
* @return 桥接提取出的主文本
*/
private String extractBridgeContent(DocumentSourceRef sourceRef) {
DocumentParsedResult parsedResult = documentParseBridgeService.parse(sourceRef, DocumentParseScenario.WORKFLOW_TEXT); DocumentParsedResult parsedResult = documentParseBridgeService.parse(sourceRef, DocumentParseScenario.WORKFLOW_TEXT);
String preferredText = parsedResult == null ? null : parsedResult.getPreferredText(); String preferredText = parsedResult == null ? null : parsedResult.getPreferredText();
if (StringUtil.hasText(preferredText)) { if (StringUtil.hasText(preferredText)) {
@@ -120,7 +125,7 @@ public class DocNodeFileContentExtractor {
if (parsedResult != null && StringUtil.hasText(parsedResult.getPlainText())) { if (parsedResult != null && StringUtil.hasText(parsedResult.getPlainText())) {
return parsedResult.getPlainText(); return parsedResult.getPlainText();
} }
throw new BusinessException("PDF 文档解析结果为空"); throw new BusinessException("文档解析结果为空");
} }
private String extractDefaultContent(DocumentSourceRef sourceRef) { private String extractDefaultContent(DocumentSourceRef sourceRef) {

View File

@@ -101,7 +101,7 @@ public class DocumentParseBridgeServiceImplTest {
@Test @Test
public void shouldRoutePptxToDedicatedService() { public void shouldRoutePptxToDedicatedService() {
FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService(); FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService();
FakePdfDocumentParseService defaultService = new FakePdfDocumentParseService(); FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService); DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService);
DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx", DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx",
@@ -112,6 +112,61 @@ public class DocumentParseBridgeServiceImplTest {
Assert.assertEquals(0, defaultService.parseCallCount); Assert.assertEquals(0, defaultService.parseCallCount);
} }
/**
 * Verifies that a DOCX source is dispatched to the default bridge parse service
 * and never reaches the PDF parse service.
 */
@Test
public void shouldRouteDocxToDefaultBridgeService() {
    final String docxMimeType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    FakePdfDocumentParseService pdfFake = new FakePdfDocumentParseService();
    FakeDefaultDocumentParseService defaultFake = new FakeDefaultDocumentParseService();
    // Only the PDF and default services are wired; the two middle slots stay null.
    DocumentParseBridgeServiceImpl bridge = buildBridgeService(pdfFake, null, null, defaultFake);

    DocumentParsedResult parsed = bridge.parse(
            buildSource("demo.docx", docxMimeType),
            DocumentParseScenario.WORKFLOW_TEXT);

    Assert.assertEquals("# docx", parsed.getPreferredText());
    // Exactly one call on the default fake, none on the PDF fake.
    Assert.assertEquals(1, defaultFake.parseCallCount);
    Assert.assertEquals(0, pdfFake.parseCallCount);
}
/**
 * Verifies that parsing a DOCX source fails with an explicit
 * {@code service_not_enabled} error when no default bridge service is wired,
 * and that the PDF service is not invoked instead.
 */
@Test
public void shouldThrowWhenDocxBridgeServiceDisabled() {
    final String docxMimeType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    FakePdfDocumentParseService pdfFake = new FakePdfDocumentParseService();
    // Only the PDF service is present; the default service slot is deliberately null.
    DocumentParseBridgeServiceImpl bridge = buildBridgeService(pdfFake, null, null, null);

    try {
        bridge.parse(buildSource("demo.docx", docxMimeType), DocumentParseScenario.WORKFLOW_TEXT);
        Assert.fail("expected DocumentParseBridgeException");
    } catch (DocumentParseBridgeException expected) {
        Assert.assertEquals("service_not_enabled", expected.getCode());
        Assert.assertTrue(expected.getMessage().contains("DOCX"));
    }

    // The PDF fake must stay untouched.
    Assert.assertEquals(0, pdfFake.parseCallCount);
}
/**
 * Verifies that parsing an XLSX source fails with an explicit
 * {@code service_not_enabled} error when no dedicated XLSX bridge service is wired,
 * and that the default service is not invoked instead.
 */
@Test
public void shouldThrowWhenXlsxBridgeServiceDisabled() {
    final String xlsxMimeType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    FakeDefaultDocumentParseService defaultFake = new FakeDefaultDocumentParseService();
    // Only the default service is present; the XLSX service slot is deliberately null.
    DocumentParseBridgeServiceImpl bridge = buildBridgeService(null, null, null, defaultFake);

    try {
        bridge.parse(buildSource("table.xlsx", xlsxMimeType), DocumentParseScenario.WORKFLOW_TEXT);
        Assert.fail("expected DocumentParseBridgeException");
    } catch (DocumentParseBridgeException expected) {
        Assert.assertEquals("service_not_enabled", expected.getCode());
        Assert.assertTrue(expected.getMessage().contains("XLSX"));
    }

    // The default fake must stay untouched.
    Assert.assertEquals(0, defaultFake.parseCallCount);
}
private DocumentParseBridgeServiceImpl buildBridgeService(PdfDocumentParseService pdfDocumentParseService, private DocumentParseBridgeServiceImpl buildBridgeService(PdfDocumentParseService pdfDocumentParseService,
PptxDocumentParseService pptxDocumentParseService, PptxDocumentParseService pptxDocumentParseService,
XlsxDocumentParseService xlsxDocumentParseService, XlsxDocumentParseService xlsxDocumentParseService,
@@ -217,6 +272,45 @@ public class DocumentParseBridgeServiceImplTest {
} }
} }
/**
 * Test double for the default document parse service.
 *
 * <p>Counts synchronous {@link #parse} calls, remembers the most recent request,
 * and always answers with one canned DOCX result. The task-based operations are
 * not supported and throw.</p>
 */
private static class FakeDefaultDocumentParseService implements DocumentParseService<ParseRequest> {
    private ParseRequest lastParseRequest;
    private int parseCallCount;

    /**
     * Records the call and returns a response holding a single fixed result.
     *
     * @param request the incoming parse request; stored as the last seen request
     * @return a response whose only result is the canned "demo.docx" entry
     */
    @Override
    public ParseResponse parse(ParseRequest request) {
        lastParseRequest = request;
        parseCallCount += 1;

        ParseResult cannedResult = new ParseResult();
        cannedResult.setPlainText("docx");
        cannedResult.setMarkdown("# docx");
        cannedResult.setFileName("demo.docx");

        ParseResponse cannedResponse = new ParseResponse();
        cannedResponse.setResults(Collections.singletonList(cannedResult));
        return cannedResponse;
    }

    /** Not supported by this fake. */
    @Override
    public ParseTaskStatus submit(ParseRequest request) {
        throw new UnsupportedOperationException();
    }

    /** Not supported by this fake. */
    @Override
    public ParseTaskStatus queryTask(String taskId) {
        throw new UnsupportedOperationException();
    }

    /** Not supported by this fake. */
    @Override
    public ParseResponse queryResult(String taskId) {
        throw new UnsupportedOperationException();
    }

    /** Not supported by this fake. */
    @Override
    public ParseTaskInfo queryTaskInfo(String taskId) {
        throw new UnsupportedOperationException();
    }
}
private static class FakePptxDocumentParseService implements PptxDocumentParseService { private static class FakePptxDocumentParseService implements PptxDocumentParseService {
private int parseCallCount; private int parseCallCount;

View File

@@ -48,10 +48,75 @@ public class DocNodeFileContentExtractorTest {
} }
/** /**
* 验证非 PDF 文件会继续走默认读取器 * 验证 DOCX 文件会走统一文档解析桥接服务
*/ */
@Test @Test
public void shouldUseDefaultReaderForNonPdf() { public void shouldUseDocumentBridgeForDocx() {
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
bridgeService,
new FakeFileStorageService(),
new FakeReaderManager("ignored")
);
String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
Assert.assertEquals("# parsed", content);
Assert.assertNotNull(bridgeService.lastSource);
Assert.assertEquals("demo.docx", bridgeService.lastSource.getFileName());
}
/**
 * Verifies that a PPTX file is extracted through the unified document parse bridge.
 */
@Test
public void shouldUseDocumentBridgeForPptx() {
    final String pptxMimeType = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
    RecordingDocumentParseBridgeService recordingBridge = new RecordingDocumentParseBridgeService();
    DocNodeFileContentExtractor contentExtractor = new DocNodeFileContentExtractor(
            recordingBridge,
            new FakeFileStorageService(),
            new FakeReaderManager("ignored"));
    Object fileValue = buildFileValue("slides.pptx", "/files/slides.pptx", pptxMimeType);

    String extracted = contentExtractor.extract(fileValue);

    Assert.assertEquals("# parsed", extracted);
    // The bridge must have seen the source, carrying the original file name.
    Assert.assertNotNull(recordingBridge.lastSource);
    Assert.assertEquals("slides.pptx", recordingBridge.lastSource.getFileName());
}
/**
 * Verifies that an XLSX file is extracted through the unified document parse bridge.
 */
@Test
public void shouldUseDocumentBridgeForXlsx() {
    final String xlsxMimeType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    RecordingDocumentParseBridgeService recordingBridge = new RecordingDocumentParseBridgeService();
    DocNodeFileContentExtractor contentExtractor = new DocNodeFileContentExtractor(
            recordingBridge,
            new FakeFileStorageService(),
            new FakeReaderManager("ignored"));
    Object fileValue = buildFileValue("table.xlsx", "/files/table.xlsx", xlsxMimeType);

    String extracted = contentExtractor.extract(fileValue);

    Assert.assertEquals("# parsed", extracted);
    // The bridge must have seen the source, carrying the original file name.
    Assert.assertNotNull(recordingBridge.lastSource);
    Assert.assertEquals("table.xlsx", recordingBridge.lastSource.getFileName());
}
/**
* 验证非桥接类型文件会继续走默认读取器。
*/
@Test
public void shouldUseDefaultReaderForUnsupportedType() {
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor( DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
bridgeService, bridgeService,
@@ -59,7 +124,7 @@ public class DocNodeFileContentExtractorTest {
new FakeReaderManager("plain text") new FakeReaderManager("plain text")
); );
String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")); String content = extractor.extract(buildFileValue("note.txt", "/files/note.txt", "text/plain"));
Assert.assertEquals("plain text", content); Assert.assertEquals("plain text", content);
Assert.assertNull(bridgeService.lastSource); Assert.assertNull(bridgeService.lastSource);
@@ -85,10 +150,10 @@ public class DocNodeFileContentExtractorTest {
} }
/** /**
* 验证解析结果为空时不会回退旧 PDF 读取链路。 * 验证桥接解析结果为空时不会回退旧读取链路。
*/ */
@Test @Test
public void shouldFailWhenPdfParseResultIsEmpty() { public void shouldFailWhenBridgeParseResultIsEmpty() {
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
bridgeService.response.setPreferredText(null); bridgeService.response.setPreferredText(null);
bridgeService.response.setMarkdown(null); bridgeService.response.setMarkdown(null);
@@ -103,15 +168,15 @@ public class DocNodeFileContentExtractorTest {
extractor.extract(buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf")); extractor.extract(buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf"));
Assert.fail("expected BusinessException"); Assert.fail("expected BusinessException");
} catch (BusinessException e) { } catch (BusinessException e) {
Assert.assertEquals("PDF 文档解析结果为空", e.getMessage()); Assert.assertEquals("文档解析结果为空", e.getMessage());
} }
} }
/** /**
* 验证远端素材 URL 的非 PDF 文件不会误走本地存储读取。 * 验证远端素材 URL 的非桥接文件不会误走本地存储读取。
*/ */
@Test @Test
public void shouldReadRemoteUrlForNonPdf() { public void shouldReadRemoteUrlForUnsupportedType() {
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService(); RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
HttpServer server; HttpServer server;
try { try {
@@ -120,7 +185,7 @@ public class DocNodeFileContentExtractorTest {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
byte[] body = "remote text".getBytes(StandardCharsets.UTF_8); byte[] body = "remote text".getBytes(StandardCharsets.UTF_8);
server.createContext("/demo.docx", exchange -> { server.createContext("/note.txt", exchange -> {
exchange.sendResponseHeaders(200, body.length); exchange.sendResponseHeaders(200, body.length);
exchange.getResponseBody().write(body); exchange.getResponseBody().write(body);
exchange.close(); exchange.close();
@@ -134,9 +199,9 @@ public class DocNodeFileContentExtractorTest {
); );
String content = extractor.extract(buildFileValue( String content = extractor.extract(buildFileValue(
"demo.docx", "note.txt",
"http://127.0.0.1:" + server.getAddress().getPort() + "/demo.docx", "http://127.0.0.1:" + server.getAddress().getPort() + "/note.txt",
"" "text/plain"
)); ));
Assert.assertEquals("remote text", content); Assert.assertEquals("remote text", content);