feat: 支持工作流文档节点解析 Office 新格式

- DocNode 对 pdf/docx/pptx/xlsx 统一走桥接同步解析

- 修复 DOCX 误回退到 PDF 解析服务的问题并补齐回归测试
This commit is contained in:
2026-04-18 19:52:45 +08:00
parent 4130381658
commit 8546d927bc
6 changed files with 211 additions and 32 deletions

View File

@@ -222,7 +222,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
case PDF:
return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF");
case DOCX:
return requireSpecificService(defaultDocumentParseService, pdfDocumentParseService, "DOCX");
return requireSpecificService(defaultDocumentParseService, null, "DOCX");
case PPTX:
return requireSpecificService(pptxDocumentParseService, null, "PPTX");
case XLSX:

View File

@@ -5,9 +5,24 @@ import tech.easyflow.ai.utils.DocUtil;
import java.io.InputStream;
/**
* 默认文档读取服务。
*
* <p>仅在统一文档解析桥接不支持当前文件类型时,作为工作流文档解析节点的回退读取器。</p>
*
* @author Codex
* @since 2026-04-18
*/
@Component("defaultReader")
public class DefaultReadService implements ReadDocService {
/**
* 读取默认支持的文档内容。
*
* @param fileName 文件名
* @param is 文件输入流
* @return 读取出的文本内容
*/
@Override
public String read(String fileName, InputStream is) {
String suffix = DocUtil.getSuffix(fileName);

View File

@@ -13,7 +13,7 @@ import java.util.Map;
/**
* 工作流文件内容提取节点。
*
* <p>节点输入为统一文件对象,PDF 交给统一文档解析桥接服务,
* <p>节点输入为统一文件对象,桥接支持的文档类型优先交给统一文档解析服务,
* 其他类型继续走默认文档读取器。</p>
*
* @author Codex

View File

@@ -7,7 +7,7 @@ import tech.easyflow.ai.document.model.DocumentParseScenario;
import tech.easyflow.ai.document.model.DocumentParsedResult;
import tech.easyflow.ai.document.model.DocumentSourceRef;
import tech.easyflow.ai.document.service.DocumentParseBridgeService;
import tech.easyflow.ai.utils.DocUtil;
import tech.easyflow.ai.document.support.DocumentParseSourceType;
import tech.easyflow.common.filestorage.FileStorageService;
import tech.easyflow.common.util.StringUtil;
import tech.easyflow.common.web.exceptions.BusinessException;
@@ -19,8 +19,8 @@ import java.util.Map;
/**
* {@link DocNode} 文件内容提取器。
*
* <p>负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型选择
* 统一文档解析桥接服务默认读取器。</p>
* <p>负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型优先选择
* 统一文档解析桥接服务;仅在桥接不支持时回退到默认读取器。</p>
*
* @author Codex
* @since 2026-04-14
@@ -56,8 +56,8 @@ public class DocNodeFileContentExtractor {
public String extract(Object fileValue) {
DocumentSourceRef sourceRef = toDocumentSourceRef(fileValue);
validateSourceRef(sourceRef);
if (isPdf(sourceRef)) {
return extractPdfContent(sourceRef);
if (shouldUseDocumentBridge(sourceRef)) {
return extractBridgeContent(sourceRef);
}
return extractDefaultContent(sourceRef);
}
@@ -96,19 +96,24 @@ public class DocNodeFileContentExtractor {
}
}
private boolean isPdf(DocumentSourceRef sourceRef) {
if (StringUtil.hasText(sourceRef.getContentType())
&& sourceRef.getContentType().toLowerCase().contains("pdf")) {
return true;
}
String fileName = sourceRef.getFileName();
if (!StringUtil.hasText(fileName) || !fileName.contains(".")) {
return false;
}
return "pdf".equals(DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName)));
/**
 * Decides whether the given source should be handed to the unified document parse bridge.
 *
 * <p>The source type is resolved from the file name and the content type; every result
 * other than {@code DocumentParseSourceType.UNSUPPORTED} is routed through the bridge.</p>
 *
 * @param sourceRef document source
 * @return {@code true} when the resolved type is anything but {@code UNSUPPORTED}
 */
private boolean shouldUseDocumentBridge(DocumentSourceRef sourceRef) {
    String fileName = sourceRef.getFileName();
    String contentType = sourceRef.getContentType();
    boolean unsupported =
            DocumentParseSourceType.resolve(fileName, contentType) == DocumentParseSourceType.UNSUPPORTED;
    return !unsupported;
}
private String extractPdfContent(DocumentSourceRef sourceRef) {
/**
* 通过统一文档解析桥接提取主文本结果。
*
* @param sourceRef 文档源
* @return 桥接提取出的主文本
*/
private String extractBridgeContent(DocumentSourceRef sourceRef) {
DocumentParsedResult parsedResult = documentParseBridgeService.parse(sourceRef, DocumentParseScenario.WORKFLOW_TEXT);
String preferredText = parsedResult == null ? null : parsedResult.getPreferredText();
if (StringUtil.hasText(preferredText)) {
@@ -120,7 +125,7 @@ public class DocNodeFileContentExtractor {
if (parsedResult != null && StringUtil.hasText(parsedResult.getPlainText())) {
return parsedResult.getPlainText();
}
throw new BusinessException("PDF 文档解析结果为空");
throw new BusinessException("文档解析结果为空");
}
private String extractDefaultContent(DocumentSourceRef sourceRef) {

View File

@@ -101,7 +101,7 @@ public class DocumentParseBridgeServiceImplTest {
@Test
public void shouldRoutePptxToDedicatedService() {
FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService();
FakePdfDocumentParseService defaultService = new FakePdfDocumentParseService();
FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService);
DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx",
@@ -112,6 +112,61 @@ public class DocumentParseBridgeServiceImplTest {
Assert.assertEquals(0, defaultService.parseCallCount);
}
/**
 * Verifies that a DOCX source is routed to the default bridge service
 * and never reaches the PDF service.
 */
@Test
public void shouldRouteDocxToDefaultBridgeService() {
    FakePdfDocumentParseService fakePdf = new FakePdfDocumentParseService();
    FakeDefaultDocumentParseService fakeDefault = new FakeDefaultDocumentParseService();
    // Only the PDF and default services are wired; the PPTX/XLSX slots stay empty.
    DocumentParseBridgeServiceImpl bridge = buildBridgeService(fakePdf, null, null, fakeDefault);

    DocumentParsedResult parsed = bridge.parse(
            buildSource("demo.docx",
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
            DocumentParseScenario.WORKFLOW_TEXT);

    Assert.assertEquals("# docx", parsed.getPreferredText());
    Assert.assertEquals(1, fakeDefault.parseCallCount);
    Assert.assertEquals(0, fakePdf.parseCallCount);
}
/**
 * Verifies that DOCX parsing fails explicitly when no default bridge service is wired,
 * instead of falling back to the PDF service.
 */
@Test
public void shouldThrowWhenDocxBridgeServiceDisabled() {
    FakePdfDocumentParseService fakePdf = new FakePdfDocumentParseService();
    // Default service slot is left null on purpose: DOCX must not fall back to PDF.
    DocumentParseBridgeServiceImpl bridge = buildBridgeService(fakePdf, null, null, null);

    try {
        bridge.parse(
                buildSource("demo.docx",
                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
                DocumentParseScenario.WORKFLOW_TEXT);
        Assert.fail("expected DocumentParseBridgeException");
    } catch (DocumentParseBridgeException bridgeFailure) {
        Assert.assertEquals("service_not_enabled", bridgeFailure.getCode());
        Assert.assertTrue(bridgeFailure.getMessage().contains("DOCX"));
    }

    Assert.assertEquals(0, fakePdf.parseCallCount);
}
/**
 * Verifies that XLSX parsing fails explicitly when no dedicated XLSX bridge service is wired,
 * and that the default service is not invoked as a substitute.
 */
@Test
public void shouldThrowWhenXlsxBridgeServiceDisabled() {
    FakeDefaultDocumentParseService fakeDefault = new FakeDefaultDocumentParseService();
    // Only the default service is wired; the XLSX slot stays null.
    DocumentParseBridgeServiceImpl bridge = buildBridgeService(null, null, null, fakeDefault);

    try {
        bridge.parse(
                buildSource("table.xlsx",
                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
                DocumentParseScenario.WORKFLOW_TEXT);
        Assert.fail("expected DocumentParseBridgeException");
    } catch (DocumentParseBridgeException bridgeFailure) {
        Assert.assertEquals("service_not_enabled", bridgeFailure.getCode());
        Assert.assertTrue(bridgeFailure.getMessage().contains("XLSX"));
    }

    Assert.assertEquals(0, fakeDefault.parseCallCount);
}
private DocumentParseBridgeServiceImpl buildBridgeService(PdfDocumentParseService pdfDocumentParseService,
PptxDocumentParseService pptxDocumentParseService,
XlsxDocumentParseService xlsxDocumentParseService,
@@ -217,6 +272,45 @@ public class DocumentParseBridgeServiceImplTest {
}
}
/**
 * Test double for the default document parse service.
 *
 * <p>{@link #parse} counts its invocations, remembers the most recent request and returns
 * a fixed single-result response. All task-oriented operations are unsupported.</p>
 */
private static class FakeDefaultDocumentParseService implements DocumentParseService<ParseRequest> {
    /** Most recent request passed to {@link #parse}. */
    private ParseRequest lastParseRequest;
    /** Number of times {@link #parse} has been called. */
    private int parseCallCount;

    @Override
    public ParseResponse parse(ParseRequest request) {
        // Record the call before building the canned answer.
        lastParseRequest = request;
        parseCallCount += 1;

        ParseResult cannedResult = new ParseResult();
        cannedResult.setFileName("demo.docx");
        cannedResult.setMarkdown("# docx");
        cannedResult.setPlainText("docx");

        ParseResponse cannedResponse = new ParseResponse();
        cannedResponse.setResults(Collections.singletonList(cannedResult));
        return cannedResponse;
    }

    @Override
    public ParseTaskStatus submit(ParseRequest request) {
        throw new UnsupportedOperationException();
    }

    @Override
    public ParseTaskStatus queryTask(String taskId) {
        throw new UnsupportedOperationException();
    }

    @Override
    public ParseResponse queryResult(String taskId) {
        throw new UnsupportedOperationException();
    }

    @Override
    public ParseTaskInfo queryTaskInfo(String taskId) {
        throw new UnsupportedOperationException();
    }
}
private static class FakePptxDocumentParseService implements PptxDocumentParseService {
private int parseCallCount;

View File

@@ -48,10 +48,75 @@ public class DocNodeFileContentExtractorTest {
}
/**
* 验证非 PDF 文件会继续走默认读取器
* 验证 DOCX 文件会走统一文档解析桥接服务
*/
@Test
public void shouldUseDefaultReaderForNonPdf() {
public void shouldUseDocumentBridgeForDocx() {
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
bridgeService,
new FakeFileStorageService(),
new FakeReaderManager("ignored")
);
String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
Assert.assertEquals("# parsed", content);
Assert.assertNotNull(bridgeService.lastSource);
Assert.assertEquals("demo.docx", bridgeService.lastSource.getFileName());
}
/**
 * Verifies that a PPTX file is handed to the unified document parse bridge service.
 */
@Test
public void shouldUseDocumentBridgeForPptx() {
    RecordingDocumentParseBridgeService recordingBridge = new RecordingDocumentParseBridgeService();
    FakeFileStorageService storage = new FakeFileStorageService();
    FakeReaderManager readerManager = new FakeReaderManager("ignored");
    DocNodeFileContentExtractor extractor =
            new DocNodeFileContentExtractor(recordingBridge, storage, readerManager);

    Object pptxFile = buildFileValue(
            "slides.pptx",
            "/files/slides.pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation");
    String extracted = extractor.extract(pptxFile);

    Assert.assertEquals("# parsed", extracted);
    // The bridge must have seen the source, identified by its file name.
    Assert.assertNotNull(recordingBridge.lastSource);
    Assert.assertEquals("slides.pptx", recordingBridge.lastSource.getFileName());
}
/**
 * Verifies that an XLSX file is handed to the unified document parse bridge service.
 */
@Test
public void shouldUseDocumentBridgeForXlsx() {
    RecordingDocumentParseBridgeService recordingBridge = new RecordingDocumentParseBridgeService();
    FakeFileStorageService storage = new FakeFileStorageService();
    FakeReaderManager readerManager = new FakeReaderManager("ignored");
    DocNodeFileContentExtractor extractor =
            new DocNodeFileContentExtractor(recordingBridge, storage, readerManager);

    Object xlsxFile = buildFileValue(
            "table.xlsx",
            "/files/table.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    String extracted = extractor.extract(xlsxFile);

    Assert.assertEquals("# parsed", extracted);
    // The bridge must have seen the source, identified by its file name.
    Assert.assertNotNull(recordingBridge.lastSource);
    Assert.assertEquals("table.xlsx", recordingBridge.lastSource.getFileName());
}
/**
* 验证非桥接类型文件会继续走默认读取器。
*/
@Test
public void shouldUseDefaultReaderForUnsupportedType() {
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
bridgeService,
@@ -59,7 +124,7 @@ public class DocNodeFileContentExtractorTest {
new FakeReaderManager("plain text")
);
String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
String content = extractor.extract(buildFileValue("note.txt", "/files/note.txt", "text/plain"));
Assert.assertEquals("plain text", content);
Assert.assertNull(bridgeService.lastSource);
@@ -85,10 +150,10 @@ public class DocNodeFileContentExtractorTest {
}
/**
* 验证解析结果为空时不会回退旧 PDF 读取链路。
* 验证桥接解析结果为空时不会回退旧读取链路。
*/
@Test
public void shouldFailWhenPdfParseResultIsEmpty() {
public void shouldFailWhenBridgeParseResultIsEmpty() {
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
bridgeService.response.setPreferredText(null);
bridgeService.response.setMarkdown(null);
@@ -103,15 +168,15 @@ public class DocNodeFileContentExtractorTest {
extractor.extract(buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf"));
Assert.fail("expected BusinessException");
} catch (BusinessException e) {
Assert.assertEquals("PDF 文档解析结果为空", e.getMessage());
Assert.assertEquals("文档解析结果为空", e.getMessage());
}
}
/**
* 验证远端素材 URL 的非 PDF 文件不会误走本地存储读取。
* 验证远端素材 URL 的非桥接文件不会误走本地存储读取。
*/
@Test
public void shouldReadRemoteUrlForNonPdf() {
public void shouldReadRemoteUrlForUnsupportedType() {
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
HttpServer server;
try {
@@ -120,7 +185,7 @@ public class DocNodeFileContentExtractorTest {
throw new RuntimeException(e);
}
byte[] body = "remote text".getBytes(StandardCharsets.UTF_8);
server.createContext("/demo.docx", exchange -> {
server.createContext("/note.txt", exchange -> {
exchange.sendResponseHeaders(200, body.length);
exchange.getResponseBody().write(body);
exchange.close();
@@ -134,9 +199,9 @@ public class DocNodeFileContentExtractorTest {
);
String content = extractor.extract(buildFileValue(
"demo.docx",
"http://127.0.0.1:" + server.getAddress().getPort() + "/demo.docx",
""
"note.txt",
"http://127.0.0.1:" + server.getAddress().getPort() + "/note.txt",
"text/plain"
));
Assert.assertEquals("remote text", content);