feat: 支持工作流文档节点解析 Office 新格式
- DocNode 对 pdf/docx/pptx/xlsx 统一走桥接同步解析
- 修复 DOCX 误回退到 PDF 解析服务的问题并补齐回归测试
This commit is contained in:
@@ -222,7 +222,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
|
||||
case PDF:
|
||||
return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF");
|
||||
case DOCX:
|
||||
return requireSpecificService(defaultDocumentParseService, pdfDocumentParseService, "DOCX");
|
||||
return requireSpecificService(defaultDocumentParseService, null, "DOCX");
|
||||
case PPTX:
|
||||
return requireSpecificService(pptxDocumentParseService, null, "PPTX");
|
||||
case XLSX:
|
||||
|
||||
@@ -5,9 +5,24 @@ import tech.easyflow.ai.utils.DocUtil;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* 默认文档读取服务。
|
||||
*
|
||||
* <p>仅在统一文档解析桥接不支持当前文件类型时,作为工作流文档解析节点的回退读取器。</p>
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-18
|
||||
*/
|
||||
@Component("defaultReader")
|
||||
public class DefaultReadService implements ReadDocService {
|
||||
|
||||
/**
|
||||
* 读取默认支持的文档内容。
|
||||
*
|
||||
* @param fileName 文件名
|
||||
* @param is 文件输入流
|
||||
* @return 读取出的文本内容
|
||||
*/
|
||||
@Override
|
||||
public String read(String fileName, InputStream is) {
|
||||
String suffix = DocUtil.getSuffix(fileName);
|
||||
|
||||
@@ -13,7 +13,7 @@ import java.util.Map;
|
||||
/**
|
||||
* 工作流文件内容提取节点。
|
||||
*
|
||||
* <p>节点输入为统一文件对象,PDF 交给统一文档解析桥接服务,
|
||||
* <p>节点输入为统一文件对象,桥接支持的文档类型优先交给统一文档解析服务,
|
||||
* 其他类型继续走默认文档读取器。</p>
|
||||
*
|
||||
* @author Codex
|
||||
|
||||
@@ -7,7 +7,7 @@ import tech.easyflow.ai.document.model.DocumentParseScenario;
|
||||
import tech.easyflow.ai.document.model.DocumentParsedResult;
|
||||
import tech.easyflow.ai.document.model.DocumentSourceRef;
|
||||
import tech.easyflow.ai.document.service.DocumentParseBridgeService;
|
||||
import tech.easyflow.ai.utils.DocUtil;
|
||||
import tech.easyflow.ai.document.support.DocumentParseSourceType;
|
||||
import tech.easyflow.common.filestorage.FileStorageService;
|
||||
import tech.easyflow.common.util.StringUtil;
|
||||
import tech.easyflow.common.web.exceptions.BusinessException;
|
||||
@@ -19,8 +19,8 @@ import java.util.Map;
|
||||
/**
|
||||
* {@link DocNode} 文件内容提取器。
|
||||
*
|
||||
* <p>负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型选择
|
||||
* 统一文档解析桥接服务或默认读取器。</p>
|
||||
* <p>负责把工作流运行态中的文件对象转换为统一文档源,并根据文件类型优先选择
|
||||
* 统一文档解析桥接服务;仅在桥接不支持时回退到默认读取器。</p>
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-14
|
||||
@@ -56,8 +56,8 @@ public class DocNodeFileContentExtractor {
|
||||
public String extract(Object fileValue) {
|
||||
DocumentSourceRef sourceRef = toDocumentSourceRef(fileValue);
|
||||
validateSourceRef(sourceRef);
|
||||
if (isPdf(sourceRef)) {
|
||||
return extractPdfContent(sourceRef);
|
||||
if (shouldUseDocumentBridge(sourceRef)) {
|
||||
return extractBridgeContent(sourceRef);
|
||||
}
|
||||
return extractDefaultContent(sourceRef);
|
||||
}
|
||||
@@ -96,19 +96,24 @@ public class DocNodeFileContentExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isPdf(DocumentSourceRef sourceRef) {
|
||||
if (StringUtil.hasText(sourceRef.getContentType())
|
||||
&& sourceRef.getContentType().toLowerCase().contains("pdf")) {
|
||||
return true;
|
||||
}
|
||||
String fileName = sourceRef.getFileName();
|
||||
if (!StringUtil.hasText(fileName) || !fileName.contains(".")) {
|
||||
return false;
|
||||
}
|
||||
return "pdf".equals(DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName)));
|
||||
/**
|
||||
* 判断当前文件类型是否应优先走统一文档解析桥接。
|
||||
*
|
||||
* @param sourceRef 文档源
|
||||
* @return 是否走桥接
|
||||
*/
|
||||
private boolean shouldUseDocumentBridge(DocumentSourceRef sourceRef) {
|
||||
return DocumentParseSourceType.resolve(sourceRef.getFileName(), sourceRef.getContentType())
|
||||
!= DocumentParseSourceType.UNSUPPORTED;
|
||||
}
|
||||
|
||||
private String extractPdfContent(DocumentSourceRef sourceRef) {
|
||||
/**
|
||||
* 通过统一文档解析桥接提取主文本结果。
|
||||
*
|
||||
* @param sourceRef 文档源
|
||||
* @return 桥接提取出的主文本
|
||||
*/
|
||||
private String extractBridgeContent(DocumentSourceRef sourceRef) {
|
||||
DocumentParsedResult parsedResult = documentParseBridgeService.parse(sourceRef, DocumentParseScenario.WORKFLOW_TEXT);
|
||||
String preferredText = parsedResult == null ? null : parsedResult.getPreferredText();
|
||||
if (StringUtil.hasText(preferredText)) {
|
||||
@@ -120,7 +125,7 @@ public class DocNodeFileContentExtractor {
|
||||
if (parsedResult != null && StringUtil.hasText(parsedResult.getPlainText())) {
|
||||
return parsedResult.getPlainText();
|
||||
}
|
||||
throw new BusinessException("PDF 文档解析结果为空");
|
||||
throw new BusinessException("文档解析结果为空");
|
||||
}
|
||||
|
||||
private String extractDefaultContent(DocumentSourceRef sourceRef) {
|
||||
|
||||
@@ -101,7 +101,7 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
@Test
|
||||
public void shouldRoutePptxToDedicatedService() {
|
||||
FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService();
|
||||
FakePdfDocumentParseService defaultService = new FakePdfDocumentParseService();
|
||||
FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService);
|
||||
|
||||
DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx",
|
||||
@@ -112,6 +112,61 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
Assert.assertEquals(0, defaultService.parseCallCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证 DOCX 会路由到默认桥接服务,而不是误走 PDF 服务。
|
||||
*/
|
||||
@Test
|
||||
public void shouldRouteDocxToDefaultBridgeService() {
|
||||
FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService();
|
||||
FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(pdfService, null, null, defaultService);
|
||||
|
||||
DocumentParsedResult result = bridgeService.parse(buildSource("demo.docx",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT);
|
||||
|
||||
Assert.assertEquals("# docx", result.getPreferredText());
|
||||
Assert.assertEquals(1, defaultService.parseCallCount);
|
||||
Assert.assertEquals(0, pdfService.parseCallCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证 DOCX 在缺少默认桥接服务时会明确失败,而不是退到 PDF 服务。
|
||||
*/
|
||||
@Test
|
||||
public void shouldThrowWhenDocxBridgeServiceDisabled() {
|
||||
FakePdfDocumentParseService pdfService = new FakePdfDocumentParseService();
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(pdfService, null, null, null);
|
||||
|
||||
try {
|
||||
bridgeService.parse(buildSource("demo.docx",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"), DocumentParseScenario.WORKFLOW_TEXT);
|
||||
Assert.fail("expected DocumentParseBridgeException");
|
||||
} catch (DocumentParseBridgeException e) {
|
||||
Assert.assertEquals("service_not_enabled", e.getCode());
|
||||
Assert.assertTrue(e.getMessage().contains("DOCX"));
|
||||
}
|
||||
Assert.assertEquals(0, pdfService.parseCallCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证 XLSX 在缺少专用桥接服务时会明确失败。
|
||||
*/
|
||||
@Test
|
||||
public void shouldThrowWhenXlsxBridgeServiceDisabled() {
|
||||
FakeDefaultDocumentParseService defaultService = new FakeDefaultDocumentParseService();
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, null, null, defaultService);
|
||||
|
||||
try {
|
||||
bridgeService.parse(buildSource("table.xlsx",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), DocumentParseScenario.WORKFLOW_TEXT);
|
||||
Assert.fail("expected DocumentParseBridgeException");
|
||||
} catch (DocumentParseBridgeException e) {
|
||||
Assert.assertEquals("service_not_enabled", e.getCode());
|
||||
Assert.assertTrue(e.getMessage().contains("XLSX"));
|
||||
}
|
||||
Assert.assertEquals(0, defaultService.parseCallCount);
|
||||
}
|
||||
|
||||
private DocumentParseBridgeServiceImpl buildBridgeService(PdfDocumentParseService pdfDocumentParseService,
|
||||
PptxDocumentParseService pptxDocumentParseService,
|
||||
XlsxDocumentParseService xlsxDocumentParseService,
|
||||
@@ -217,6 +272,45 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
}
|
||||
}
|
||||
|
||||
private static class FakeDefaultDocumentParseService implements DocumentParseService<ParseRequest> {
|
||||
|
||||
private ParseRequest lastParseRequest;
|
||||
private int parseCallCount;
|
||||
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
parseCallCount++;
|
||||
lastParseRequest = request;
|
||||
ParseResult result = new ParseResult();
|
||||
result.setFileName("demo.docx");
|
||||
result.setMarkdown("# docx");
|
||||
result.setPlainText("docx");
|
||||
ParseResponse response = new ParseResponse();
|
||||
response.setResults(Collections.singletonList(result));
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus submit(ParseRequest request) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskInfo queryTaskInfo(String taskId) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
private static class FakePptxDocumentParseService implements PptxDocumentParseService {
|
||||
|
||||
private int parseCallCount;
|
||||
|
||||
@@ -48,10 +48,75 @@ public class DocNodeFileContentExtractorTest {
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证非 PDF 文件会继续走默认读取器。
|
||||
* 验证 DOCX 文件会走统一文档解析桥接服务。
|
||||
*/
|
||||
@Test
|
||||
public void shouldUseDefaultReaderForNonPdf() {
|
||||
public void shouldUseDocumentBridgeForDocx() {
|
||||
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
|
||||
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
|
||||
bridgeService,
|
||||
new FakeFileStorageService(),
|
||||
new FakeReaderManager("ignored")
|
||||
);
|
||||
|
||||
String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
|
||||
|
||||
Assert.assertEquals("# parsed", content);
|
||||
Assert.assertNotNull(bridgeService.lastSource);
|
||||
Assert.assertEquals("demo.docx", bridgeService.lastSource.getFileName());
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证 PPTX 文件会走统一文档解析桥接服务。
|
||||
*/
|
||||
@Test
|
||||
public void shouldUseDocumentBridgeForPptx() {
|
||||
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
|
||||
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
|
||||
bridgeService,
|
||||
new FakeFileStorageService(),
|
||||
new FakeReaderManager("ignored")
|
||||
);
|
||||
|
||||
String content = extractor.extract(buildFileValue(
|
||||
"slides.pptx",
|
||||
"/files/slides.pptx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
));
|
||||
|
||||
Assert.assertEquals("# parsed", content);
|
||||
Assert.assertNotNull(bridgeService.lastSource);
|
||||
Assert.assertEquals("slides.pptx", bridgeService.lastSource.getFileName());
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证 XLSX 文件会走统一文档解析桥接服务。
|
||||
*/
|
||||
@Test
|
||||
public void shouldUseDocumentBridgeForXlsx() {
|
||||
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
|
||||
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
|
||||
bridgeService,
|
||||
new FakeFileStorageService(),
|
||||
new FakeReaderManager("ignored")
|
||||
);
|
||||
|
||||
String content = extractor.extract(buildFileValue(
|
||||
"table.xlsx",
|
||||
"/files/table.xlsx",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
));
|
||||
|
||||
Assert.assertEquals("# parsed", content);
|
||||
Assert.assertNotNull(bridgeService.lastSource);
|
||||
Assert.assertEquals("table.xlsx", bridgeService.lastSource.getFileName());
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证非桥接类型文件会继续走默认读取器。
|
||||
*/
|
||||
@Test
|
||||
public void shouldUseDefaultReaderForUnsupportedType() {
|
||||
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
|
||||
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
|
||||
bridgeService,
|
||||
@@ -59,7 +124,7 @@ public class DocNodeFileContentExtractorTest {
|
||||
new FakeReaderManager("plain text")
|
||||
);
|
||||
|
||||
String content = extractor.extract(buildFileValue("demo.docx", "/files/demo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
|
||||
String content = extractor.extract(buildFileValue("note.txt", "/files/note.txt", "text/plain"));
|
||||
|
||||
Assert.assertEquals("plain text", content);
|
||||
Assert.assertNull(bridgeService.lastSource);
|
||||
@@ -85,10 +150,10 @@ public class DocNodeFileContentExtractorTest {
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证解析结果为空时不会回退旧 PDF 读取链路。
|
||||
* 验证桥接解析结果为空时不会回退旧读取链路。
|
||||
*/
|
||||
@Test
|
||||
public void shouldFailWhenPdfParseResultIsEmpty() {
|
||||
public void shouldFailWhenBridgeParseResultIsEmpty() {
|
||||
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
|
||||
bridgeService.response.setPreferredText(null);
|
||||
bridgeService.response.setMarkdown(null);
|
||||
@@ -103,15 +168,15 @@ public class DocNodeFileContentExtractorTest {
|
||||
extractor.extract(buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf"));
|
||||
Assert.fail("expected BusinessException");
|
||||
} catch (BusinessException e) {
|
||||
Assert.assertEquals("PDF 文档解析结果为空", e.getMessage());
|
||||
Assert.assertEquals("文档解析结果为空", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证远端素材 URL 的非 PDF 文件不会误走本地存储读取。
|
||||
* 验证远端素材 URL 的非桥接文件不会误走本地存储读取。
|
||||
*/
|
||||
@Test
|
||||
public void shouldReadRemoteUrlForNonPdf() {
|
||||
public void shouldReadRemoteUrlForUnsupportedType() {
|
||||
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
|
||||
HttpServer server;
|
||||
try {
|
||||
@@ -120,7 +185,7 @@ public class DocNodeFileContentExtractorTest {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
byte[] body = "remote text".getBytes(StandardCharsets.UTF_8);
|
||||
server.createContext("/demo.docx", exchange -> {
|
||||
server.createContext("/note.txt", exchange -> {
|
||||
exchange.sendResponseHeaders(200, body.length);
|
||||
exchange.getResponseBody().write(body);
|
||||
exchange.close();
|
||||
@@ -134,9 +199,9 @@ public class DocNodeFileContentExtractorTest {
|
||||
);
|
||||
|
||||
String content = extractor.extract(buildFileValue(
|
||||
"demo.docx",
|
||||
"http://127.0.0.1:" + server.getAddress().getPort() + "/demo.docx",
|
||||
""
|
||||
"note.txt",
|
||||
"http://127.0.0.1:" + server.getAddress().getPort() + "/note.txt",
|
||||
"text/plain"
|
||||
));
|
||||
|
||||
Assert.assertEquals("remote text", content);
|
||||
|
||||
Reference in New Issue
Block a user