feat: 完成工作流多文件文档解析闭环
- 支持文档解析节点批量解析并收口为 documents 轻量输出
- 收口引用树、节点输出展示与旧工作流固定输出兼容
- 修复共享按钮点击事件，恢复多个节点加号交互
This commit is contained in:
@@ -6,6 +6,7 @@ import com.easyagents.flow.core.chain.Parameter;
|
||||
import com.easyagents.flow.core.node.BaseNode;
|
||||
import tech.easyflow.common.util.SpringContextUtil;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@@ -30,17 +31,42 @@ public class DocNode extends BaseNode {
|
||||
@Override
|
||||
public Map<String, Object> execute(Chain chain) {
|
||||
Map<String, Object> map = chain.getState().resolveParameters(this);
|
||||
Map<String, Object> res = new HashMap<>();
|
||||
DocNodeFileContentExtractor extractor = SpringContextUtil.getBean(DocNodeFileContentExtractor.class);
|
||||
String docContent = extractor.extract(map.get("file"));
|
||||
List<DocNodeFileContentExtractor.DocExtractedDocument> documents = extractor.extractDocuments(map.get("file"));
|
||||
|
||||
String key = "content";
|
||||
List<Parameter> outputDefs = getOutputDefs();
|
||||
if (outputDefs != null && !outputDefs.isEmpty()) {
|
||||
String defName = outputDefs.get(0).getName();
|
||||
if (StringUtil.hasText(defName)) key = defName;
|
||||
List<Map<String, Object>> documentMaps = new ArrayList<>();
|
||||
for (DocNodeFileContentExtractor.DocExtractedDocument document : documents) {
|
||||
documentMaps.add(document.toMap());
|
||||
}
|
||||
res.put(key, docContent);
|
||||
|
||||
Map<String, String> outputKeyMapping = resolveOutputKeyMapping();
|
||||
Map<String, Object> res = new HashMap<>();
|
||||
res.put(outputKeyMapping.get("documents"), documentMaps);
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* 根据节点输出定义解析运行态输出键名。
|
||||
*
|
||||
* @return 逻辑字段到实际输出键名的映射
|
||||
*/
|
||||
private Map<String, String> resolveOutputKeyMapping() {
|
||||
Map<String, String> mapping = new HashMap<>();
|
||||
mapping.put("documents", "documents");
|
||||
|
||||
List<Parameter> outputDefs = getOutputDefs();
|
||||
if (outputDefs == null || outputDefs.isEmpty()) {
|
||||
return mapping;
|
||||
}
|
||||
|
||||
String[] logicalKeys = {"documents"};
|
||||
for (int i = 0; i < outputDefs.size() && i < logicalKeys.length; i++) {
|
||||
Parameter outputDef = outputDefs.get(i);
|
||||
String name = outputDef == null ? null : outputDef.getName();
|
||||
if (StringUtil.hasText(name)) {
|
||||
mapping.put(logicalKeys[i], name);
|
||||
}
|
||||
}
|
||||
return mapping;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,7 +14,13 @@ import tech.easyflow.common.web.exceptions.BusinessException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* {@link DocNode} 文件内容提取器。
|
||||
@@ -27,6 +33,9 @@ import java.util.Map;
|
||||
*/
|
||||
@Component
|
||||
public class DocNodeFileContentExtractor {
|
||||
private static final int FILE_MAX_COUNT = 10;
|
||||
private static final long FILE_MAX_SINGLE_SIZE = 5L * 1024 * 1024;
|
||||
private static final long FILE_MAX_TOTAL_SIZE = 50L * 1024 * 1024;
|
||||
|
||||
private final DocumentParseBridgeService documentParseBridgeService;
|
||||
private final FileStorageService fileStorageService;
|
||||
@@ -62,6 +71,33 @@ public class DocNodeFileContentExtractor {
|
||||
return extractDefaultContent(sourceRef);
|
||||
}
|
||||
|
||||
/**
|
||||
* 批量提取文件文本内容。
|
||||
*
|
||||
* @param fileValue 工作流运行态中的单文件或多文件值
|
||||
* @return 逐文件解析结果
|
||||
*/
|
||||
public List<DocExtractedDocument> extractDocuments(Object fileValue) {
|
||||
List<DocumentSourceRef> sourceRefs = toDocumentSourceRefs(fileValue);
|
||||
List<DocExtractedDocument> results = new ArrayList<>();
|
||||
for (int index = 0; index < sourceRefs.size(); index++) {
|
||||
DocumentSourceRef sourceRef = sourceRefs.get(index);
|
||||
try {
|
||||
String content = shouldUseDocumentBridge(sourceRef)
|
||||
? extractBridgeContent(sourceRef)
|
||||
: extractDefaultContent(sourceRef);
|
||||
results.add(new DocExtractedDocument(sourceRef.getFileName(), content));
|
||||
} catch (Exception e) {
|
||||
String fileName = StringUtil.hasText(sourceRef.getFileName()) ? sourceRef.getFileName() : ("#" + (index + 1));
|
||||
if (e instanceof BusinessException businessException) {
|
||||
throw new BusinessException("文件解析失败(" + fileName + "): " + businessException.getMessage());
|
||||
}
|
||||
throw new RuntimeException("文件解析失败(" + fileName + ")", e);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将运行时文件值转换为统一文档源。
|
||||
*
|
||||
@@ -84,6 +120,50 @@ public class DocNodeFileContentExtractor {
|
||||
return sourceRef;
|
||||
}
|
||||
|
||||
/**
|
||||
* 将单文件或多文件运行值归一化为统一文档源列表。
|
||||
*
|
||||
* @param fileValue 运行态文件值
|
||||
* @return 文档源列表
|
||||
*/
|
||||
List<DocumentSourceRef> toDocumentSourceRefs(Object fileValue) {
|
||||
List<Object> candidates = new ArrayList<>();
|
||||
collectFileValues(fileValue, candidates);
|
||||
if (candidates.isEmpty()) {
|
||||
throw new BusinessException("文件输入不能为空");
|
||||
}
|
||||
|
||||
List<DocumentSourceRef> sourceRefs = new ArrayList<>();
|
||||
Set<String> seenFilePaths = new LinkedHashSet<>();
|
||||
long totalSize = 0L;
|
||||
for (Object candidate : candidates) {
|
||||
DocumentSourceRef sourceRef = toDocumentSourceRef(candidate);
|
||||
validateSourceRef(sourceRef);
|
||||
String filePath = sourceRef.getFilePath().trim();
|
||||
if (!seenFilePaths.add(filePath)) {
|
||||
continue;
|
||||
}
|
||||
Long size = sourceRef.getSize();
|
||||
if (size != null && size > FILE_MAX_SINGLE_SIZE) {
|
||||
throw new BusinessException("单个文件不能超过 5MB: " + sourceRef.getFileName());
|
||||
}
|
||||
if (size != null && size > 0) {
|
||||
totalSize += size;
|
||||
}
|
||||
sourceRefs.add(sourceRef);
|
||||
}
|
||||
if (sourceRefs.size() > FILE_MAX_COUNT) {
|
||||
throw new BusinessException("最多上传 10 个文件");
|
||||
}
|
||||
if (totalSize > FILE_MAX_TOTAL_SIZE) {
|
||||
throw new BusinessException("文件总大小不能超过 50MB");
|
||||
}
|
||||
if (sourceRefs.isEmpty()) {
|
||||
throw new BusinessException("文件输入不能为空");
|
||||
}
|
||||
return sourceRefs;
|
||||
}
|
||||
|
||||
private void validateSourceRef(DocumentSourceRef sourceRef) {
|
||||
if (sourceRef == null) {
|
||||
throw new BusinessException("文件输入不能为空");
|
||||
@@ -96,6 +176,19 @@ public class DocNodeFileContentExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
private void collectFileValues(Object value, List<Object> result) {
|
||||
if (value == null) {
|
||||
return;
|
||||
}
|
||||
if (value instanceof Collection<?> collection) {
|
||||
for (Object item : collection) {
|
||||
collectFileValues(item, result);
|
||||
}
|
||||
return;
|
||||
}
|
||||
result.add(value);
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断当前文件类型是否应优先走统一文档解析桥接。
|
||||
*
|
||||
@@ -172,4 +265,49 @@ public class DocNodeFileContentExtractor {
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* 逐文件解析结果。
|
||||
*/
|
||||
public static final class DocExtractedDocument {
|
||||
private final String fileName;
|
||||
private final String content;
|
||||
|
||||
/**
|
||||
* 创建逐文件解析结果。
|
||||
*
|
||||
* @param fileName 文件名
|
||||
* @param content 解析文本
|
||||
*/
|
||||
public DocExtractedDocument(String fileName, String content) {
|
||||
this.fileName = fileName;
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return 文件名
|
||||
*/
|
||||
public String getFileName() {
|
||||
return fileName;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return 文本内容
|
||||
*/
|
||||
public String getContent() {
|
||||
return content;
|
||||
}
|
||||
|
||||
/**
|
||||
* 转为轻量 Map,供工作流结果与引用树消费。
|
||||
*
|
||||
* @return 轻量结果对象
|
||||
*/
|
||||
public Map<String, Object> toMap() {
|
||||
Map<String, Object> result = new LinkedHashMap<>();
|
||||
result.put("fileName", fileName);
|
||||
result.put("content", content);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,9 @@ import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import com.sun.net.httpserver.HttpServer;
|
||||
|
||||
@@ -211,6 +213,56 @@ public class DocNodeFileContentExtractorTest {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证多文件输入会按顺序返回逐文件结果。
|
||||
*/
|
||||
@Test
|
||||
public void shouldExtractDocumentsForMultipleFiles() {
|
||||
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
|
||||
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
|
||||
bridgeService,
|
||||
new FakeFileStorageService(),
|
||||
new FakeReaderManager("plain text")
|
||||
);
|
||||
|
||||
List<DocNodeFileContentExtractor.DocExtractedDocument> documents = extractor.extractDocuments(Arrays.asList(
|
||||
buildFileValue("demo.pdf", "/files/demo.pdf", "application/pdf"),
|
||||
buildFileValue("note.txt", "/files/note.txt", "text/plain")
|
||||
));
|
||||
|
||||
Assert.assertEquals(2, documents.size());
|
||||
Assert.assertEquals("demo.pdf", documents.get(0).getFileName());
|
||||
Assert.assertEquals("# parsed", documents.get(0).getContent());
|
||||
Assert.assertEquals("note.txt", documents.get(1).getFileName());
|
||||
Assert.assertEquals("plain text", documents.get(1).getContent());
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证多文件中任一文件失败时会暴露文件名。
|
||||
*/
|
||||
@Test
|
||||
public void shouldExposeFileNameWhenMultipleDocumentsFail() {
|
||||
RecordingDocumentParseBridgeService bridgeService = new RecordingDocumentParseBridgeService();
|
||||
bridgeService.response.setPreferredText(null);
|
||||
bridgeService.response.setMarkdown(null);
|
||||
bridgeService.response.setPlainText(null);
|
||||
DocNodeFileContentExtractor extractor = new DocNodeFileContentExtractor(
|
||||
bridgeService,
|
||||
new FakeFileStorageService(),
|
||||
new FakeReaderManager("plain text")
|
||||
);
|
||||
|
||||
try {
|
||||
extractor.extractDocuments(Arrays.asList(
|
||||
buildFileValue("broken.pdf", "/files/broken.pdf", "application/pdf"),
|
||||
buildFileValue("note.txt", "/files/note.txt", "text/plain")
|
||||
));
|
||||
Assert.fail("expected BusinessException");
|
||||
} catch (BusinessException e) {
|
||||
Assert.assertEquals("文件解析失败(broken.pdf): 文档解析结果为空", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, Object> buildFileValue(String fileName, String filePath, String contentType) {
|
||||
Map<String, Object> value = new HashMap<String, Object>();
|
||||
value.put("fileName", fileName);
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
package tech.easyflow.ai.node;
|
||||
|
||||
import com.easyagents.flow.core.chain.Parameter;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* {@link DocNode} 单元测试。
|
||||
*/
|
||||
public class DocNodeTest {
|
||||
|
||||
/**
|
||||
* 历史工作流若改过输出名,仍应按固定输出槽位顺序映射运行态结果键。
|
||||
*
|
||||
* @throws Exception 反射调用失败
|
||||
*/
|
||||
@Test
|
||||
public void shouldResolveOutputKeyMappingByOutputOrder() throws Exception {
|
||||
DocNode node = new DocNode();
|
||||
node.setOutputDefs(Arrays.asList(
|
||||
parameter("documentItems")
|
||||
));
|
||||
|
||||
Method method = DocNode.class.getDeclaredMethod("resolveOutputKeyMapping");
|
||||
method.setAccessible(true);
|
||||
Map<String, String> mapping = (Map<String, String>) method.invoke(node);
|
||||
|
||||
Assert.assertEquals("documentItems", mapping.get("documents"));
|
||||
}
|
||||
|
||||
private static Parameter parameter(String name) {
|
||||
Parameter parameter = new Parameter();
|
||||
parameter.setName(name);
|
||||
return parameter;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user