diff --git a/easyflow-api/easyflow-api-admin/src/main/java/tech/easyflow/admin/controller/ai/DocumentCollectionController.java b/easyflow-api/easyflow-api-admin/src/main/java/tech/easyflow/admin/controller/ai/DocumentCollectionController.java index 6a7fefa..59dcf15 100644 --- a/easyflow-api/easyflow-api-admin/src/main/java/tech/easyflow/admin/controller/ai/DocumentCollectionController.java +++ b/easyflow-api/easyflow-api-admin/src/main/java/tech/easyflow/admin/controller/ai/DocumentCollectionController.java @@ -407,6 +407,8 @@ public class DocumentCollectionController extends BaseCurdControllercom.easyagents easy-agents-mcp - junit junit diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/exception/DocumentParseBridgeException.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/exception/DocumentParseBridgeException.java index 2108d6f..fa32121 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/exception/DocumentParseBridgeException.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/exception/DocumentParseBridgeException.java @@ -35,10 +35,14 @@ public class DocumentParseBridgeException extends RuntimeException { public static DocumentParseBridgeException serviceNotEnabled() { return new DocumentParseBridgeException( "service_not_enabled", - "统一文档解析服务未启用,请先配置 easy-agents.document.pdf.provider" + "统一文档解析服务未启用,请先配置 easy-agents.document.ocr.provider=mineru" ); } + public static DocumentParseBridgeException serviceNotEnabled(String message) { + return new DocumentParseBridgeException("service_not_enabled", message); + } + public static DocumentParseBridgeException unsupportedSource(String message) { return new DocumentParseBridgeException("unsupported_source", message); } diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskStatus.java 
b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskStatus.java index 580dc95..92743e9 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskStatus.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/model/DocumentParseTaskStatus.java @@ -22,6 +22,11 @@ public class DocumentParseTaskStatus { private String statusUrl; private String resultUrl; private Integer queuedAhead; + private Integer progressPercent; + private String currentStage; + private Integer processedItems; + private Integer totalItems; + private String statusMessage; public String getTaskId() { return taskId; @@ -110,4 +115,44 @@ public class DocumentParseTaskStatus { public void setQueuedAhead(Integer queuedAhead) { this.queuedAhead = queuedAhead; } + + public Integer getProgressPercent() { + return progressPercent; + } + + public void setProgressPercent(Integer progressPercent) { + this.progressPercent = progressPercent; + } + + public String getCurrentStage() { + return currentStage; + } + + public void setCurrentStage(String currentStage) { + this.currentStage = currentStage; + } + + public Integer getProcessedItems() { + return processedItems; + } + + public void setProcessedItems(Integer processedItems) { + this.processedItems = processedItems; + } + + public Integer getTotalItems() { + return totalItems; + } + + public void setTotalItems(Integer totalItems) { + this.totalItems = totalItems; + } + + public String getStatusMessage() { + return statusMessage; + } + + public void setStatusMessage(String statusMessage) { + this.statusMessage = statusMessage; + } } diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java index c76b91c..1f63a32 100644 --- 
a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImpl.java @@ -5,6 +5,10 @@ import com.easyagents.document.core.entity.ParseResponse; import com.easyagents.document.core.entity.ParseResult; import com.easyagents.document.core.entity.ParseTaskInfo; import com.easyagents.document.core.entity.ParseTaskStatus; +import com.easyagents.document.pdf.PdfDocumentParseService; +import com.easyagents.document.pptx.PptxDocumentParseService; +import com.easyagents.document.xlsx.XlsxDocumentParseService; +import org.springframework.beans.factory.annotation.Qualifier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.lang.Nullable; @@ -20,8 +24,13 @@ import tech.easyflow.ai.document.service.DocumentParseBridgeService; import tech.easyflow.ai.document.support.DocumentSourceLoader; import tech.easyflow.ai.document.support.DocumentParseRequestFactory; import tech.easyflow.ai.document.support.DocumentParseResultMapper; +import tech.easyflow.ai.document.support.DocumentParseSourceType; import tech.easyflow.ai.document.support.LoadedDocumentSource; -import tech.easyflow.ai.utils.DocUtil; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.function.Function; /** * 统一文档解析桥接门面默认实现。 @@ -33,18 +42,33 @@ import tech.easyflow.ai.utils.DocUtil; public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeService { private static final Logger LOG = LoggerFactory.getLogger(DocumentParseBridgeServiceImpl.class); + private static final String DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME = "documentParseService"; @Nullable - private final DocumentParseService documentParseService; + private final DocumentParseService defaultDocumentParseService; + @Nullable + private final PdfDocumentParseService 
pdfDocumentParseService; + @Nullable + private final PptxDocumentParseService pptxDocumentParseService; + @Nullable + private final XlsxDocumentParseService xlsxDocumentParseService; private final DocumentSourceLoader documentSourceLoader; private final DocumentParseRequestFactory parseRequestFactory; private final DocumentParseResultMapper parseResultMapper; - public DocumentParseBridgeServiceImpl(@Nullable DocumentParseService documentParseService, + public DocumentParseBridgeServiceImpl(@Nullable + @Qualifier(DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME) + DocumentParseService defaultDocumentParseService, + @Nullable PdfDocumentParseService pdfDocumentParseService, + @Nullable PptxDocumentParseService pptxDocumentParseService, + @Nullable XlsxDocumentParseService xlsxDocumentParseService, DocumentSourceLoader documentSourceLoader, DocumentParseRequestFactory parseRequestFactory, DocumentParseResultMapper parseResultMapper) { - this.documentParseService = documentParseService; + this.defaultDocumentParseService = defaultDocumentParseService; + this.pdfDocumentParseService = pdfDocumentParseService; + this.pptxDocumentParseService = pptxDocumentParseService; + this.xlsxDocumentParseService = xlsxDocumentParseService; this.documentSourceLoader = documentSourceLoader; this.parseRequestFactory = parseRequestFactory; this.parseResultMapper = parseResultMapper; @@ -59,7 +83,8 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic LoadedDocumentSource loadedSource = prepareSupportedSource(source); LOG.info("桥接服务开始同步解析文档: fileName={}, contentType={}, scenario={}", loadedSource.getFileName(), loadedSource.getContentType(), scenario); - ParseResponse response = requireService().parse(parseRequestFactory.build(loadedSource, scenario)); + DocumentParseService parseService = resolveService(loadedSource); + ParseResponse response = parseService.parse(parseRequestFactory.build(loadedSource, scenario)); DocumentParsedResult result = 
parseResultMapper.map(extractSingleResult(response, false)); LOG.info("桥接服务同步解析完成: fileName={}, scenario={}, preferredTextLength={}", loadedSource.getFileName(), scenario, resolveTextLength(result)); @@ -84,7 +109,8 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic LoadedDocumentSource loadedSource = prepareSupportedSource(source); LOG.info("桥接服务开始提交异步解析任务: fileName={}, contentType={}, scenario={}", loadedSource.getFileName(), loadedSource.getContentType(), scenario); - ParseTaskStatus taskStatus = requireService().submit(parseRequestFactory.build(loadedSource, scenario)); + DocumentParseService parseService = resolveService(loadedSource); + ParseTaskStatus taskStatus = parseService.submit(parseRequestFactory.build(loadedSource, scenario)); DocumentParseTaskStatus mappedStatus = parseResultMapper.map(taskStatus); LOG.info("桥接服务异步解析任务提交完成: fileName={}, scenario={}, providerTaskId={}, status={}", loadedSource.getFileName(), scenario, mappedStatus.getTaskId(), mappedStatus.getStatus()); @@ -109,7 +135,8 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic throw DocumentParseBridgeException.taskFailed("taskId 不能为空"); } try { - return parseResultMapper.map(requireService().queryTask(taskId)); + ParseTaskStatus taskStatus = executeAgainstTaskService(taskId, service -> service.queryTask(taskId)); + return parseResultMapper.map(taskStatus); } catch (DocumentParseBridgeException e) { throw e; } catch (Exception e) { @@ -127,7 +154,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic } try { LOG.info("桥接服务开始获取异步解析结果: providerTaskId={}", taskId); - ParseResponse response = requireService().queryResult(taskId); + ParseResponse response = executeAgainstTaskService(taskId, service -> service.queryResult(taskId)); DocumentParsedResult result = parseResultMapper.map(extractSingleResult(response, true)); LOG.info("桥接服务获取异步解析结果完成: providerTaskId={}, preferredTextLength={}", taskId, 
resolveTextLength(result)); @@ -150,7 +177,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic throw DocumentParseBridgeException.taskFailed("taskId 不能为空"); } try { - ParseTaskInfo taskInfo = requireService().queryTaskInfo(taskId); + ParseTaskInfo taskInfo = executeAgainstTaskService(taskId, service -> service.queryTaskInfo(taskId)); DocumentParseTaskInfo mappedTaskInfo = parseResultMapper.map(taskInfo); LOG.info("桥接服务查询异步解析任务状态: providerTaskId={}, status={}, hasResult={}", taskId, @@ -177,39 +204,84 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic return text == null ? 0 : text.length(); } - private DocumentParseService requireService() { - if (documentParseService == null) { - throw DocumentParseBridgeException.serviceNotEnabled(); - } - return documentParseService; - } - private LoadedDocumentSource prepareSupportedSource(DocumentSourceRef source) { LoadedDocumentSource loadedSource = documentSourceLoader.load(source); if (!isSupportedByBridge(loadedSource)) { - throw DocumentParseBridgeException.unsupportedSource("统一文档解析桥接当前仅支持 PDF、DOCX 文件"); + throw DocumentParseBridgeException.unsupportedSource("统一文档解析桥接当前仅支持 PDF、DOCX、PPTX、XLSX 文件"); } return loadedSource; } private boolean isSupportedByBridge(LoadedDocumentSource loadedSource) { - String contentType = loadedSource.getContentType(); - if (StringUtils.hasText(contentType)) { - String normalizedContentType = contentType.toLowerCase(); - if (normalizedContentType.contains("pdf") - || normalizedContentType.contains("wordprocessingml.document")) { - return true; + return DocumentParseSourceType.resolve(loadedSource.getFileName(), loadedSource.getContentType()) != DocumentParseSourceType.UNSUPPORTED; + } + + private DocumentParseService resolveService(LoadedDocumentSource loadedSource) { + DocumentParseSourceType sourceType = DocumentParseSourceType.resolve(loadedSource.getFileName(), loadedSource.getContentType()); + switch (sourceType) { + case 
PDF: + return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF"); + case DOCX: + return requireSpecificService(defaultDocumentParseService, pdfDocumentParseService, "DOCX"); + case PPTX: + return requireSpecificService(pptxDocumentParseService, null, "PPTX"); + case XLSX: + return requireSpecificService(xlsxDocumentParseService, null, "XLSX"); + default: + throw DocumentParseBridgeException.unsupportedSource("当前文件类型暂不支持桥接解析"); + } + } + + private DocumentParseService requireSpecificService(@Nullable DocumentParseService primaryService, + @Nullable DocumentParseService fallbackService, + String sourceType) { + if (primaryService != null) { + return primaryService; + } + if (fallbackService != null) { + return fallbackService; + } + throw DocumentParseBridgeException.serviceNotEnabled("未启用 " + sourceType + " 文档解析服务"); + } + + private T executeAgainstTaskService(String taskId, Function action) { + List services = availableServices(); + if (services.isEmpty()) { + throw DocumentParseBridgeException.serviceNotEnabled(); + } + Exception lastException = null; + for (DocumentParseService service : services) { + try { + return action.apply(service); + } catch (Exception exception) { + lastException = exception; + LOG.debug("桥接服务任务查询尝试失败,准备切换下一个解析服务: taskId={}, service={}", + taskId, + service.getClass().getSimpleName(), + exception); } } - String fileName = loadedSource.getFileName(); - if (!StringUtils.hasText(fileName) || !fileName.contains(".")) { - return false; + if (lastException instanceof RuntimeException) { + throw (RuntimeException) lastException; } - String suffix = DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName)); - if ("pdf".equals(suffix) || "docx".equals(suffix)) { - return true; + throw DocumentParseBridgeException.taskFailed("未找到可处理当前任务ID的文档解析服务", lastException); + } + + private List availableServices() { + LinkedHashSet services = new LinkedHashSet(); + if (pptxDocumentParseService != null) { + 
services.add(pptxDocumentParseService); } - return false; + if (xlsxDocumentParseService != null) { + services.add(xlsxDocumentParseService); + } + if (pdfDocumentParseService != null) { + services.add(pdfDocumentParseService); + } + if (defaultDocumentParseService != null) { + services.add(defaultDocumentParseService); + } + return new ArrayList(services); } private ParseResult extractSingleResult(ParseResponse response, boolean resultFetchPhase) { diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseRequestFactory.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseRequestFactory.java index 1d6baac..4d6f395 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseRequestFactory.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseRequestFactory.java @@ -2,6 +2,9 @@ package tech.easyflow.ai.document.support; import com.easyagents.document.core.entity.ParseFile; import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.PdfParseRequest; +import com.easyagents.document.core.entity.PptxParseRequest; +import com.easyagents.document.core.entity.XlsxParseRequest; import org.springframework.stereotype.Component; import tech.easyflow.ai.document.exception.DocumentParseBridgeException; import tech.easyflow.ai.document.model.DocumentParseScenario; @@ -31,12 +34,28 @@ public class DocumentParseRequestFactory { if (scenario == null) { throw DocumentParseBridgeException.requestBuildFailed("解析场景不能为空"); } - ParseRequest request = new ParseRequest(); + ParseRequest request = createTypedRequest(source); request.addFile(ParseFile.of(source.getFileName(), source.getContentBytes(), source.getContentType())); applyScenario(request, scenario); return request; } + private ParseRequest createTypedRequest(LoadedDocumentSource source) 
{ + DocumentParseSourceType sourceType = DocumentParseSourceType.resolve(source.getFileName(), source.getContentType()); + switch (sourceType) { + case PDF: + return new PdfParseRequest(); + case PPTX: + return new PptxParseRequest(); + case XLSX: + return new XlsxParseRequest(); + case DOCX: + return new ParseRequest(); + default: + throw DocumentParseBridgeException.requestBuildFailed("当前文件类型暂不支持桥接解析"); + } + } + private void applyScenario(ParseRequest request, DocumentParseScenario scenario) { switch (scenario) { case WORKFLOW_TEXT: diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseResultMapper.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseResultMapper.java index 4097c92..8dc5196 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseResultMapper.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseResultMapper.java @@ -69,6 +69,11 @@ public class DocumentParseResultMapper { status.setStatusUrl(taskStatus.getStatusUrl()); status.setResultUrl(taskStatus.getResultUrl()); status.setQueuedAhead(taskStatus.getQueuedAhead()); + status.setProgressPercent(taskStatus.getProgressPercent()); + status.setCurrentStage(taskStatus.getCurrentStage()); + status.setProcessedItems(taskStatus.getProcessedItems()); + status.setTotalItems(taskStatus.getTotalItems()); + status.setStatusMessage(taskStatus.getStatusMessage()); return status; } @@ -104,6 +109,11 @@ public class DocumentParseResultMapper { status.setStatusUrl(taskStatus.getStatusUrl()); status.setResultUrl(taskStatus.getResultUrl()); status.setQueuedAhead(taskStatus.getQueuedAhead()); + status.setProgressPercent(taskStatus.getProgressPercent()); + status.setCurrentStage(taskStatus.getCurrentStage()); + status.setProcessedItems(taskStatus.getProcessedItems()); + 
status.setTotalItems(taskStatus.getTotalItems()); + status.setStatusMessage(taskStatus.getStatusMessage()); } private String resolvePreferredText(ParseResult parseResult) { diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseSourceType.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseSourceType.java new file mode 100644 index 0000000..238747b --- /dev/null +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/document/support/DocumentParseSourceType.java @@ -0,0 +1,70 @@ +package tech.easyflow.ai.document.support; + +import org.springframework.util.StringUtils; +import tech.easyflow.ai.utils.DocUtil; + +/** + * 统一文档解析桥接支持的源文件类型。 + * + * @author Codex + * @since 2026-04-17 + */ +public enum DocumentParseSourceType { + + PDF, + DOCX, + PPTX, + XLSX, + UNSUPPORTED; + + /** + * 根据文件名与内容类型推断文档类型。 + * + * @param fileName 文件名 + * @param contentType MIME 类型 + * @return 文档类型 + */ + public static DocumentParseSourceType resolve(String fileName, String contentType) { + if (StringUtils.hasText(contentType)) { + String normalizedContentType = contentType.toLowerCase(); + if (normalizedContentType.contains("pdf")) { + return PDF; + } + if (normalizedContentType.contains("wordprocessingml.document")) { + return DOCX; + } + if (normalizedContentType.contains("presentationml.presentation")) { + return PPTX; + } + if (normalizedContentType.contains("spreadsheetml.sheet")) { + return XLSX; + } + } + if (!StringUtils.hasText(fileName) || !fileName.contains(".")) { + return UNSUPPORTED; + } + String suffix = DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName)); + if ("pdf".equals(suffix)) { + return PDF; + } + if ("docx".equals(suffix)) { + return DOCX; + } + if ("pptx".equals(suffix)) { + return PPTX; + } + if ("xlsx".equals(suffix)) { + return XLSX; + } + return UNSUPPORTED; + } + + /** + * 判断是否属于 Office 首版接入类型。 + * + * @return 是否是本次 Office 类型 + */ + 
public boolean isOffice() { + return this == PPTX || this == XLSX; + } +} diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/DocumentImportDtos.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/DocumentImportDtos.java index d80ffa3..13c505c 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/DocumentImportDtos.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/DocumentImportDtos.java @@ -286,6 +286,7 @@ public final class DocumentImportDtos { private String chunkId; private String chunkType; private String content; + private String renderMarkdown; private List headingPath = new ArrayList(); private Integer partNo; private Integer partTotal; @@ -335,6 +336,14 @@ public final class DocumentImportDtos { this.content = content; } + public String getRenderMarkdown() { + return renderMarkdown; + } + + public void setRenderMarkdown(String renderMarkdown) { + this.renderMarkdown = renderMarkdown; + } + public List getHeadingPath() { return headingPath; } diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/DocumentImportKeys.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/DocumentImportKeys.java index 3025b49..1393773 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/DocumentImportKeys.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/DocumentImportKeys.java @@ -22,4 +22,19 @@ public final class DocumentImportKeys { public static final String KEY_DOCUMENT_PARSE_METADATA = "parse.metadata"; public static final String KEY_DOCUMENT_PARSE_WARNINGS = "parse.warnings"; public static final String KEY_DOCUMENT_PROVIDER_TASK_ID = "parse.providerTaskId"; + public static final String KEY_DOCUMENT_PARSE_IMAGE_URLS = "parse.imageUrls"; + public static 
final String KEY_DOCUMENT_PARSE_IMAGE_COUNT = "parse.imageCount"; + public static final String KEY_DOCUMENT_PARSE_IMAGE_STORAGE_PREFIX = "parse.imageStoragePrefix"; + public static final String KEY_DOCUMENT_PARSE_PROGRESS_PERCENT = "parse.progressPercent"; + public static final String KEY_DOCUMENT_PARSE_CURRENT_STAGE = "parse.currentStage"; + public static final String KEY_DOCUMENT_PARSE_PROCESSED_ITEMS = "parse.processedItems"; + public static final String KEY_DOCUMENT_PARSE_TOTAL_ITEMS = "parse.totalItems"; + public static final String KEY_DOCUMENT_PARSE_STATUS_MESSAGE = "parse.statusMessage"; + public static final String KEY_DOCUMENT_RENDER_MARKDOWN = "renderMarkdown"; + public static final String KEY_DOCUMENT_PAGE_INDEX = "pageIndex"; + public static final String KEY_DOCUMENT_SHEET_NAME = "sheetName"; + public static final String KEY_DOCUMENT_ROW_START = "rowStart"; + public static final String KEY_DOCUMENT_ROW_END = "rowEnd"; + public static final String KEY_DOCUMENT_IMAGE_REFS = "imageRefs"; + public static final String KEY_DOCUMENT_PARSE_ARTIFACT_SUMMARY = "parseArtifactSummary"; } diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/task/DocumentImportTaskStatusStreamService.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/task/DocumentImportTaskStatusStreamService.java index e79d849..dec2542 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/task/DocumentImportTaskStatusStreamService.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/task/DocumentImportTaskStatusStreamService.java @@ -6,6 +6,7 @@ import org.springframework.stereotype.Service; import org.springframework.transaction.support.TransactionSynchronization; import org.springframework.transaction.support.TransactionSynchronizationManager; import org.springframework.web.servlet.mvc.method.annotation.SseEmitter; +import 
tech.easyflow.ai.documentimport.DocumentImportKeys; import tech.easyflow.ai.entity.Document; import tech.easyflow.ai.mapper.DocumentMapper; import tech.easyflow.common.web.exceptions.BusinessException; @@ -116,11 +117,21 @@ public class DocumentImportTaskStatusStreamService { payload.put("totalChunks", document.getTotalChunks()); payload.put("completedChunks", document.getCompletedChunks()); payload.put("failedChunks", document.getFailedChunks()); + payload.put("parseCurrentStage", readOptionAsString(document, DocumentImportKeys.KEY_DOCUMENT_PARSE_CURRENT_STAGE)); + payload.put("parseStatusMessage", readOptionAsString(document, DocumentImportKeys.KEY_DOCUMENT_PARSE_STATUS_MESSAGE)); payload.put("lastTaskError", document.getLastTaskError()); payload.put("taskModifiedAt", document.getTaskModifiedAt()); return payload; } + private String readOptionAsString(Document document, String key) { + if (document == null || document.getOptions() == null || key == null) { + return null; + } + Object value = document.getOptions().get(key); + return value == null ? 
null : String.valueOf(value); + } + private void sendAsync(String topicKey, SseEmitter emitter, String eventName, Map payload) { sseThreadPool.execute(() -> { try { diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/task/KnowledgeDocumentImportTaskAppService.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/task/KnowledgeDocumentImportTaskAppService.java index f24b62d..478305a 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/task/KnowledgeDocumentImportTaskAppService.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/documentimport/task/KnowledgeDocumentImportTaskAppService.java @@ -1,6 +1,7 @@ package tech.easyflow.ai.documentimport.task; import cn.dev33.satoken.stp.StpUtil; +import com.alibaba.fastjson2.JSON; import com.easyagents.core.document.Document; import com.easyagents.core.model.embedding.EmbeddingModel; import com.easyagents.core.model.embedding.EmbeddingOptions; @@ -10,11 +11,15 @@ import com.easyagents.core.store.StoreResult; import com.easyagents.rag.core.RagChunk; import com.easyagents.rag.core.RagDefaults; import com.easyagents.rag.core.RagStrategyCodes; +import com.easyagents.rag.core.RagChunkTypes; import com.easyagents.rag.ingestion.RagIngestionService; import com.easyagents.rag.ingestion.model.AnalysisResult; import com.easyagents.rag.ingestion.model.StrategyConfig; import com.easyagents.search.engine.service.DocumentSearcher; import com.easyagents.search.engine.service.KeywordSearchMetadataKeys; +import com.easyagents.document.core.entity.DocumentBlock; +import com.easyagents.document.core.entity.DocumentImage; +import com.easyagents.document.core.entity.DocumentTable; import com.mybatisflex.core.keygen.impl.FlexIDKeyGenerator; import com.mybatisflex.core.query.QueryWrapper; import org.slf4j.Logger; @@ -27,6 +32,7 @@ import org.springframework.transaction.support.TransactionSynchronization; 
import org.springframework.transaction.support.TransactionSynchronizationManager; import org.springframework.transaction.annotation.Transactional; import tech.easyflow.ai.config.SearcherFactory; +import tech.easyflow.ai.document.model.DocumentParseArtifacts; import tech.easyflow.ai.document.model.DocumentParseScenario; import tech.easyflow.ai.document.model.DocumentParseTaskInfo; import tech.easyflow.ai.document.model.DocumentParsedResult; @@ -35,6 +41,7 @@ import tech.easyflow.ai.document.service.DocumentParseBridgeService; import tech.easyflow.ai.documentimport.DocumentImportDtos; import tech.easyflow.ai.documentimport.DocumentImportKeys; import tech.easyflow.ai.documentimport.DocumentImportPreviewService; +import tech.easyflow.ai.easyagents.CustomMultipartFile; import tech.easyflow.ai.entity.DocumentChunk; import tech.easyflow.ai.entity.DocumentCollection; import tech.easyflow.ai.entity.DocumentImportTask; @@ -59,16 +66,21 @@ import javax.annotation.Resource; import java.io.IOException; import java.io.InputStream; import java.math.BigInteger; +import java.net.URLConnection; +import java.util.Base64; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * 知识库文档任务化导入应用服务。 @@ -83,6 +95,11 @@ public class KnowledgeDocumentImportTaskAppService { private static final int PARSE_MONITOR_BATCH_SIZE = 20; private static final int INDEX_BATCH_SIZE = 20; private static final String SOURCE_RANGES_KEY = "sourceRanges"; + private static final String KNOWLEDGE_PARSE_IMAGE_CATEGORY = "knowledge-parse"; + private static final String OFFICE_PPTX_PAGE_STRATEGY = "OFFICE_PPTX_PAGE"; + private static final String OFFICE_XLSX_ROW_WINDOW_STRATEGY = "OFFICE_XLSX_ROW_WINDOW"; + 
private static final String SEARCH_RENDER_MARKDOWN_METADATA_KEY = "renderMarkdown"; + private static final Pattern MARKDOWN_IMAGE_PATTERN = Pattern.compile("!\\[(?:[^\\]]*)\\]\\(([^)]+)\\)"); @Resource private DocumentMapper documentMapper; @@ -540,8 +557,9 @@ public class KnowledgeDocumentImportTaskAppService { DocumentParsedResult parsedResult, String sourceFormat, String providerTaskId) { - String preferredText = resolvePreferredText(parsedResult); - if (!StringUtil.hasText(preferredText)) { + parsedResult = normalizeParsedImagesForKnowledgeImport(document, parsedResult); + ParsedKnowledgeContent parsedKnowledgeContent = buildParsedKnowledgeContent(document, parsedResult, sourceFormat); + if (!StringUtil.hasText(parsedKnowledgeContent.documentLlmContent)) { throw new BusinessException("文档解析结果为空"); } @@ -550,15 +568,32 @@ public class KnowledgeDocumentImportTaskAppService { if (StringUtil.hasText(providerTaskId)) { options.put(DocumentImportKeys.KEY_DOCUMENT_PROVIDER_TASK_ID, providerTaskId); } - if (parsedResult != null && parsedResult.getMetadata() != null && !parsedResult.getMetadata().isEmpty()) { + if (parsedResult.getMetadata() != null && !parsedResult.getMetadata().isEmpty()) { options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_METADATA, new LinkedHashMap(parsedResult.getMetadata())); } - if (parsedResult != null && parsedResult.getWarnings() != null && !parsedResult.getWarnings().isEmpty()) { + if (parsedResult.getWarnings() != null && !parsedResult.getWarnings().isEmpty()) { options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_WARNINGS, new ArrayList(parsedResult.getWarnings())); } + if (parsedResult.getMetadata() != null) { + Object imageUrls = parsedResult.getMetadata().get(DocumentImportKeys.KEY_DOCUMENT_PARSE_IMAGE_URLS); + if (imageUrls instanceof List) { + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_IMAGE_URLS, new ArrayList((List) imageUrls)); + } + Object imageCount = 
parsedResult.getMetadata().get(DocumentImportKeys.KEY_DOCUMENT_PARSE_IMAGE_COUNT); + if (imageCount != null) { + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_IMAGE_COUNT, imageCount); + } + Object imageStoragePrefix = parsedResult.getMetadata().get(DocumentImportKeys.KEY_DOCUMENT_PARSE_IMAGE_STORAGE_PREFIX); + if (StringUtil.hasText(asString(imageStoragePrefix))) { + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_IMAGE_STORAGE_PREFIX, imageStoragePrefix); + } + } + options.put(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN, parsedKnowledgeContent.documentRenderMarkdown); + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_ARTIFACT_SUMMARY, parsedKnowledgeContent.parseArtifactSummary); + clearDocumentParseProgress(options); Date now = new Date(); - document.setContent(preferredText); + document.setContent(parsedKnowledgeContent.documentLlmContent); document.setDocumentType(sourceFormat); document.setOptions(options); document.setProcessStatus(DocumentProcessStatus.READY_FOR_SEGMENT.name()); @@ -569,11 +604,108 @@ public class KnowledgeDocumentImportTaskAppService { document.setLastTaskError(null); persistDocumentTaskState(document, now); LOG.info("文档解析任务完成: taskId={}, documentId={}, processStatus={}, providerTaskId={}, contentLength={}", - task.getId(), document.getId(), DocumentProcessStatus.READY_FOR_SEGMENT.name(), providerTaskId, preferredText.length()); + task.getId(), document.getId(), DocumentProcessStatus.READY_FOR_SEGMENT.name(), providerTaskId, + parsedKnowledgeContent.documentLlmContent.length()); finishTask(task, now, DocumentImportTaskStatus.COMPLETED, null); } + /** + * 将解析结果中的图片上传到对象存储,并把 Markdown/结构化工件里的相对路径回写为可访问 URL。 + * + *

<p>该逻辑仅服务于知识库导入链路,避免把对象存储耦合下沉到通用解析 provider 或 MinerU 自身。</p>

+ * + * @param document 文档实体 + * @param parsedResult 解析结果 + * @return 已完成图片标准化的解析结果 + */ + private DocumentParsedResult normalizeParsedImagesForKnowledgeImport(tech.easyflow.ai.entity.Document document, + DocumentParsedResult parsedResult) { + if (document == null || parsedResult == null || parsedResult.getImages() == null || parsedResult.getImages().isEmpty()) { + return parsedResult; + } + Map storedImageUrls = uploadParsedImages(document, parsedResult.getImages()); + if (storedImageUrls.isEmpty()) { + return parsedResult; + } + rewriteParsedResultImageReferences(parsedResult, storedImageUrls); + /* NOTE(review): getMetadata() is dereferenced below without a null check; confirm it can never be null here. */ parsedResult.getMetadata().put(DocumentImportKeys.KEY_DOCUMENT_PARSE_IMAGE_URLS, new ArrayList(new LinkedHashSet(storedImageUrls.values()))); + parsedResult.getMetadata().put(DocumentImportKeys.KEY_DOCUMENT_PARSE_IMAGE_COUNT, parsedResult.getImages().size()); + parsedResult.getMetadata().put(DocumentImportKeys.KEY_DOCUMENT_PARSE_IMAGE_STORAGE_PREFIX, buildKnowledgeParseImagePrePath(document)); + return parsedResult; + } + + /** + * Uploads the images of a parsed result and builds the mapping from original relative path / file name to final stored URL; null images and images without content are skipped. + * + * @param document document entity + * @param images images extracted by the parser + * @return image reference mapping (empty when document is null or images is null/empty) + */ + private Map uploadParsedImages(tech.easyflow.ai.entity.Document document, + List images) { + Map storedImageUrls = new LinkedHashMap(); + if (document == null || images == null || images.isEmpty()) { + return storedImageUrls; + } + Map baseNameCounts = countImageBaseNames(images); + String imagePrePath = buildKnowledgeParseImagePrePath(document); + int imageIndex = 0; + for (DocumentImage image : images) { + if (image == null) { + imageIndex++; + continue; + } + byte[] imageBytes = resolveParsedImageContent(image); + if (imageBytes.length == 0) { + imageIndex++; + continue; + } + String fileName = resolveParsedImageFileName(image, imageIndex); + String mimeType = resolveParsedImageMimeType(image, fileName); + CustomMultipartFile multipartFile = new CustomMultipartFile(imageBytes, fileName, fileName, mimeType); + String
storedUrl = storageService.save(multipartFile, imagePrePath); + registerStoredImageUrl(storedImageUrls, image.getSourcePath(), storedUrl, baseNameCounts); + registerStoredImageUrl(storedImageUrls, image.getName(), storedUrl, baseNameCounts); + image.setSourcePath(storedUrl); + image.setDataUrl(null); + image.setContent(null); + if (!StringUtil.hasText(image.getMimeType())) { + image.setMimeType(mimeType); + } + imageIndex++; + } + return storedImageUrls; + } + + /** + * Rewrites image references in the parsed result's Markdown, blocks, tables, images and structured artifacts; when the rewritten Markdown is non-null, preferredText and plainText are replaced as well if they were blank or equal to the original Markdown. + * + * @param parsedResult parsed result, mutated in place; nothing happens when null + * @param storedImageUrls mapping from original image path to stored URL; nothing happens when null or empty + */ + private void rewriteParsedResultImageReferences(DocumentParsedResult parsedResult, + Map storedImageUrls) { + if (parsedResult == null || storedImageUrls == null || storedImageUrls.isEmpty()) { + return; + } + String originalMarkdown = parsedResult.getMarkdown(); + String rewrittenMarkdown = rewriteMarkdownImageReferences(originalMarkdown, storedImageUrls); + if (rewrittenMarkdown != null) { + parsedResult.setMarkdown(rewrittenMarkdown); + if (!StringUtil.hasText(parsedResult.getPreferredText()) || Objects.equals(parsedResult.getPreferredText(), originalMarkdown)) { + parsedResult.setPreferredText(rewrittenMarkdown); + } + if (!StringUtil.hasText(parsedResult.getPlainText()) || Objects.equals(parsedResult.getPlainText(), originalMarkdown)) { + parsedResult.setPlainText(rewrittenMarkdown); + } + } + rewriteBlockImageReferences(parsedResult.getBlocks(), storedImageUrls); + rewriteTableImageReferences(parsedResult.getTables(), storedImageUrls); + rewriteDocumentImageReferences(parsedResult.getImages(), storedImageUrls); + rewriteArtifactImageReferences(parsedResult.getArtifacts(), storedImageUrls); + } + private void markParseFailed(DocumentImportTask task, tech.easyflow.ai.entity.Document document, String errorMessage) { @@ -587,6 +719,432 @@ public class KnowledgeDocumentImportTaskAppService { finishTask(task, now, DocumentImportTaskStatus.FAILED, errorMessage); } + /** + *
统计图片文件名,便于仅在“文件名唯一”时使用 basename 兜底匹配。 + * + * @param images 图片集合 + * @return 文件名计数 + */ + private Map countImageBaseNames(List images) { + Map counts = new LinkedHashMap(); + if (images == null || images.isEmpty()) { + return counts; + } + for (DocumentImage image : images) { + if (image == null) { + continue; + } + String baseName = resolveImageBaseName(image.getSourcePath(), image.getName()); + if (!StringUtil.hasText(baseName)) { + continue; + } + counts.put(baseName, counts.getOrDefault(baseName, 0) + 1); + } + return counts; + } + + /** + * Builds the storage path prefix for the images of one document: KNOWLEDGE_PARSE_IMAGE_CATEGORY, the document segment and "images", joined with "/". + * + * @param document document entity + * @return storage prefix + */ + private String buildKnowledgeParseImagePrePath(tech.easyflow.ai.entity.Document document) { + String documentSegment = buildKnowledgeParseDocumentSegment(document); + return KNOWLEDGE_PARSE_IMAGE_CATEGORY + "/" + documentSegment + "/images"; + } + + /** + * Builds the per-document directory name: the document id ("unknown" when the document or its id is null), followed by "_" and the sanitized title without extension when that part is non-blank. + * + * @param document document entity; may be null + * @return document directory name + */ + private String buildKnowledgeParseDocumentSegment(tech.easyflow.ai.entity.Document document) { + String documentId = document == null || document.getId() == null ? "unknown" : document.getId().toString(); + String safeFileName = sanitizeStorageSegment(removeFileExtension(document == null ? null : document.getTitle())); + return StringUtil.hasText(safeFileName) ? documentId + "_" + safeFileName : documentId; + } + + /** + * Strips the file extension and keeps only the main file name; a name without a dot, or whose only dot is the first character, is returned as is. + * + * @param fileName original file name; may be null + * @return main file name, or null when the pure file name is blank + */ + private String removeFileExtension(String fileName) { + String pureFileName = fileName == null ?
null : tech.easyflow.common.filestorage.utils.PathGeneratorUtil.getPureFileName(fileName); + if (!StringUtil.hasText(pureFileName)) { + return null; + } + int dotIndex = pureFileName.lastIndexOf('.'); + if (dotIndex <= 0) { + return pureFileName; + } + return pureFileName.substring(0, dotIndex); + } + + /** + * Normalizes a directory segment into a storage-friendly name: slashes and any run of characters other than letters, digits, '_' or '-' become '_', repeated '_' collapse, leading and trailing '_' are removed, and the result is cut to at most 80 chars. + * + * @param rawSegment raw segment + * @return normalized segment; null when the input is blank, "document" when nothing is left after normalization + */ + private String sanitizeStorageSegment(String rawSegment) { + if (!StringUtil.hasText(rawSegment)) { + return null; + } + String normalized = rawSegment.trim() + .replaceAll("[\\\\/]+", "_") + .replaceAll("[^\\p{IsAlphabetic}\\p{IsDigit}_-]+", "_") + .replaceAll("_+", "_") + .replaceAll("^_+|_+$", ""); + if (!StringUtil.hasText(normalized)) { + return "document"; + } + return normalized.length() > 80 ? normalized.substring(0, 80) : normalized; + } + + /** + * Decodes the Base64 payload that follows the first comma of a data URL; invalid Base64 raises a BusinessException. NOTE(review): the caught IllegalArgumentException is not passed on as the cause. + * + * @param dataUrl data URL + * @return image bytes; an empty array when the input is blank, has no comma, or has nothing after the comma + */ + private byte[] decodeDataUrl(String dataUrl) { + if (!StringUtil.hasText(dataUrl)) { + return new byte[0]; + } + int commaIndex = dataUrl.indexOf(','); + if (commaIndex < 0 || commaIndex >= dataUrl.length() - 1) { + return new byte[0]; + } + try { + return Base64.getDecoder().decode(dataUrl.substring(commaIndex + 1)); + } catch (IllegalArgumentException exception) { + throw new BusinessException("解析文档图片数据失败"); + } + } + + /** + * Resolves the image MIME type: the image's own mimeType when non-blank, otherwise a guess derived from the file name. + * + * @param image image entity; may be null + * @param fileName file name used for the guess + * @return MIME type + */ + private String resolveParsedImageMimeType(DocumentImage image, String fileName) { + if (image != null && StringUtil.hasText(image.getMimeType())) { + return image.getMimeType(); + } + String mimeType = URLConnection.guessContentTypeFromName(fileName); + return StringUtil.hasText(mimeType) ?
mimeType : "image/png"; + } + + /** + * Generates the file name used for upload: the image's base name (from sourcePath, else name) with every run of characters other than letters, digits, '.', '_' or '-' replaced by '_'; falls back to image_N.png (N = imageIndex + 1) when blank and appends ".png" when the name has no dot. + * + * @param image image entity; may be null + * @param imageIndex zero-based position of the image + * @return file name + */ + private String resolveParsedImageFileName(DocumentImage image, int imageIndex) { + String candidate = resolveImageBaseName(image == null ? null : image.getSourcePath(), image == null ? null : image.getName()); + if (!StringUtil.hasText(candidate)) { + candidate = "image_" + (imageIndex + 1) + ".png"; + } + String pureFileName = tech.easyflow.common.filestorage.utils.PathGeneratorUtil.getPureFileName(candidate); + String normalized = pureFileName.replaceAll("[^\\p{IsAlphabetic}\\p{IsDigit}._-]+", "_"); + if (!StringUtil.hasText(normalized)) { + normalized = "image_" + (imageIndex + 1) + ".png"; + } + if (!normalized.contains(".")) { + normalized = normalized + ".png"; + } + return normalized; + } + + /** + * Registers the stored URL under the original path, under the path without its leading "./", and under the base name only when baseNameCounts records that base name exactly once; existing entries are never overwritten. + * + * @param storedImageUrls mapping to fill + * @param originalPath original path; nothing is registered when blank + * @param storedUrl final URL; nothing is registered when blank + * @param baseNameCounts occurrences per base name + */ + private void registerStoredImageUrl(Map storedImageUrls, + String originalPath, + String storedUrl, + Map baseNameCounts) { + if (!StringUtil.hasText(originalPath) || !StringUtil.hasText(storedUrl)) { + return; + } + storedImageUrls.putIfAbsent(originalPath, storedUrl); + storedImageUrls.putIfAbsent(trimLeadingCurrentPath(originalPath), storedUrl); + String baseName = resolveImageBaseName(originalPath, null); + if (StringUtil.hasText(baseName) && baseNameCounts.getOrDefault(baseName, 0) == 1) { + storedImageUrls.putIfAbsent(baseName, storedUrl); + } + } + + /** + * Rewrites the targets of Markdown image references that can be resolved to a stored URL. + * + * @param markdown Markdown content + * @param storedImageUrls image mapping + * @return rewritten Markdown; the input itself when it is blank, the mapping is null/empty, or nothing was replaced + */ + private String rewriteMarkdownImageReferences(String markdown, Map storedImageUrls) { + if (!StringUtil.hasText(markdown) || storedImageUrls == null || storedImageUrls.isEmpty()) { + return markdown; + } + Matcher matcher = MARKDOWN_IMAGE_PATTERN.matcher(markdown); + StringBuffer output = new StringBuffer(); +
boolean replaced = false; + while (matcher.find()) { + String originalPath = matcher.group(1); + String resolvedUrl = resolveStoredImageUrl(originalPath, storedImageUrls); + if (!StringUtil.hasText(resolvedUrl)) { + matcher.appendReplacement(output, Matcher.quoteReplacement(matcher.group(0))); + continue; + } + String replacedSegment = matcher.group(0).replace("(" + originalPath + ")", "(" + resolvedUrl + ")"); + matcher.appendReplacement(output, Matcher.quoteReplacement(replacedSegment)); + replaced = true; + } + if (!replaced) { + return markdown; + } + matcher.appendTail(output); + return output.toString(); + } + + /** + * Replaces each block's imagePath with its stored URL when one can be resolved; null blocks and blocks with a blank imagePath are skipped. + * + * @param blocks content blocks; may be null + * @param storedImageUrls image mapping + */ + private void rewriteBlockImageReferences(List blocks, Map storedImageUrls) { + if (blocks == null || blocks.isEmpty()) { + return; + } + for (DocumentBlock block : blocks) { + if (block == null || !StringUtil.hasText(block.getImagePath())) { + continue; + } + String resolvedUrl = resolveStoredImageUrl(block.getImagePath(), storedImageUrls); + if (StringUtil.hasText(resolvedUrl)) { + block.setImagePath(resolvedUrl); + } + } + } + + /** + * Replaces each table's imagePath with its stored URL when one can be resolved; null tables and tables with a blank imagePath are skipped. + * + * @param tables tables; may be null + * @param storedImageUrls image mapping + */ + private void rewriteTableImageReferences(List tables, Map storedImageUrls) { + if (tables == null || tables.isEmpty()) { + return; + } + for (DocumentTable table : tables) { + if (table == null || !StringUtil.hasText(table.getImagePath())) { + continue; + } + String resolvedUrl = resolveStoredImageUrl(table.getImagePath(), storedImageUrls); + if (StringUtil.hasText(resolvedUrl)) { + table.setImagePath(resolvedUrl); + } + } + } + + /** + * Replaces each image's sourcePath with its stored URL when one can be resolved. + * + * @param images images; may be null + * @param storedImageUrls image mapping + */ + private void rewriteDocumentImageReferences(List images, Map storedImageUrls) { + if (images == null || images.isEmpty()) { + return; + } + for (DocumentImage image : images) { + if (image == null || !StringUtil.hasText(image.getSourcePath())) { +
continue; + } + String resolvedUrl = resolveStoredImageUrl(image.getSourcePath(), storedImageUrls); + if (StringUtil.hasText(resolvedUrl)) { + image.setSourcePath(resolvedUrl); + } + } + } + + /** + * Rewrites image paths inside the structured artifacts: middleJson, contentList, modelOutput and every extra JSON artifact (each extra artifact is normalized first and the normalized node is stored back into its entry). + * + * @param artifacts parse artifacts; nothing happens when null + * @param storedImageUrls image mapping + */ + private void rewriteArtifactImageReferences(DocumentParseArtifacts artifacts, + Map storedImageUrls) { + if (artifacts == null) { + return; + } + rewriteImageReferencesInNode(artifacts.getMiddleJson(), storedImageUrls); + rewriteImageReferencesInNode(artifacts.getContentList(), storedImageUrls); + rewriteImageReferencesInNode(artifacts.getModelOutput(), storedImageUrls); + if (artifacts.getExtraJsonArtifacts() != null && !artifacts.getExtraJsonArtifacts().isEmpty()) { + for (Map.Entry entry : artifacts.getExtraJsonArtifacts().entrySet()) { + Object normalizedNode = normalizeArtifactNode(entry.getValue()); + rewriteImageReferencesInNode(normalizedNode, storedImageUrls); + entry.setValue(normalizedNode); + } + } + } + + /** Returns null, Map and List nodes as they are; any other node is serialized with JSON.toJSONString and parsed back with JSON.parse. */ private Object normalizeArtifactNode(Object node) { + if (node == null || node instanceof Map || node instanceof List) { + return node; + } + return JSON.parse(JSON.toJSONString(node)); + } + + /** Returns the image's content bytes when non-empty, otherwise the bytes decoded from its data URL; an empty array for a null image. */ private byte[] resolveParsedImageContent(DocumentImage image) { + if (image == null) { + return new byte[0]; + } + if (image.getContent() != null && image.getContent().length > 0) { + return image.getContent(); + } + return decodeDataUrl(image.getDataUrl()); + } + + /** + * Recursively rewrites image path fields inside a tree of Map and List nodes; nodes of any other type are left alone. + * + * @param node current node + * @param storedImageUrls image mapping + */ + @SuppressWarnings("unchecked") + private void rewriteImageReferencesInNode(Object node, Map storedImageUrls) { + if (node instanceof Map mapNode) { + for (Map.Entry entry : ((Map) mapNode).entrySet()) { + Object value = entry.getValue(); + if (entry.getKey() instanceof String key) { + if (value instanceof String stringValue && isImageReferenceKey(key)) { + String resolvedUrl = resolveStoredImageUrl(stringValue,
storedImageUrls); + if (StringUtil.hasText(resolvedUrl)) { + /* NOTE(review): this overwrites the value of the key being iterated; entry.setValue(resolvedUrl) would express the same update without calling put during iteration. */ ((Map) mapNode).put(entry.getKey(), resolvedUrl); + } + continue; + } + if (value instanceof List listValue && isImageReferenceArrayKey(key)) { + rewriteImageReferenceList(listValue, storedImageUrls); + continue; + } + } + rewriteImageReferencesInNode(value, storedImageUrls); + } + return; + } + if (node instanceof List listNode) { + for (Object item : listNode) { + rewriteImageReferencesInNode(item, storedImageUrls); + } + } + } + + /** + * Tells whether a field name denotes a single image path: "img_path", "image_path", "imagePath" or "sourcePath". + * + * @param key field name + * @return true when the field holds an image path + */ + private boolean isImageReferenceKey(String key) { + return "img_path".equals(key) + || "image_path".equals(key) + || "imagePath".equals(key) + || "sourcePath".equals(key); + } + + /** Tells whether a field name denotes a list of image paths: "sourcePaths" or DocumentImportKeys.KEY_DOCUMENT_IMAGE_REFS. */ private boolean isImageReferenceArrayKey(String key) { + return "sourcePaths".equals(key) + || DocumentImportKeys.KEY_DOCUMENT_IMAGE_REFS.equals(key); + } + + /** Replaces, in place, every String element that resolves to a stored URL; non-String elements are walked recursively. */ @SuppressWarnings("unchecked") + private void rewriteImageReferenceList(List imageReferences, Map storedImageUrls) { + for (int index = 0; index < imageReferences.size(); index++) { + Object item = imageReferences.get(index); + if (item instanceof String imagePath) { + String resolvedUrl = resolveStoredImageUrl(imagePath, storedImageUrls); + if (StringUtil.hasText(resolvedUrl)) { + ((List) imageReferences).set(index, resolvedUrl); + } + continue; + } + rewriteImageReferencesInNode(item, storedImageUrls); + } + } + + /** + * Looks up the stored URL for an original path: exact match first, then the path without its leading "./", then its base name. + * + * @param originalPath original path + * @param storedImageUrls image mapping + * @return stored URL, or null when the path is blank, the mapping is null/empty, or nothing matches + */ + private String resolveStoredImageUrl(String originalPath, Map storedImageUrls) { + if (!StringUtil.hasText(originalPath) || storedImageUrls == null || storedImageUrls.isEmpty()) { + return null; + } + String exact = storedImageUrls.get(originalPath); + if (StringUtil.hasText(exact)) { + return exact; + } + String trimmedPath = trimLeadingCurrentPath(originalPath); + exact = storedImageUrls.get(trimmedPath); + if (StringUtil.hasText(exact)) { +
return exact; + } + String baseName = resolveImageBaseName(originalPath, null); + if (StringUtil.hasText(baseName)) { + return storedImageUrls.get(baseName); + } + return null; + } + + /** + * Returns the pure file name of sourcePath, or of fallbackName when sourcePath is blank, after dropping a leading "./". + * + * @param sourcePath original path + * @param fallbackName name used when sourcePath is blank + * @return base file name, or null when both inputs are blank + */ + private String resolveImageBaseName(String sourcePath, String fallbackName) { + String candidate = StringUtil.hasText(sourcePath) ? sourcePath : fallbackName; + if (!StringUtil.hasText(candidate)) { + return null; + } + return tech.easyflow.common.filestorage.utils.PathGeneratorUtil.getPureFileName(trimLeadingCurrentPath(candidate)); + } + + /** + * Removes a leading "./" (a dot followed by one or more slashes) from a Markdown relative path so lookups match uniformly. + * + * @param path original path + * @return normalized path; a blank input is returned unchanged + */ + private String trimLeadingCurrentPath(String path) { + if (!StringUtil.hasText(path)) { + return path; + } + return path.replaceFirst("^\\./+", ""); + } + private void markIndexCompleted(DocumentImportTask task, tech.easyflow.ai.entity.Document document, int totalChunks) { @@ -684,9 +1242,40 @@ public class KnowledgeDocumentImportTaskAppService { document.setProcessStatus(DocumentProcessStatus.PARSING.name()); document.setLastTaskError(null); document.setProgressPercent(0); + Map options = copyOptions(document.getOptions()); + clearDocumentParseProgress(options); + document.setOptions(options); persistDocumentTaskState(document, now); } + private void updateDocumentParseProgress(tech.easyflow.ai.entity.Document document, + DocumentParseTaskInfo taskInfo) { + if (document == null || taskInfo == null) { + return; + } + tech.easyflow.ai.entity.Document current = requireDocument(document.getId()); + Map options = copyOptions(current.getOptions()); + options.put(DocumentImportKeys.KEY_DOCUMENT_PROVIDER_TASK_ID, taskInfo.getTaskId()); + if (taskInfo.getProgressPercent() != null) { + current.setProgressPercent(taskInfo.getProgressPercent()); + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_PROGRESS_PERCENT, taskInfo.getProgressPercent()); + } + if
(StringUtil.hasText(taskInfo.getCurrentStage())) { + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_CURRENT_STAGE, taskInfo.getCurrentStage()); + } + if (taskInfo.getProcessedItems() != null) { + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_PROCESSED_ITEMS, taskInfo.getProcessedItems()); + } + if (taskInfo.getTotalItems() != null) { + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_TOTAL_ITEMS, taskInfo.getTotalItems()); + } + if (StringUtil.hasText(taskInfo.getStatusMessage())) { + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_STATUS_MESSAGE, taskInfo.getStatusMessage()); + } + current.setOptions(options); + persistDocumentTaskState(current, new Date()); + } + /** * 持久化文档任务状态并推送局部状态刷新事件。 * @@ -726,6 +1315,10 @@ public class KnowledgeDocumentImportTaskAppService { if (!StringUtil.hasText(document.getContent())) { throw new BusinessException("文档尚未完成解析"); } + String sourceFormat = normalizeSourceFormat(document); + if (isOfficeDocument(sourceFormat)) { + return buildOfficePreviewSession(knowledge, document, requestedStrategy, sourceFormat); + } AnalysisResult analysis = ragIngestionService.analyze(document.getContent(), normalizeSourceFormat(document)); StrategyConfig strategyConfig = resolveStrategyConfig(knowledge, requestedStrategy, analysis); List previewChunks = ragIngestionService.split(analysis, strategyConfig); @@ -749,6 +1342,633 @@ public class KnowledgeDocumentImportTaskAppService { return session; } + private DocumentImportDtos.PreviewSession buildOfficePreviewSession(DocumentCollection knowledge, + tech.easyflow.ai.entity.Document document, + StrategyConfig requestedStrategy, + String sourceFormat) { + StrategyConfig strategyConfig = resolveOfficeStrategyConfig(document, requestedStrategy, sourceFormat); + List documentChunks = buildOfficeDocumentChunks(document, sourceFormat, strategyConfig); + if (documentChunks.isEmpty()) { + throw new BusinessException("当前 Office 文档未生成有效分块"); + } + AnalysisResult analysis = buildOfficeAnalysis(document, 
sourceFormat, strategyConfig, documentChunks.size()); + + DocumentImportDtos.PreviewSession session = new DocumentImportDtos.PreviewSession(); + session.setKnowledgeId(knowledge.getId()); + session.setDocumentId(document.getId()); + session.setFilePath(document.getDocumentPath()); + session.setFileName(document.getTitle()); + session.setSourceFormat(sourceFormat); + session.setStrategyConfig(strategyConfig); + session.setAnalysis(analysis); + session.setDocument(document); + session.setDocumentChunks(documentChunks); + session.setPreviewChunks(new ArrayList()); + session.setCreatedAt(new Date()); + return session; + } + + private StrategyConfig resolveOfficeStrategyConfig(tech.easyflow.ai.entity.Document document, + StrategyConfig requestedStrategy, + String sourceFormat) { + StrategyConfig config = StrategyConfig.defaults(); + if (requestedStrategy != null) { + mergeStrategyConfig(config, requestedStrategy); + } + if ("pptx".equals(sourceFormat)) { + config.setStrategyCode(OFFICE_PPTX_PAGE_STRATEGY); + config.setRowsPerChunk(null); + return config; + } + Integer rowsPerChunk = requestedStrategy == null ? null : requestedStrategy.getRowsPerChunk(); + if (rowsPerChunk == null || rowsPerChunk <= 0) { + StrategyConfig storedStrategy = null; + try { + storedStrategy = readStoredStrategy(document); + } catch (Exception ignored) { + storedStrategy = null; + } + rowsPerChunk = storedStrategy == null ? 
null : storedStrategy.getRowsPerChunk(); + } + if (rowsPerChunk == null || rowsPerChunk <= 0) { + /* still no positive value at this point: fall back to 10 rows per chunk */ rowsPerChunk = 10; + } + config.setStrategyCode(OFFICE_XLSX_ROW_WINDOW_STRATEGY); + config.setRowsPerChunk(rowsPerChunk); + return config; + } + + /** Builds an AnalysisResult for an Office document with confidence fixed at 1.0, the strategy code and label taken from strategyConfig, and the chunk count, rowsPerChunk and source format recorded as features. */ private AnalysisResult buildOfficeAnalysis(tech.easyflow.ai.entity.Document document, + String sourceFormat, + StrategyConfig strategyConfig, + int totalChunks) { + AnalysisResult analysis = new AnalysisResult(); + analysis.setSourceFormat(sourceFormat); + analysis.setNormalizedContent(document.getContent()); + analysis.setRecommendedStrategyCode(strategyConfig.getStrategyCode()); + analysis.setRecommendedStrategyLabel(resolveStrategyLabel(sourceFormat, strategyConfig)); + analysis.setConfidence(1.0D); + analysis.getFeatures().put("officeChunkCount", totalChunks); + analysis.getFeatures().put("rowsPerChunk", strategyConfig.getRowsPerChunk()); + analysis.getFeatures().put("sourceFormat", sourceFormat); + return analysis; + } + + private ParsedKnowledgeContent buildParsedKnowledgeContent(tech.easyflow.ai.entity.Document document, + DocumentParsedResult parsedResult, + String sourceFormat) { + String renderMarkdown = StringUtil.hasText(parsedResult.getMarkdown()) + ?
parsedResult.getMarkdown() + : resolvePreferredText(parsedResult); + Map parseArtifactSummary = extractParseArtifactSummary(parsedResult, sourceFormat); + if (!isOfficeDocument(sourceFormat)) { + return new ParsedKnowledgeContent(resolvePreferredText(parsedResult), renderMarkdown, parseArtifactSummary); + } + List documentChunks = buildOfficeDocumentChunks(document, sourceFormat, + resolveOfficeStrategyConfig(document, null, sourceFormat), parseArtifactSummary); + StringBuilder llmContent = new StringBuilder(); + for (DocumentChunk chunk : documentChunks) { + if (!StringUtil.hasText(chunk.getContent())) { + continue; + } + if (llmContent.length() > 0) { + llmContent.append("\n\n"); + } + llmContent.append(chunk.getContent().trim()); + } + return new ParsedKnowledgeContent(llmContent.toString().trim(), renderMarkdown, parseArtifactSummary); + } + + /** Returns a copy of the extra JSON artifact stored under the sourceFormat key when it normalizes to a Map; otherwise (including a null result or missing artifacts) an empty map. */ private Map extractParseArtifactSummary(DocumentParsedResult parsedResult, + String sourceFormat) { + if (parsedResult == null || parsedResult.getArtifacts() == null + || parsedResult.getArtifacts().getExtraJsonArtifacts() == null) { + return new LinkedHashMap(); + } + Object artifact = parsedResult.getArtifacts().getExtraJsonArtifacts().get(sourceFormat); + Object normalizedArtifact = normalizeArtifactNode(artifact); + if (normalizedArtifact instanceof Map map) { + return new LinkedHashMap((Map) map); + } + return new LinkedHashMap(); + } + + /** Tells whether the source format is one of the Office formats handled here: "pptx" or "xlsx". */ private boolean isOfficeDocument(String sourceFormat) { + return "pptx".equals(sourceFormat) || "xlsx".equals(sourceFormat); + } + + /** Overload that reads the parse artifact summary from the document's options (an empty map when it is absent or not a Map) and delegates to the four-argument variant. */ private List buildOfficeDocumentChunks(tech.easyflow.ai.entity.Document document, + String sourceFormat, + StrategyConfig strategyConfig) { + Object rawArtifactSummary = document.getOptions() == null + ? null + : document.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_PARSE_ARTIFACT_SUMMARY); + Map parseArtifactSummary = rawArtifactSummary instanceof Map + ?
new LinkedHashMap((Map) rawArtifactSummary) + : new LinkedHashMap(); + return buildOfficeDocumentChunks(document, sourceFormat, strategyConfig, parseArtifactSummary); + } + + /** Dispatches chunk building by format: "pptx" goes to buildPptxDocumentChunks, "xlsx" goes to buildXlsxDocumentChunks with rowsPerChunk defaulting to 10 and raised to at least 1; any other format yields an empty list. */ private List buildOfficeDocumentChunks(tech.easyflow.ai.entity.Document document, + String sourceFormat, + StrategyConfig strategyConfig, + Map parseArtifactSummary) { + if ("pptx".equals(sourceFormat)) { + return buildPptxDocumentChunks(document, parseArtifactSummary); + } + if ("xlsx".equals(sourceFormat)) { + int rowsPerChunk = strategyConfig == null || strategyConfig.getRowsPerChunk() == null + ? 10 + : Math.max(1, strategyConfig.getRowsPerChunk()); + return buildXlsxDocumentChunks(document, parseArtifactSummary, rowsPerChunk); + } + return new ArrayList(); + } + + /** Builds one chunk per entry of the "slides" list in the artifact summary; when that list is empty, falls back to a single "Slide 1" chunk built from the document render Markdown, if there is any. */ @SuppressWarnings("unchecked") + private List buildPptxDocumentChunks(tech.easyflow.ai.entity.Document document, + Map parseArtifactSummary) { + List chunks = new ArrayList(); + List> slides = readMapList(parseArtifactSummary.get("slides")); + if (slides.isEmpty()) { + String renderMarkdown = readDocumentRenderMarkdown(document); + if (StringUtil.hasText(renderMarkdown)) { + chunks.add(buildOfficeChunk(document, 1, stripMarkdownImageSyntax(renderMarkdown), renderMarkdown, + /* NOTE(review): double-brace initialization creates an anonymous LinkedHashMap subclass that captures the enclosing instance; a plain map filled with put() would avoid that. */ new LinkedHashMap() {{ + put("chunkType", RagChunkTypes.SECTION); + put("sourceLabel", "Slide 1"); + put("headingPath", Collections.singletonList("Slide 1")); + }})); + } + return chunks; + } + int sorting = 1; + for (Map slide : slides) { + int slideIndex = asInteger(slide.get("slideIndex"), sorting - 1); + String sourceLabel = "Slide " + (slideIndex + 1); + String title = asString(slide.get("title")); + String ocrMarkdown = asString(slide.get("ocrMarkdown")); + String imagePath = asString(slide.get("imagePath")); + String imageName = asString(slide.get("imageName")); + String renderMarkdown = buildPptxChunkRenderMarkdown(slideIndex, title, imageName, imagePath, ocrMarkdown); + String llmContent = buildPptxChunkLlmContent(slideIndex, title, ocrMarkdown); +
Map options = new LinkedHashMap(); + options.put("chunkType", RagChunkTypes.SECTION); + options.put("sourceLabel", sourceLabel); + options.put("headingPath", buildHeadingPath(sourceLabel, title)); + options.put("charCount", llmContent.length()); + options.put("tokenEstimate", Math.max(1, llmContent.length() / 4)); + options.put("partNo", 1); + options.put("partTotal", 1); + options.put("warnings", readStringList(slide.get("warnings"))); + options.put(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN, renderMarkdown); + options.put(DocumentImportKeys.KEY_DOCUMENT_PAGE_INDEX, slideIndex + 1); + options.put(DocumentImportKeys.KEY_DOCUMENT_IMAGE_REFS, imagePath == null + ? new ArrayList() + : new ArrayList(Collections.singletonList(imagePath))); + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_ARTIFACT_SUMMARY, new LinkedHashMap(slide)); + chunks.add(buildOfficeChunk(document, sorting++, llmContent, renderMarkdown, options)); + } + return chunks; + } + + private List buildXlsxDocumentChunks(tech.easyflow.ai.entity.Document document, + Map parseArtifactSummary, + int rowsPerChunk) { + List chunks = new ArrayList(); + List> sheets = readMapList(parseArtifactSummary.get("sheets")); + List> cellImages = readMapList(parseArtifactSummary.get("cellImages")); + int sorting = 1; + for (Map sheet : sheets) { + String sheetName = asString(sheet.get("sheetName")); + List> sheetRows = readMapList(sheet.get("rows")); + List> sheetImages = filterSheetImages(cellImages, sheetName); + if (sheetRows.isEmpty()) { + if (sheetImages.isEmpty()) { + continue; + } + chunks.add(buildXlsxImageOnlyChunk(document, sorting++, sheetName, sheet, sheetImages)); + continue; + } + Map headerRow = sheetRows.get(0); + List> dataRows = sheetRows.size() > 1 + ? 
new ArrayList>(sheetRows.subList(1, sheetRows.size())) + : new ArrayList>(); + if (dataRows.isEmpty()) { + chunks.add(buildXlsxWindowChunk(document, sorting++, sheetName, sheet, headerRow, + new ArrayList>(), sheetImages)); + continue; + } + for (int start = 0; start < dataRows.size(); start += rowsPerChunk) { + int end = Math.min(start + rowsPerChunk, dataRows.size()); + List> windowRows = new ArrayList>(dataRows.subList(start, end)); + chunks.add(buildXlsxWindowChunk(document, sorting++, sheetName, sheet, headerRow, windowRows, sheetImages)); + } + } + if (chunks.isEmpty()) { + String renderMarkdown = readDocumentRenderMarkdown(document); + if (StringUtil.hasText(renderMarkdown)) { + Map options = new LinkedHashMap(); + options.put("chunkType", RagChunkTypes.SECTION); + options.put("sourceLabel", document.getTitle()); + options.put("headingPath", Collections.singletonList(document.getTitle())); + chunks.add(buildOfficeChunk(document, 1, stripMarkdownImageSyntax(renderMarkdown), renderMarkdown, options)); + } + } + return chunks; + } + + private DocumentChunk buildXlsxWindowChunk(tech.easyflow.ai.entity.Document document, + int sorting, + String sheetName, + Map sheetSummary, + Map headerRow, + List> windowRows, + List> sheetImages) { + int headerRowIndex = asInteger(headerRow.get("rowIndex"), 0); + int rowStart = windowRows.isEmpty() ? headerRowIndex + 1 : asInteger(windowRows.get(0).get("rowIndex"), headerRowIndex) + 1; + int rowEnd = windowRows.isEmpty() + ? 
rowStart + : asInteger(windowRows.get(windowRows.size() - 1).get("rowIndex"), rowStart - 1) + 1; + List> selectedImages = selectWindowImages(sheetImages, rowStart, rowEnd, windowRows.isEmpty()); + String renderMarkdown = buildXlsxChunkRenderMarkdown(sheetName, headerRow, windowRows, selectedImages); + String llmContent = buildXlsxChunkLlmContent(sheetName, headerRow, windowRows, selectedImages); + Map options = new LinkedHashMap(); + options.put("chunkType", RagChunkTypes.SECTION); + options.put("sourceLabel", sheetName + " · " + rowStart + "-" + rowEnd + " 行"); + options.put("headingPath", Collections.singletonList(sheetName)); + options.put("charCount", llmContent.length()); + options.put("tokenEstimate", Math.max(1, llmContent.length() / 4)); + options.put("partNo", 1); + options.put("partTotal", 1); + options.put("warnings", new ArrayList()); + options.put(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN, renderMarkdown); + options.put(DocumentImportKeys.KEY_DOCUMENT_SHEET_NAME, sheetName); + options.put(DocumentImportKeys.KEY_DOCUMENT_ROW_START, rowStart); + options.put(DocumentImportKeys.KEY_DOCUMENT_ROW_END, rowEnd); + options.put(DocumentImportKeys.KEY_DOCUMENT_IMAGE_REFS, collectImageRefs(selectedImages)); + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_ARTIFACT_SUMMARY, buildXlsxChunkSummary(sheetSummary, rowStart, rowEnd, selectedImages)); + return buildOfficeChunk(document, sorting, llmContent, renderMarkdown, options); + } + + private DocumentChunk buildXlsxImageOnlyChunk(tech.easyflow.ai.entity.Document document, + int sorting, + String sheetName, + Map sheetSummary, + List> sheetImages) { + String renderMarkdown = buildXlsxImageOnlyRenderMarkdown(sheetName, sheetImages); + String llmContent = buildXlsxImageOnlyLlmContent(sheetName, sheetImages); + Map options = new LinkedHashMap(); + options.put("chunkType", RagChunkTypes.SECTION); + options.put("sourceLabel", sheetName); + options.put("headingPath", Collections.singletonList(sheetName)); + 
options.put("charCount", llmContent.length()); + options.put("tokenEstimate", Math.max(1, llmContent.length() / 4)); + options.put("partNo", 1); + options.put("partTotal", 1); + options.put("warnings", new ArrayList()); + options.put(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN, renderMarkdown); + options.put(DocumentImportKeys.KEY_DOCUMENT_SHEET_NAME, sheetName); + options.put(DocumentImportKeys.KEY_DOCUMENT_IMAGE_REFS, collectImageRefs(sheetImages)); + options.put(DocumentImportKeys.KEY_DOCUMENT_PARSE_ARTIFACT_SUMMARY, buildXlsxChunkSummary(sheetSummary, null, null, sheetImages)); + return buildOfficeChunk(document, sorting, llmContent, renderMarkdown, options); + } + + private DocumentChunk buildOfficeChunk(tech.easyflow.ai.entity.Document document, + int sorting, + String llmContent, + String renderMarkdown, + Map options) { + DocumentChunk chunk = new DocumentChunk(); + chunk.setId(generateId(chunk)); + chunk.setDocumentId(document.getId()); + chunk.setDocumentCollectionId(document.getCollectionId()); + chunk.setSorting(sorting); + chunk.setContent(llmContent); + Map normalizedOptions = options == null + ? new LinkedHashMap() + : new LinkedHashMap(options); + normalizedOptions.put(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN, renderMarkdown); + chunk.setOptions(normalizedOptions); + return chunk; + } + + private String buildPptxChunkRenderMarkdown(int slideIndex, + String title, + String imageName, + String imagePath, + String ocrMarkdown) { + StringBuilder builder = new StringBuilder(); + builder.append("# Slide ").append(slideIndex + 1).append("\n\n"); + if (StringUtil.hasText(title)) { + builder.append("## ").append(title.trim()).append("\n\n"); + } + if (StringUtil.hasText(imagePath)) { + builder.append("![") + .append(StringUtil.hasText(imageName) ? 
imageName : "slide-" + (slideIndex + 1)) + .append("](") + .append(imagePath) + .append(")\n\n"); + } + if (StringUtil.hasText(ocrMarkdown)) { + builder.append(ocrMarkdown.trim()); + } + return builder.toString().trim(); + } + + private String buildPptxChunkLlmContent(int slideIndex, + String title, + String ocrMarkdown) { + StringBuilder builder = new StringBuilder(); + builder.append("Slide ").append(slideIndex + 1); + if (StringUtil.hasText(title)) { + builder.append("\n标题:").append(title.trim()); + } + if (StringUtil.hasText(ocrMarkdown)) { + builder.append("\n\n").append(stripMarkdownImageSyntax(ocrMarkdown)); + } + return builder.toString().trim(); + } + + private String buildXlsxChunkRenderMarkdown(String sheetName, + Map headerRow, + List> windowRows, + List> selectedImages) { + StringBuilder builder = new StringBuilder(); + builder.append("# ").append(sheetName).append("\n\n"); + builder.append(buildMarkdownTable(headerRow, windowRows)); + appendXlsxImageAppendix(builder, sheetName, selectedImages); + return builder.toString().trim(); + } + + private String buildXlsxChunkLlmContent(String sheetName, + Map headerRow, + List> windowRows, + List> selectedImages) { + StringBuilder builder = new StringBuilder(); + builder.append("# ").append(sheetName).append("\n\n"); + builder.append(buildMarkdownTable(headerRow, windowRows)); + if (!selectedImages.isEmpty()) { + builder.append("\n\n图片 OCR:\n"); + for (Map image : selectedImages) { + builder.append("- ") + .append(asString(image.get("anchorCell"))) + .append(":") + .append(StringUtil.hasText(asString(image.get("ocrText"))) ? 
asString(image.get("ocrText")) : "无文本") + .append("\n"); + } + } + return stripMarkdownImageSyntax(builder.toString().trim()); + } + + private String buildXlsxImageOnlyRenderMarkdown(String sheetName, + List> sheetImages) { + StringBuilder builder = new StringBuilder(); + builder.append("# ").append(sheetName).append("\n\n"); + builder.append("## ").append(sheetName).append(" 图片内容\n\n"); + for (Map image : sheetImages) { + builder.append("[IMG:").append(asString(image.get("referenceKey"))).append("]\n\n"); + } + appendXlsxImageAppendix(builder, sheetName, sheetImages); + return builder.toString().trim(); + } + + private String buildXlsxImageOnlyLlmContent(String sheetName, + List> sheetImages) { + StringBuilder builder = new StringBuilder(); + builder.append("# ").append(sheetName).append("\n\n图片 OCR:\n"); + for (Map image : sheetImages) { + builder.append("- ") + .append(asString(image.get("anchorCell"))) + .append(":") + .append(StringUtil.hasText(asString(image.get("ocrText"))) ? asString(image.get("ocrText")) : "无文本") + .append("\n"); + } + return builder.toString().trim(); + } + + private String buildMarkdownTable(Map headerRow, + List> windowRows) { + List> tableRows = new ArrayList>(); + tableRows.add(headerRow); + tableRows.addAll(windowRows); + int maxCol = resolveMaxColumnCount(tableRows); + if (maxCol <= 0) { + return "_empty sheet_"; + } + List headerCells = resolveMarkdownRowValues(headerRow, maxCol, true); + StringBuilder builder = new StringBuilder(); + builder.append("| ").append(joinMarkdownCells(headerCells)).append(" |\n"); + builder.append("|"); + for (int index = 0; index < maxCol; index++) { + builder.append(" --- |"); + } + builder.append("\n"); + for (Map row : windowRows) { + builder.append("| ").append(joinMarkdownCells(resolveMarkdownRowValues(row, maxCol, false))).append(" |\n"); + } + return builder.toString().trim(); + } + + private void appendXlsxImageAppendix(StringBuilder builder, + String sheetName, + List> images) { + if (images 
== null || images.isEmpty()) { + return; + } + builder.append("\n\n## ").append(sheetName).append(" 图片说明\n\n"); + for (Map image : images) { + String referenceKey = asString(image.get("referenceKey")); + String sourcePath = asString(image.get("sourcePath")); + builder.append("![") + .append(referenceKey) + .append("](") + .append(sourcePath) + .append(")\n\n"); + builder.append("- 占位符:[IMG:").append(referenceKey).append("]\n"); + builder.append("- 锚点:").append(asString(image.get("anchorCell"))).append("\n"); + builder.append("- OCR:") + .append(StringUtil.hasText(asString(image.get("ocrText"))) ? asString(image.get("ocrText")) : "") + .append("\n\n"); + } + } + + private int resolveMaxColumnCount(List> rows) { + int maxCol = 0; + for (Map row : rows) { + List> cells = readMapList(row == null ? null : row.get("cells")); + for (Map cell : cells) { + maxCol = Math.max(maxCol, asInteger(cell.get("columnIndex"), 0) + 1); + } + } + return maxCol; + } + + private List resolveMarkdownRowValues(Map row, + int maxCol, + boolean headerRow) { + List values = new ArrayList(Collections.nCopies(maxCol, "")); + if (row == null) { + return values; + } + for (Map cell : readMapList(row.get("cells"))) { + int colIndex = asInteger(cell.get("columnIndex"), 0); + if (colIndex < 0 || colIndex >= maxCol) { + continue; + } + String baseText = asString(cell.get("text")); + String mergedText = mergeCellTextWithImages(baseText, readStringList(cell.get("imageKeys"))); + if (headerRow && !StringUtil.hasText(mergedText)) { + mergedText = "列" + (colIndex + 1); + } + values.set(colIndex, escapeMarkdownCell(mergedText)); + } + return values; + } + + private String mergeCellTextWithImages(String baseText, List imageKeys) { + StringBuilder builder = new StringBuilder(); + if (StringUtil.hasText(baseText)) { + builder.append(baseText.trim()); + } + for (String imageKey : imageKeys) { + if (!StringUtil.hasText(imageKey)) { + continue; + } + if (builder.length() > 0) { + builder.append('\n'); + } + 
builder.append("[IMG:").append(imageKey).append("]"); + } + return builder.toString(); + } + + private String escapeMarkdownCell(String text) { + if (!StringUtil.hasText(text)) { + return ""; + } + return text.replace("|", "\\|").replace("\r", " ").replace("\n", "
"); + } + + private String joinMarkdownCells(List cells) { + StringBuilder builder = new StringBuilder(); + for (int index = 0; index < cells.size(); index++) { + if (index > 0) { + builder.append(" | "); + } + builder.append(cells.get(index)); + } + return builder.toString(); + } + + private List> filterSheetImages(List> cellImages, + String sheetName) { + List> result = new ArrayList>(); + for (Map image : cellImages) { + if (Objects.equals(sheetName, asString(image.get("sheetName")))) { + result.add(new LinkedHashMap(image)); + } + } + return result; + } + + private List> selectWindowImages(List> sheetImages, + int rowStart, + int rowEnd, + boolean noDataRows) { + List> selected = new ArrayList>(); + for (Map image : sheetImages) { + int imageRow = asInteger(image.get("fromRow"), 0) + 1; + if (noDataRows || (imageRow >= rowStart && imageRow <= rowEnd) || (imageRow < rowStart && rowStart <= 2)) { + selected.add(new LinkedHashMap(image)); + } + } + return selected; + } + + private List collectImageRefs(List> images) { + List refs = new ArrayList(); + for (Map image : images) { + String sourcePath = asString(image.get("sourcePath")); + if (StringUtil.hasText(sourcePath)) { + refs.add(sourcePath); + } + } + return refs; + } + + private Map buildXlsxChunkSummary(Map sheetSummary, + Integer rowStart, + Integer rowEnd, + List> images) { + Map summary = new LinkedHashMap(); + summary.put("sheetName", asString(sheetSummary.get("sheetName"))); + summary.put("sheetIndex", sheetSummary.get("sheetIndex")); + summary.put(DocumentImportKeys.KEY_DOCUMENT_ROW_START, rowStart); + summary.put(DocumentImportKeys.KEY_DOCUMENT_ROW_END, rowEnd); + summary.put("images", new ArrayList>(images)); + return summary; + } + + private List> readMapList(Object value) { + List> result = new ArrayList>(); + if (!(value instanceof List rawList)) { + return result; + } + for (Object item : rawList) { + if (item instanceof Map mapItem) { + result.add(new LinkedHashMap((Map) mapItem)); + } + } + 
return result; + } + + private List readStringList(Object value) { + List result = new ArrayList(); + if (!(value instanceof List rawList)) { + return result; + } + for (Object item : rawList) { + if (StringUtil.hasText(asString(item))) { + result.add(asString(item)); + } + } + return result; + } + + private List buildHeadingPath(String sourceLabel, String title) { + List headingPath = new ArrayList(); + headingPath.add(sourceLabel); + if (StringUtil.hasText(title)) { + headingPath.add(title.trim()); + } + return headingPath; + } + + private String readDocumentRenderMarkdown(tech.easyflow.ai.entity.Document document) { + return document == null || document.getOptions() == null + ? null + : asString(document.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN)); + } + + private String stripMarkdownImageSyntax(String markdown) { + if (!StringUtil.hasText(markdown)) { + return markdown; + } + return markdown.replaceAll("!\\[[^\\]]*\\]\\([^)]*\\)", "").trim(); + } + + private String resolveStrategyLabel(String sourceFormat, StrategyConfig strategyConfig) { + String strategyCode = strategyConfig == null ? null : strategyConfig.getStrategyCode(); + if ("pptx".equals(sourceFormat) || OFFICE_PPTX_PAGE_STRATEGY.equals(strategyCode)) { + return "按页分块"; + } + if ("xlsx".equals(sourceFormat) || OFFICE_XLSX_ROW_WINDOW_STRATEGY.equals(strategyCode)) { + return "按 Sheet / 行窗口"; + } + return ragIngestionService.toStrategyLabel(strategyCode); + } + /** * 组装处理页预览响应。 * @@ -766,11 +1986,11 @@ public class KnowledgeDocumentImportTaskAppService { item.setFileName(document.getTitle()); item.setNormalizedContent(session.getAnalysis() == null ? 
null : session.getAnalysis().getNormalizedContent()); item.setStrategyCode(session.getStrategyConfig().getStrategyCode()); - item.setStrategyLabel(ragIngestionService.toStrategyLabel(session.getStrategyConfig().getStrategyCode())); + item.setStrategyLabel(resolveStrategyLabel(session.getSourceFormat(), session.getStrategyConfig())); item.setAnalysis(session.getAnalysis()); item.setTotalChunks(session.getDocumentChunks().size()); - item.setTotalWarnings(countWarnings(session.getPreviewChunks())); - item.setChunks(toPreviewChunkResults(session.getPreviewChunks())); + item.setTotalWarnings(countChunkWarnings(session.getDocumentChunks())); + item.setChunks(toPreviewChunkResults(session.getDocumentChunks())); return item; } @@ -778,9 +1998,10 @@ public class KnowledgeDocumentImportTaskAppService { DocumentImportDtos.PreviewSession session) { Map options = copyOptions(document.getOptions()); options.put(DocumentImportKeys.KEY_DOCUMENT_STRATEGY_CODE, session.getStrategyConfig().getStrategyCode()); - options.put(DocumentImportKeys.KEY_DOCUMENT_STRATEGY_LABEL, ragIngestionService.toStrategyLabel(session.getStrategyConfig().getStrategyCode())); + options.put(DocumentImportKeys.KEY_DOCUMENT_STRATEGY_LABEL, resolveStrategyLabel(session.getSourceFormat(), session.getStrategyConfig())); options.put(DocumentImportKeys.KEY_DOCUMENT_STRATEGY_SNAPSHOT, strategyConfigToMap(session.getStrategyConfig())); - options.put(DocumentImportKeys.KEY_DOCUMENT_ANALYSIS_SUMMARY, session.getAnalysis().getFeatures()); + options.put(DocumentImportKeys.KEY_DOCUMENT_ANALYSIS_SUMMARY, + session.getAnalysis() == null ? 
new LinkedHashMap() : session.getAnalysis().getFeatures()); options.put(DocumentImportKeys.KEY_DOCUMENT_SOURCE_FILE_EXT, session.getSourceFormat()); options.put(DocumentImportKeys.KEY_DOCUMENT_PREVIEW_VERSION, "v2"); @@ -1085,8 +2306,8 @@ public class KnowledgeDocumentImportTaskAppService { } private void assertSupportedImportFile(String fileExt) { - if (!Arrays.asList("pdf", "docx", "txt", "md").contains(fileExt)) { - throw new BusinessException("当前仅支持 pdf/docx/txt/md 文档导入"); + if (!Arrays.asList("pdf", "docx", "txt", "md", "pptx", "xlsx").contains(fileExt)) { + throw new BusinessException("当前仅支持 pdf/docx/txt/md/pptx/xlsx 文档导入"); } } @@ -1097,7 +2318,7 @@ public class KnowledgeDocumentImportTaskAppService { * @return 是否走桥接解析 */ private boolean shouldUseDocumentParseBridge(String fileExt) { - return "pdf".equals(fileExt) || "docx".equals(fileExt); + return "pdf".equals(fileExt) || "docx".equals(fileExt) || "pptx".equals(fileExt) || "xlsx".equals(fileExt); } /** @@ -1161,6 +2382,7 @@ public class KnowledgeDocumentImportTaskAppService { if (isTaskFailed(providerStatus)) { throw new BusinessException(taskInfo == null ? 
"文档解析失败" : taskInfo.getError()); } + updateDocumentParseProgress(document, taskInfo); touchRunningTask(task); } catch (BusinessException e) { throw e; @@ -1200,6 +2422,12 @@ public class KnowledgeDocumentImportTaskAppService { if ("docx".equals(fileExt)) { return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"; } + if ("pptx".equals(fileExt)) { + return "application/vnd.openxmlformats-officedocument.presentationml.presentation"; + } + if ("xlsx".equals(fileExt)) { + return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; + } return null; } @@ -1399,13 +2627,14 @@ public class KnowledgeDocumentImportTaskAppService { return map; } - private int countWarnings(List chunks) { + private int countChunkWarnings(List chunks) { int total = 0; if (chunks == null) { return 0; } - for (RagChunk chunk : chunks) { - total += chunk.getWarnings() == null ? 0 : chunk.getWarnings().size(); + for (DocumentChunk chunk : chunks) { + List warnings = readChunkWarnings(chunk); + total += warnings.size(); } return total; } @@ -1416,25 +2645,29 @@ public class KnowledgeDocumentImportTaskAppService { * @param chunks 分块列表 * @return 预览分块结果 */ - private List toPreviewChunkResults(List chunks) { + private List toPreviewChunkResults(List chunks) { List result = new ArrayList(); if (chunks == null) { return result; } - for (RagChunk chunk : chunks) { + for (DocumentChunk chunk : chunks) { DocumentImportDtos.PreviewChunkResult item = new DocumentImportDtos.PreviewChunkResult(); - item.setAnswer(chunk.getAnswer()); - item.setCharCount(chunk.getCharCount()); - item.setChunkId(chunk.getChunkId()); - item.setChunkType(chunk.getChunkType()); + Map options = chunk.getOptions() == null + ? Collections.emptyMap() + : chunk.getOptions(); + item.setAnswer(asString(options.get("qaAnswer"))); + item.setCharCount(asInteger(options.get("charCount"), 0)); + item.setChunkId(chunk.getId() == null ? 
null : String.valueOf(chunk.getId())); + item.setChunkType(asString(options.get("chunkType"))); item.setContent(chunk.getContent()); - item.setHeadingPath(chunk.getHeadingPath() == null ? new ArrayList() : new ArrayList(chunk.getHeadingPath())); - item.setPartNo(chunk.getPartNo()); - item.setPartTotal(chunk.getPartTotal()); - item.setQuestion(chunk.getQuestion()); - item.setSourceLabel(chunk.getSourceLabel()); - item.setTokenEstimate(chunk.getTokenEstimate()); - item.setWarnings(chunk.getWarnings() == null ? new ArrayList() : new ArrayList(chunk.getWarnings())); + item.setRenderMarkdown(asString(options.get(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN))); + item.setHeadingPath(readStringList(options.get("headingPath"))); + item.setPartNo(asInteger(options.get("partNo"), 1)); + item.setPartTotal(asInteger(options.get("partTotal"), 1)); + item.setQuestion(asString(options.get("qaQuestion"))); + item.setSourceLabel(asString(options.get("sourceLabel"))); + item.setTokenEstimate(asInteger(options.get("tokenEstimate"), 0)); + item.setWarnings(readChunkWarnings(chunk)); item.setSourceRanges(copySourceRanges(chunk)); result.add(item); } @@ -1500,6 +2733,48 @@ public class KnowledgeDocumentImportTaskAppService { return result; } + @SuppressWarnings("unchecked") + private List copySourceRanges(DocumentChunk chunk) { + List result = new ArrayList(); + if (chunk == null || chunk.getOptions() == null) { + return result; + } + Object rawRanges = chunk.getOptions().get(SOURCE_RANGES_KEY); + if (!(rawRanges instanceof List rangeList)) { + return result; + } + for (Object item : rangeList) { + if (!(item instanceof Map rawRange)) { + continue; + } + DocumentImportDtos.PreviewSourceRange range = new DocumentImportDtos.PreviewSourceRange(); + range.setStart(asInteger(rawRange.get("start"), null)); + range.setEnd(asInteger(rawRange.get("end"), null)); + if (range.getStart() != null && range.getEnd() != null) { + result.add(range); + } + } + return result; + } + + private List 
readChunkWarnings(DocumentChunk chunk) { + if (chunk == null || chunk.getOptions() == null) { + return new ArrayList(); + } + return readStringList(chunk.getOptions().get("warnings")); + } + + private void clearDocumentParseProgress(Map options) { + if (options == null || options.isEmpty()) { + return; + } + options.remove(DocumentImportKeys.KEY_DOCUMENT_PARSE_PROGRESS_PERCENT); + options.remove(DocumentImportKeys.KEY_DOCUMENT_PARSE_CURRENT_STAGE); + options.remove(DocumentImportKeys.KEY_DOCUMENT_PARSE_PROCESSED_ITEMS); + options.remove(DocumentImportKeys.KEY_DOCUMENT_PARSE_TOTAL_ITEMS); + options.remove(DocumentImportKeys.KEY_DOCUMENT_PARSE_STATUS_MESSAGE); + } + private BigInteger generateId(Object entity) { FlexIDKeyGenerator generator = new FlexIDKeyGenerator(); return new BigInteger(String.valueOf(generator.generate(entity, null))); @@ -1569,6 +2844,22 @@ public class KnowledgeDocumentImportTaskAppService { return value == null ? 0 : value; } + private static class ParsedKnowledgeContent { + private final String documentLlmContent; + private final String documentRenderMarkdown; + private final Map parseArtifactSummary; + + private ParsedKnowledgeContent(String documentLlmContent, + String documentRenderMarkdown, + Map parseArtifactSummary) { + this.documentLlmContent = documentLlmContent; + this.documentRenderMarkdown = documentRenderMarkdown; + this.parseArtifactSummary = parseArtifactSummary == null + ? 
new LinkedHashMap() + : new LinkedHashMap(parseArtifactSummary); + } + } + private static class StoreExecutionContext { private final DocumentCollection knowledge; private final EmbeddingModel embeddingModel; diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/dto/KnowledgeSearchResultItem.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/dto/KnowledgeSearchResultItem.java index 4d61697..b1b384f 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/dto/KnowledgeSearchResultItem.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/dto/KnowledgeSearchResultItem.java @@ -4,6 +4,8 @@ public class KnowledgeSearchResultItem { private Integer sorting; private String content; + private String renderMarkdown; + private String sourceFileName; private Double score; private String hitSource; private Double vectorScore; @@ -25,6 +27,22 @@ public class KnowledgeSearchResultItem { this.content = content; } + public String getRenderMarkdown() { + return renderMarkdown; + } + + public void setRenderMarkdown(String renderMarkdown) { + this.renderMarkdown = renderMarkdown; + } + + public String getSourceFileName() { + return sourceFileName; + } + + public void setSourceFileName(String sourceFileName) { + this.sourceFileName = sourceFileName; + } + public Double getScore() { return score; } diff --git a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/service/impl/DocumentCollectionServiceImpl.java b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/service/impl/DocumentCollectionServiceImpl.java index df91789..8bdd4b2 100644 --- a/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/service/impl/DocumentCollectionServiceImpl.java +++ b/easyflow-modules/easyflow-module-ai/src/main/java/tech/easyflow/ai/service/impl/DocumentCollectionServiceImpl.java @@ -32,6 +32,7 @@ import tech.easyflow.ai.entity.FaqItem; import 
tech.easyflow.ai.entity.Model; import tech.easyflow.ai.enums.DocumentProcessStatus; import tech.easyflow.ai.enums.PublishStatus; +import tech.easyflow.ai.documentimport.DocumentImportKeys; import tech.easyflow.ai.mapper.DocumentChunkMapper; import tech.easyflow.ai.mapper.DocumentCollectionMapper; import tech.easyflow.ai.mapper.DocumentMapper; @@ -406,6 +407,14 @@ public class DocumentCollectionServiceImpl extends ServiceImpl> images) { diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java index 2da518f..1074d4a 100644 --- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/service/impl/DocumentParseBridgeServiceImplTest.java @@ -6,6 +6,9 @@ import com.easyagents.document.core.entity.ParseResponse; import com.easyagents.document.core.entity.ParseResult; import com.easyagents.document.core.entity.ParseTaskInfo; import com.easyagents.document.core.entity.ParseTaskStatus; +import com.easyagents.document.pdf.PdfDocumentParseService; +import com.easyagents.document.pptx.PptxDocumentParseService; +import com.easyagents.document.xlsx.XlsxDocumentParseService; import org.junit.Assert; import org.junit.Test; import tech.easyflow.ai.document.exception.DocumentParseBridgeException; @@ -37,8 +40,8 @@ public class DocumentParseBridgeServiceImplTest { */ @Test public void shouldParseSuccessfully() { - FakeDocumentParseService parseService = new FakeDocumentParseService(); - DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService); + FakePdfDocumentParseService parseService = new FakePdfDocumentParseService(); + DocumentParseBridgeServiceImpl bridgeService = 
buildBridgeService(parseService, null, null, parseService); DocumentParsedResult document = bridgeService.parse(buildSource(), DocumentParseScenario.WORKFLOW_TEXT); @@ -52,8 +55,8 @@ public class DocumentParseBridgeServiceImplTest { */ @Test public void shouldSupportAsyncFlow() { - FakeDocumentParseService parseService = new FakeDocumentParseService(); - DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService); + FakePdfDocumentParseService parseService = new FakePdfDocumentParseService(); + DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService, null, null, parseService); DocumentParseTaskStatus taskStatus = bridgeService.submit(buildSource(), DocumentParseScenario.KNOWLEDGE_IMPORT); DocumentParseTaskStatus queriedStatus = bridgeService.queryTask("task-1"); @@ -69,9 +72,9 @@ public class DocumentParseBridgeServiceImplTest { */ @Test public void shouldQueryTaskInfoSuccessfully() { - FakeDocumentParseService parseService = new FakeDocumentParseService(); + FakePdfDocumentParseService parseService = new FakePdfDocumentParseService(); parseService.taskStatusValue = "completed"; - DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService); + DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService, null, null, parseService); DocumentParseTaskInfo taskInfo = bridgeService.queryTaskInfo("task-1"); @@ -85,7 +88,7 @@ public class DocumentParseBridgeServiceImplTest { */ @Test public void shouldThrowWhenServiceDisabled() { - DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null); + DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, null, null, null); try { bridgeService.parse(buildSource(), DocumentParseScenario.WORKFLOW_TEXT); @@ -95,9 +98,29 @@ public class DocumentParseBridgeServiceImplTest { } } - private DocumentParseBridgeServiceImpl buildBridgeService(DocumentParseService parseService) { + @Test + public void 
shouldRoutePptxToDedicatedService() { + FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService(); + FakePdfDocumentParseService defaultService = new FakePdfDocumentParseService(); + DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService); + + DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation"), DocumentParseScenario.KNOWLEDGE_IMPORT); + + Assert.assertEquals("# pptx", result.getPreferredText()); + Assert.assertEquals(1, pptxService.parseCallCount); + Assert.assertEquals(0, defaultService.parseCallCount); + } + + private DocumentParseBridgeServiceImpl buildBridgeService(PdfDocumentParseService pdfDocumentParseService, + PptxDocumentParseService pptxDocumentParseService, + XlsxDocumentParseService xlsxDocumentParseService, + DocumentParseService parseService) { return new DocumentParseBridgeServiceImpl( parseService, + pdfDocumentParseService, + pptxDocumentParseService, + xlsxDocumentParseService, new DocumentSourceLoader(new InMemoryFileStorageService()), new DocumentParseRequestFactory(), new DocumentParseResultMapper() @@ -105,8 +128,12 @@ public class DocumentParseBridgeServiceImplTest { } private DocumentSourceRef buildSource() { - DocumentSourceRef sourceRef = DocumentSourceRef.ofBytes("demo.pdf", "pdf-data".getBytes(StandardCharsets.UTF_8)); - sourceRef.setContentType("application/pdf"); + return buildSource("demo.pdf", "application/pdf"); + } + + private DocumentSourceRef buildSource(String fileName, String contentType) { + DocumentSourceRef sourceRef = DocumentSourceRef.ofBytes(fileName, "pdf-data".getBytes(StandardCharsets.UTF_8)); + sourceRef.setContentType(contentType); sourceRef.setSize(8L); return sourceRef; } @@ -133,13 +160,15 @@ public class DocumentParseBridgeServiceImplTest { } } - private static class FakeDocumentParseService implements DocumentParseService { + private 
static class FakePdfDocumentParseService implements PdfDocumentParseService { private ParseRequest lastParseRequest; private String taskStatusValue = "running"; + private int parseCallCount; @Override public ParseResponse parse(ParseRequest request) { + parseCallCount++; this.lastParseRequest = request; return buildResponse(); } @@ -187,4 +216,36 @@ public class DocumentParseBridgeServiceImplTest { return response; } } + + private static class FakePptxDocumentParseService implements PptxDocumentParseService { + + private int parseCallCount; + + @Override + public ParseResponse parse(ParseRequest request) { + parseCallCount++; + ParseResult result = new ParseResult(); + result.setFileName("slides.pptx"); + result.setMarkdown("# pptx"); + result.setPlainText("pptx"); + ParseResponse response = new ParseResponse(); + response.setResults(Collections.singletonList(result)); + return response; + } + + @Override + public ParseTaskStatus submit(ParseRequest request) { + throw new UnsupportedOperationException(); + } + + @Override + public ParseTaskStatus queryTask(String taskId) { + throw new UnsupportedOperationException(); + } + + @Override + public ParseResponse queryResult(String taskId) { + throw new UnsupportedOperationException(); + } + } } diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseRequestFactoryTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseRequestFactoryTest.java index ec1f17d..681a4e3 100644 --- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseRequestFactoryTest.java +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseRequestFactoryTest.java @@ -1,6 +1,9 @@ package tech.easyflow.ai.document.support; import com.easyagents.document.core.entity.ParseRequest; +import com.easyagents.document.core.entity.PdfParseRequest; +import 
com.easyagents.document.core.entity.PptxParseRequest; +import com.easyagents.document.core.entity.XlsxParseRequest; import org.junit.Assert; import org.junit.Test; import tech.easyflow.ai.document.model.DocumentParseScenario; @@ -26,6 +29,7 @@ public class DocumentParseRequestFactoryTest { Assert.assertFalse(request.getReturnMiddleJson()); Assert.assertFalse(request.getReturnContentList()); Assert.assertFalse(request.getReturnImages()); + Assert.assertTrue(request instanceof PdfParseRequest); } /** @@ -41,12 +45,33 @@ public class DocumentParseRequestFactoryTest { Assert.assertTrue(request.getReturnMiddleJson()); Assert.assertTrue(request.getReturnContentList()); Assert.assertTrue(request.getReturnImages()); + Assert.assertTrue(request instanceof PdfParseRequest); + } + + /** + * 验证 PPTX / XLSX 会构建对应的强类型请求。 + */ + @Test + public void shouldBuildOfficeTypedRequests() { + DocumentParseRequestFactory factory = new DocumentParseRequestFactory(); + + ParseRequest pptxRequest = factory.build(buildSource("slides.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation"), DocumentParseScenario.KNOWLEDGE_IMPORT); + ParseRequest xlsxRequest = factory.build(buildSource("table.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), DocumentParseScenario.KNOWLEDGE_IMPORT); + + Assert.assertTrue(pptxRequest instanceof PptxParseRequest); + Assert.assertTrue(xlsxRequest instanceof XlsxParseRequest); } private LoadedDocumentSource buildSource() { + return buildSource("demo.pdf", "application/pdf"); + } + + private LoadedDocumentSource buildSource(String fileName, String contentType) { LoadedDocumentSource source = new LoadedDocumentSource(); - source.setFileName("demo.pdf"); - source.setContentType("application/pdf"); + source.setFileName(fileName); + source.setContentType(contentType); source.setContentBytes("pdf-data".getBytes()); source.setSize(8L); return source; diff --git 
a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseResultMapperTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseResultMapperTest.java index 0547337..d14e4d3 100644 --- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseResultMapperTest.java +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/document/support/DocumentParseResultMapperTest.java @@ -4,6 +4,7 @@ import com.easyagents.document.core.entity.ParseArtifacts; import com.easyagents.document.core.entity.ParseResult; import com.easyagents.document.core.entity.ParseResponse; import com.easyagents.document.core.entity.ParseTaskInfo; +import com.easyagents.document.core.entity.ParseTaskStatus; import org.junit.Assert; import org.junit.Test; import tech.easyflow.ai.document.model.DocumentParseTaskInfo; @@ -65,6 +66,8 @@ public class DocumentParseResultMapperTest { ParseTaskInfo taskInfo = new ParseTaskInfo(); taskInfo.setTaskId("task-1"); taskInfo.setStatus("completed"); + taskInfo.setProgressPercent(100); + taskInfo.setCurrentStage("completed"); ParseResult result = new ParseResult(); result.setFileName("demo.pdf"); @@ -76,7 +79,33 @@ public class DocumentParseResultMapperTest { DocumentParseTaskInfo mapped = mapper.map(taskInfo); Assert.assertEquals("task-1", mapped.getTaskId()); + Assert.assertEquals(Integer.valueOf(100), mapped.getProgressPercent()); + Assert.assertEquals("completed", mapped.getCurrentStage()); Assert.assertNotNull(mapped.getResult()); Assert.assertEquals("# title", mapped.getResult().getPreferredText()); } + + /** + * 验证异步进度字段被完整透传。 + */ + @Test + public void shouldMapTaskStatusProgressFields() { + DocumentParseResultMapper mapper = new DocumentParseResultMapper(); + ParseTaskStatus status = new ParseTaskStatus(); + status.setTaskId("task-2"); + status.setStatus("running"); + status.setProgressPercent(45); + 
status.setCurrentStage("ocr"); + status.setProcessedItems(9); + status.setTotalItems(20); + status.setStatusMessage("正在识别图片"); + + tech.easyflow.ai.document.model.DocumentParseTaskStatus mapped = mapper.map(status); + + Assert.assertEquals(Integer.valueOf(45), mapped.getProgressPercent()); + Assert.assertEquals("ocr", mapped.getCurrentStage()); + Assert.assertEquals(Integer.valueOf(9), mapped.getProcessedItems()); + Assert.assertEquals(Integer.valueOf(20), mapped.getTotalItems()); + Assert.assertEquals("正在识别图片", mapped.getStatusMessage()); + } } diff --git a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/documentimport/task/KnowledgeDocumentImportTaskAppServiceTest.java b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/documentimport/task/KnowledgeDocumentImportTaskAppServiceTest.java index 185992e..8bd91ad 100644 --- a/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/documentimport/task/KnowledgeDocumentImportTaskAppServiceTest.java +++ b/easyflow-modules/easyflow-module-ai/src/test/java/tech/easyflow/ai/documentimport/task/KnowledgeDocumentImportTaskAppServiceTest.java @@ -1,17 +1,33 @@ package tech.easyflow.ai.documentimport.task; +import com.easyagents.document.core.entity.DocumentBlock; +import com.easyagents.document.core.entity.DocumentImage; +import com.easyagents.document.core.entity.DocumentTable; +import com.easyagents.rag.ingestion.model.StrategyConfig; import org.junit.Assert; import org.junit.Test; +import org.springframework.web.multipart.MultipartFile; +import tech.easyflow.ai.document.model.DocumentParseArtifacts; +import tech.easyflow.ai.document.model.DocumentParsedResult; +import tech.easyflow.ai.documentimport.DocumentImportKeys; +import tech.easyflow.ai.entity.DocumentChunk; import tech.easyflow.ai.entity.DocumentImportTask; import tech.easyflow.ai.enums.DocumentImportTaskStatus; import tech.easyflow.ai.enums.DocumentProcessStatus; import tech.easyflow.ai.mapper.DocumentMapper; import 
tech.easyflow.ai.service.DocumentImportTaskService; +import tech.easyflow.common.filestorage.FileStorageService; import java.lang.reflect.Field; import java.lang.reflect.Method; import java.lang.reflect.Proxy; import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Base64; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; import java.util.concurrent.atomic.AtomicReference; /** @@ -84,6 +100,258 @@ public class KnowledgeDocumentImportTaskAppServiceTest { Assert.assertEquals("新错误", updatedTask.getErrorSummary()); } + /** + * 验证知识库导入会把解析图片上传到对象存储,并同步改写 Markdown 与结构化引用。 + * + * @throws Exception 反射调用异常 + */ + @Test + public void normalizeParsedImagesForKnowledgeImportShouldUploadAndRewriteReferences() throws Exception { + KnowledgeDocumentImportTaskAppService service = new KnowledgeDocumentImportTaskAppService(); + AtomicReference savedPrePathRef = new AtomicReference(); + AtomicReference savedFilenameRef = new AtomicReference(); + setField(service, "storageService", mockFileStorageService(savedPrePathRef, savedFilenameRef)); + + tech.easyflow.ai.entity.Document document = new tech.easyflow.ai.entity.Document(); + document.setId(BigInteger.valueOf(88)); + document.setTitle("产品说明书(终版).pdf"); + + DocumentParsedResult parsedResult = new DocumentParsedResult(); + parsedResult.setMarkdown("图例如下:\n![](images/sample-image.png)"); + parsedResult.setPreferredText(parsedResult.getMarkdown()); + parsedResult.setPlainText(parsedResult.getMarkdown()); + + DocumentImage image = new DocumentImage(); + image.setName("sample-image.png"); + image.setSourcePath("images/sample-image.png"); + image.setMimeType("image/png"); + image.setDataUrl("data:image/png;base64," + Base64.getEncoder().encodeToString("demo".getBytes(StandardCharsets.UTF_8))); + parsedResult.setImages(new ArrayList(List.of(image))); + + DocumentBlock block = new DocumentBlock(); + block.setImagePath("images/sample-image.png"); + 
parsedResult.setBlocks(new ArrayList(List.of(block))); + + DocumentTable table = new DocumentTable(); + table.setImagePath("images/sample-image.png"); + parsedResult.setTables(new ArrayList(List.of(table))); + + DocumentParseArtifacts artifacts = new DocumentParseArtifacts(); + List> contentList = new ArrayList>(); + Map contentItem = new LinkedHashMap(); + contentItem.put("img_path", "images/sample-image.png"); + contentList.add(contentItem); + artifacts.setContentList(contentList); + Map xlsxArtifact = new LinkedHashMap(); + List> sheetImages = new ArrayList>(); + sheetImages.add(new LinkedHashMap() {{ + put("sheetName", "Sheet1"); + put("sourcePaths", new ArrayList(List.of("images/sample-image.png"))); + }}); + xlsxArtifact.put("sheetImages", sheetImages); + artifacts.setExtraJsonArtifacts(new LinkedHashMap() {{ + put("xlsx", xlsxArtifact); + }}); + parsedResult.setArtifacts(artifacts); + + Method method = KnowledgeDocumentImportTaskAppService.class.getDeclaredMethod( + "normalizeParsedImagesForKnowledgeImport", + tech.easyflow.ai.entity.Document.class, + DocumentParsedResult.class + ); + method.setAccessible(true); + DocumentParsedResult normalized = (DocumentParsedResult) method.invoke(service, document, parsedResult); + + Assert.assertNotNull(normalized); + Assert.assertEquals("knowledge-parse/88_产品说明书_终版/images", savedPrePathRef.get()); + Assert.assertEquals("sample-image.png", savedFilenameRef.get()); + + String expectedUrl = "http://localhost:39000/easyflow/attachment/knowledge-parse/88_产品说明书_终版/images/sample-image.png"; + Assert.assertTrue(normalized.getMarkdown().contains(expectedUrl)); + Assert.assertEquals(expectedUrl, normalized.getBlocks().get(0).getImagePath()); + Assert.assertEquals(expectedUrl, normalized.getTables().get(0).getImagePath()); + Assert.assertEquals(expectedUrl, normalized.getImages().get(0).getSourcePath()); + Assert.assertNull(normalized.getImages().get(0).getDataUrl()); + Object rewrittenContentList = 
normalized.getArtifacts().getContentList(); + Assert.assertTrue(rewrittenContentList instanceof List); + Assert.assertEquals(expectedUrl, ((Map) ((List) rewrittenContentList).get(0)).get("img_path")); + Object rewrittenSheetImages = ((Map) normalized.getArtifacts().getExtraJsonArtifacts().get("xlsx")).get("sheetImages"); + Assert.assertTrue(rewrittenSheetImages instanceof List); + Object sourcePaths = ((Map) ((List) rewrittenSheetImages).get(0)).get("sourcePaths"); + Assert.assertEquals(expectedUrl, ((List) sourcePaths).get(0)); + } + + /** + * 验证 PPTX 会基于页级工件生成稳定的知识库分块。 + * + * @throws Exception 反射调用异常 + */ + @Test + public void buildOfficeDocumentChunksShouldSplitPptxBySlide() throws Exception { + KnowledgeDocumentImportTaskAppService service = new KnowledgeDocumentImportTaskAppService(); + tech.easyflow.ai.entity.Document document = new tech.easyflow.ai.entity.Document(); + document.setId(BigInteger.valueOf(101)); + document.setCollectionId(BigInteger.valueOf(201)); + document.setTitle("季度汇报.pptx"); + + Map parseArtifactSummary = new LinkedHashMap(); + List> slides = new ArrayList>(); + slides.add(new LinkedHashMap() {{ + put("slideIndex", 0); + put("title", "封面"); + put("ocrMarkdown", "本页介绍季度目标。"); + put("imagePath", "https://example.com/slides/slide-001.png"); + put("imageName", "slide-001-page"); + }}); + slides.add(new LinkedHashMap() {{ + put("slideIndex", 1); + put("title", "经营分析"); + put("ocrMarkdown", "收入同比增长 18%。"); + put("imagePath", "https://example.com/slides/slide-002.png"); + put("imageName", "slide-002-page"); + }}); + parseArtifactSummary.put("slides", slides); + + Method method = KnowledgeDocumentImportTaskAppService.class.getDeclaredMethod( + "buildOfficeDocumentChunks", + tech.easyflow.ai.entity.Document.class, + String.class, + StrategyConfig.class, + Map.class + ); + method.setAccessible(true); + + @SuppressWarnings("unchecked") + List chunks = (List) method.invoke( + service, + document, + "pptx", + null, + parseArtifactSummary + ); + + 
Assert.assertEquals(2, chunks.size()); + DocumentChunk firstChunk = chunks.get(0); + Assert.assertTrue(firstChunk.getContent().contains("Slide 1")); + Assert.assertTrue(firstChunk.getContent().contains("本页介绍季度目标")); + Assert.assertEquals("https://example.com/slides/slide-001.png", + ((List) firstChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_IMAGE_REFS)).get(0)); + Assert.assertEquals(1, firstChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_PAGE_INDEX)); + Assert.assertTrue(String.valueOf(firstChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN)) + .contains("slide-001.png")); + } + + /** + * 验证 XLSX 纯图片 Sheet 不会退化为空内容,并会输出稳定图片引用。 + * + * @throws Exception 反射调用异常 + */ + @Test + public void buildOfficeDocumentChunksShouldKeepImageOnlyXlsxSheetReferences() throws Exception { + KnowledgeDocumentImportTaskAppService service = new KnowledgeDocumentImportTaskAppService(); + tech.easyflow.ai.entity.Document document = new tech.easyflow.ai.entity.Document(); + document.setId(BigInteger.valueOf(102)); + document.setCollectionId(BigInteger.valueOf(202)); + document.setTitle("巡检记录.xlsx"); + + Map parseArtifactSummary = new LinkedHashMap(); + List> sheets = new ArrayList>(); + sheets.add(new LinkedHashMap() {{ + put("sheetName", "图片页"); + put("sheetIndex", 0); + put("rows", new ArrayList>()); + }}); + parseArtifactSummary.put("sheets", sheets); + + List> cellImages = new ArrayList>(); + cellImages.add(new LinkedHashMap() {{ + put("sheetName", "图片页"); + put("referenceKey", "image-sheet-r2c2-001"); + put("sourcePath", "https://example.com/xlsx/sheet/image-001.jpeg"); + put("anchorCell", "B2"); + put("ocrText", "设备状态正常"); + put("fromRow", 1); + }}); + parseArtifactSummary.put("cellImages", cellImages); + + StrategyConfig strategyConfig = new StrategyConfig(); + strategyConfig.setRowsPerChunk(10); + + Method method = KnowledgeDocumentImportTaskAppService.class.getDeclaredMethod( + "buildOfficeDocumentChunks", + 
tech.easyflow.ai.entity.Document.class, + String.class, + StrategyConfig.class, + Map.class + ); + method.setAccessible(true); + + @SuppressWarnings("unchecked") + List chunks = (List) method.invoke( + service, + document, + "xlsx", + strategyConfig, + parseArtifactSummary + ); + + Assert.assertEquals(1, chunks.size()); + DocumentChunk onlyChunk = chunks.get(0); + Assert.assertTrue(onlyChunk.getContent().contains("图片 OCR")); + Assert.assertTrue(onlyChunk.getContent().contains("设备状态正常")); + Assert.assertEquals("图片页", onlyChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_SHEET_NAME)); + Assert.assertEquals("https://example.com/xlsx/sheet/image-001.jpeg", + ((List) onlyChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_IMAGE_REFS)).get(0)); + String renderMarkdown = String.valueOf(onlyChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN)); + Assert.assertTrue(renderMarkdown.contains("[IMG:image-sheet-r2c2-001]")); + Assert.assertTrue(renderMarkdown.contains("![image-sheet-r2c2-001](https://example.com/xlsx/sheet/image-001.jpeg)")); + } + + /** + * 验证空白 Sheet 不会被误判成纯图片分块。 + * + * @throws Exception 反射调用异常 + */ + @Test + public void buildOfficeDocumentChunksShouldSkipBlankXlsxSheetWithoutImages() throws Exception { + KnowledgeDocumentImportTaskAppService service = new KnowledgeDocumentImportTaskAppService(); + tech.easyflow.ai.entity.Document document = new tech.easyflow.ai.entity.Document(); + document.setId(BigInteger.valueOf(103)); + document.setCollectionId(BigInteger.valueOf(203)); + document.setTitle("空白工作簿.xlsx"); + + Map parseArtifactSummary = new LinkedHashMap(); + parseArtifactSummary.put("sheets", new ArrayList>(List.of(new LinkedHashMap() {{ + put("sheetName", "空白页"); + put("sheetIndex", 0); + put("rows", new ArrayList>()); + }}))); + parseArtifactSummary.put("cellImages", new ArrayList>()); + + StrategyConfig strategyConfig = new StrategyConfig(); + strategyConfig.setRowsPerChunk(10); + + Method method = 
KnowledgeDocumentImportTaskAppService.class.getDeclaredMethod( + "buildOfficeDocumentChunks", + tech.easyflow.ai.entity.Document.class, + String.class, + StrategyConfig.class, + Map.class + ); + method.setAccessible(true); + + @SuppressWarnings("unchecked") + List chunks = (List) method.invoke( + service, + document, + "xlsx", + strategyConfig, + parseArtifactSummary + ); + + Assert.assertTrue(chunks.isEmpty()); + } + private static DocumentMapper mockDocumentMapper(tech.easyflow.ai.entity.Document persistedDocument, AtomicReference updatedDocumentRef) { return (DocumentMapper) Proxy.newProxyInstance( @@ -116,6 +384,22 @@ public class KnowledgeDocumentImportTaskAppServiceTest { ); } + private static FileStorageService mockFileStorageService(AtomicReference savedPrePathRef, + AtomicReference savedFilenameRef) { + return (FileStorageService) Proxy.newProxyInstance( + FileStorageService.class.getClassLoader(), + new Class[]{FileStorageService.class}, + (proxy, method, args) -> { + if ("save".equals(method.getName()) && args != null && args.length == 2 && args[0] instanceof MultipartFile file) { + savedPrePathRef.set((String) args[1]); + savedFilenameRef.set(file.getOriginalFilename()); + return "http://localhost:39000/easyflow/attachment/" + args[1] + "/" + file.getOriginalFilename(); + } + return defaultValue(method.getReturnType()); + } + ); + } + private static void setField(Object target, String fieldName, Object value) throws Exception { Field field = KnowledgeDocumentImportTaskAppService.class.getDeclaredField(fieldName); field.setAccessible(true); diff --git a/easyflow-starter/easyflow-starter-all/src/main/resources/application.yml b/easyflow-starter/easyflow-starter-all/src/main/resources/application.yml index 226a191..ae65751 100644 --- a/easyflow-starter/easyflow-starter-all/src/main/resources/application.yml +++ b/easyflow-starter/easyflow-starter-all/src/main/resources/application.yml @@ -166,7 +166,7 @@ dromara: # easy-agents 文档解析统一配置 easy-agents: 
document: - pdf: + ocr: provider: mineru mineru: # 统一文档解析桥接层直接复用 easy-agents 的 provider 配置,不在 easyflow 再复制一套配置体系 diff --git a/easyflow-ui-admin/app/src/views/ai/documentCollection/ChunkDocumentTable.vue b/easyflow-ui-admin/app/src/views/ai/documentCollection/ChunkDocumentTable.vue index eb5ab86..f539643 100644 --- a/easyflow-ui-admin/app/src/views/ai/documentCollection/ChunkDocumentTable.vue +++ b/easyflow-ui-admin/app/src/views/ai/documentCollection/ChunkDocumentTable.vue @@ -1,10 +1,11 @@