feat: 支持知识库导入 PPTX 与 XLSX 文档
- 打通 Office 文档桥接解析、解析进度承接与图片引用改写
- 落地 PPTX 按页分块、XLSX 行窗口分块以及预览与检索渲染闭环
This commit is contained in:
@@ -112,7 +112,6 @@
|
||||
<groupId>com.easyagents</groupId>
|
||||
<artifactId>easy-agents-mcp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
|
||||
@@ -35,10 +35,14 @@ public class DocumentParseBridgeException extends RuntimeException {
|
||||
public static DocumentParseBridgeException serviceNotEnabled() {
|
||||
return new DocumentParseBridgeException(
|
||||
"service_not_enabled",
|
||||
"统一文档解析服务未启用,请先配置 easy-agents.document.pdf.provider"
|
||||
"统一文档解析服务未启用,请先配置 easy-agents.document.ocr.provider=mineru"
|
||||
);
|
||||
}
|
||||
|
||||
public static DocumentParseBridgeException serviceNotEnabled(String message) {
|
||||
return new DocumentParseBridgeException("service_not_enabled", message);
|
||||
}
|
||||
|
||||
public static DocumentParseBridgeException unsupportedSource(String message) {
|
||||
return new DocumentParseBridgeException("unsupported_source", message);
|
||||
}
|
||||
|
||||
@@ -22,6 +22,11 @@ public class DocumentParseTaskStatus {
|
||||
private String statusUrl;
|
||||
private String resultUrl;
|
||||
private Integer queuedAhead;
|
||||
private Integer progressPercent;
|
||||
private String currentStage;
|
||||
private Integer processedItems;
|
||||
private Integer totalItems;
|
||||
private String statusMessage;
|
||||
|
||||
public String getTaskId() {
|
||||
return taskId;
|
||||
@@ -110,4 +115,44 @@ public class DocumentParseTaskStatus {
|
||||
public void setQueuedAhead(Integer queuedAhead) {
|
||||
this.queuedAhead = queuedAhead;
|
||||
}
|
||||
|
||||
public Integer getProgressPercent() {
|
||||
return progressPercent;
|
||||
}
|
||||
|
||||
public void setProgressPercent(Integer progressPercent) {
|
||||
this.progressPercent = progressPercent;
|
||||
}
|
||||
|
||||
public String getCurrentStage() {
|
||||
return currentStage;
|
||||
}
|
||||
|
||||
public void setCurrentStage(String currentStage) {
|
||||
this.currentStage = currentStage;
|
||||
}
|
||||
|
||||
public Integer getProcessedItems() {
|
||||
return processedItems;
|
||||
}
|
||||
|
||||
public void setProcessedItems(Integer processedItems) {
|
||||
this.processedItems = processedItems;
|
||||
}
|
||||
|
||||
public Integer getTotalItems() {
|
||||
return totalItems;
|
||||
}
|
||||
|
||||
public void setTotalItems(Integer totalItems) {
|
||||
this.totalItems = totalItems;
|
||||
}
|
||||
|
||||
public String getStatusMessage() {
|
||||
return statusMessage;
|
||||
}
|
||||
|
||||
public void setStatusMessage(String statusMessage) {
|
||||
this.statusMessage = statusMessage;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,10 @@ import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import com.easyagents.document.pdf.PdfDocumentParseService;
|
||||
import com.easyagents.document.pptx.PptxDocumentParseService;
|
||||
import com.easyagents.document.xlsx.XlsxDocumentParseService;
|
||||
import org.springframework.beans.factory.annotation.Qualifier;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.lang.Nullable;
|
||||
@@ -20,8 +24,13 @@ import tech.easyflow.ai.document.service.DocumentParseBridgeService;
|
||||
import tech.easyflow.ai.document.support.DocumentSourceLoader;
|
||||
import tech.easyflow.ai.document.support.DocumentParseRequestFactory;
|
||||
import tech.easyflow.ai.document.support.DocumentParseResultMapper;
|
||||
import tech.easyflow.ai.document.support.DocumentParseSourceType;
|
||||
import tech.easyflow.ai.document.support.LoadedDocumentSource;
|
||||
import tech.easyflow.ai.utils.DocUtil;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
/**
|
||||
* 统一文档解析桥接门面默认实现。
|
||||
@@ -33,18 +42,33 @@ import tech.easyflow.ai.utils.DocUtil;
|
||||
public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeService {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(DocumentParseBridgeServiceImpl.class);
|
||||
private static final String DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME = "documentParseService";
|
||||
|
||||
@Nullable
|
||||
private final DocumentParseService documentParseService;
|
||||
private final DocumentParseService defaultDocumentParseService;
|
||||
@Nullable
|
||||
private final PdfDocumentParseService pdfDocumentParseService;
|
||||
@Nullable
|
||||
private final PptxDocumentParseService pptxDocumentParseService;
|
||||
@Nullable
|
||||
private final XlsxDocumentParseService xlsxDocumentParseService;
|
||||
private final DocumentSourceLoader documentSourceLoader;
|
||||
private final DocumentParseRequestFactory parseRequestFactory;
|
||||
private final DocumentParseResultMapper parseResultMapper;
|
||||
|
||||
public DocumentParseBridgeServiceImpl(@Nullable DocumentParseService documentParseService,
|
||||
public DocumentParseBridgeServiceImpl(@Nullable
|
||||
@Qualifier(DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME)
|
||||
DocumentParseService defaultDocumentParseService,
|
||||
@Nullable PdfDocumentParseService pdfDocumentParseService,
|
||||
@Nullable PptxDocumentParseService pptxDocumentParseService,
|
||||
@Nullable XlsxDocumentParseService xlsxDocumentParseService,
|
||||
DocumentSourceLoader documentSourceLoader,
|
||||
DocumentParseRequestFactory parseRequestFactory,
|
||||
DocumentParseResultMapper parseResultMapper) {
|
||||
this.documentParseService = documentParseService;
|
||||
this.defaultDocumentParseService = defaultDocumentParseService;
|
||||
this.pdfDocumentParseService = pdfDocumentParseService;
|
||||
this.pptxDocumentParseService = pptxDocumentParseService;
|
||||
this.xlsxDocumentParseService = xlsxDocumentParseService;
|
||||
this.documentSourceLoader = documentSourceLoader;
|
||||
this.parseRequestFactory = parseRequestFactory;
|
||||
this.parseResultMapper = parseResultMapper;
|
||||
@@ -59,7 +83,8 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
|
||||
LoadedDocumentSource loadedSource = prepareSupportedSource(source);
|
||||
LOG.info("桥接服务开始同步解析文档: fileName={}, contentType={}, scenario={}",
|
||||
loadedSource.getFileName(), loadedSource.getContentType(), scenario);
|
||||
ParseResponse response = requireService().parse(parseRequestFactory.build(loadedSource, scenario));
|
||||
DocumentParseService parseService = resolveService(loadedSource);
|
||||
ParseResponse response = parseService.parse(parseRequestFactory.build(loadedSource, scenario));
|
||||
DocumentParsedResult result = parseResultMapper.map(extractSingleResult(response, false));
|
||||
LOG.info("桥接服务同步解析完成: fileName={}, scenario={}, preferredTextLength={}",
|
||||
loadedSource.getFileName(), scenario, resolveTextLength(result));
|
||||
@@ -84,7 +109,8 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
|
||||
LoadedDocumentSource loadedSource = prepareSupportedSource(source);
|
||||
LOG.info("桥接服务开始提交异步解析任务: fileName={}, contentType={}, scenario={}",
|
||||
loadedSource.getFileName(), loadedSource.getContentType(), scenario);
|
||||
ParseTaskStatus taskStatus = requireService().submit(parseRequestFactory.build(loadedSource, scenario));
|
||||
DocumentParseService parseService = resolveService(loadedSource);
|
||||
ParseTaskStatus taskStatus = parseService.submit(parseRequestFactory.build(loadedSource, scenario));
|
||||
DocumentParseTaskStatus mappedStatus = parseResultMapper.map(taskStatus);
|
||||
LOG.info("桥接服务异步解析任务提交完成: fileName={}, scenario={}, providerTaskId={}, status={}",
|
||||
loadedSource.getFileName(), scenario, mappedStatus.getTaskId(), mappedStatus.getStatus());
|
||||
@@ -109,7 +135,8 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
|
||||
throw DocumentParseBridgeException.taskFailed("taskId 不能为空");
|
||||
}
|
||||
try {
|
||||
return parseResultMapper.map(requireService().queryTask(taskId));
|
||||
ParseTaskStatus taskStatus = executeAgainstTaskService(taskId, service -> service.queryTask(taskId));
|
||||
return parseResultMapper.map(taskStatus);
|
||||
} catch (DocumentParseBridgeException e) {
|
||||
throw e;
|
||||
} catch (Exception e) {
|
||||
@@ -127,7 +154,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
|
||||
}
|
||||
try {
|
||||
LOG.info("桥接服务开始获取异步解析结果: providerTaskId={}", taskId);
|
||||
ParseResponse response = requireService().queryResult(taskId);
|
||||
ParseResponse response = executeAgainstTaskService(taskId, service -> service.queryResult(taskId));
|
||||
DocumentParsedResult result = parseResultMapper.map(extractSingleResult(response, true));
|
||||
LOG.info("桥接服务获取异步解析结果完成: providerTaskId={}, preferredTextLength={}",
|
||||
taskId, resolveTextLength(result));
|
||||
@@ -150,7 +177,7 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
|
||||
throw DocumentParseBridgeException.taskFailed("taskId 不能为空");
|
||||
}
|
||||
try {
|
||||
ParseTaskInfo taskInfo = requireService().queryTaskInfo(taskId);
|
||||
ParseTaskInfo taskInfo = executeAgainstTaskService(taskId, service -> service.queryTaskInfo(taskId));
|
||||
DocumentParseTaskInfo mappedTaskInfo = parseResultMapper.map(taskInfo);
|
||||
LOG.info("桥接服务查询异步解析任务状态: providerTaskId={}, status={}, hasResult={}",
|
||||
taskId,
|
||||
@@ -177,39 +204,84 @@ public class DocumentParseBridgeServiceImpl implements DocumentParseBridgeServic
|
||||
return text == null ? 0 : text.length();
|
||||
}
|
||||
|
||||
private DocumentParseService requireService() {
|
||||
if (documentParseService == null) {
|
||||
throw DocumentParseBridgeException.serviceNotEnabled();
|
||||
}
|
||||
return documentParseService;
|
||||
}
|
||||
|
||||
private LoadedDocumentSource prepareSupportedSource(DocumentSourceRef source) {
|
||||
LoadedDocumentSource loadedSource = documentSourceLoader.load(source);
|
||||
if (!isSupportedByBridge(loadedSource)) {
|
||||
throw DocumentParseBridgeException.unsupportedSource("统一文档解析桥接当前仅支持 PDF、DOCX 文件");
|
||||
throw DocumentParseBridgeException.unsupportedSource("统一文档解析桥接当前仅支持 PDF、DOCX、PPTX、XLSX 文件");
|
||||
}
|
||||
return loadedSource;
|
||||
}
|
||||
|
||||
private boolean isSupportedByBridge(LoadedDocumentSource loadedSource) {
|
||||
String contentType = loadedSource.getContentType();
|
||||
if (StringUtils.hasText(contentType)) {
|
||||
String normalizedContentType = contentType.toLowerCase();
|
||||
if (normalizedContentType.contains("pdf")
|
||||
|| normalizedContentType.contains("wordprocessingml.document")) {
|
||||
return true;
|
||||
return DocumentParseSourceType.resolve(loadedSource.getFileName(), loadedSource.getContentType()) != DocumentParseSourceType.UNSUPPORTED;
|
||||
}
|
||||
|
||||
private DocumentParseService resolveService(LoadedDocumentSource loadedSource) {
|
||||
DocumentParseSourceType sourceType = DocumentParseSourceType.resolve(loadedSource.getFileName(), loadedSource.getContentType());
|
||||
switch (sourceType) {
|
||||
case PDF:
|
||||
return requireSpecificService(pdfDocumentParseService, defaultDocumentParseService, "PDF");
|
||||
case DOCX:
|
||||
return requireSpecificService(defaultDocumentParseService, pdfDocumentParseService, "DOCX");
|
||||
case PPTX:
|
||||
return requireSpecificService(pptxDocumentParseService, null, "PPTX");
|
||||
case XLSX:
|
||||
return requireSpecificService(xlsxDocumentParseService, null, "XLSX");
|
||||
default:
|
||||
throw DocumentParseBridgeException.unsupportedSource("当前文件类型暂不支持桥接解析");
|
||||
}
|
||||
}
|
||||
|
||||
private DocumentParseService requireSpecificService(@Nullable DocumentParseService primaryService,
|
||||
@Nullable DocumentParseService fallbackService,
|
||||
String sourceType) {
|
||||
if (primaryService != null) {
|
||||
return primaryService;
|
||||
}
|
||||
if (fallbackService != null) {
|
||||
return fallbackService;
|
||||
}
|
||||
throw DocumentParseBridgeException.serviceNotEnabled("未启用 " + sourceType + " 文档解析服务");
|
||||
}
|
||||
|
||||
private <T> T executeAgainstTaskService(String taskId, Function<DocumentParseService, T> action) {
|
||||
List<DocumentParseService> services = availableServices();
|
||||
if (services.isEmpty()) {
|
||||
throw DocumentParseBridgeException.serviceNotEnabled();
|
||||
}
|
||||
Exception lastException = null;
|
||||
for (DocumentParseService service : services) {
|
||||
try {
|
||||
return action.apply(service);
|
||||
} catch (Exception exception) {
|
||||
lastException = exception;
|
||||
LOG.debug("桥接服务任务查询尝试失败,准备切换下一个解析服务: taskId={}, service={}",
|
||||
taskId,
|
||||
service.getClass().getSimpleName(),
|
||||
exception);
|
||||
}
|
||||
}
|
||||
String fileName = loadedSource.getFileName();
|
||||
if (!StringUtils.hasText(fileName) || !fileName.contains(".")) {
|
||||
return false;
|
||||
if (lastException instanceof RuntimeException) {
|
||||
throw (RuntimeException) lastException;
|
||||
}
|
||||
String suffix = DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName));
|
||||
if ("pdf".equals(suffix) || "docx".equals(suffix)) {
|
||||
return true;
|
||||
throw DocumentParseBridgeException.taskFailed("未找到可处理当前任务ID的文档解析服务", lastException);
|
||||
}
|
||||
|
||||
private List<DocumentParseService> availableServices() {
|
||||
LinkedHashSet<DocumentParseService> services = new LinkedHashSet<DocumentParseService>();
|
||||
if (pptxDocumentParseService != null) {
|
||||
services.add(pptxDocumentParseService);
|
||||
}
|
||||
return false;
|
||||
if (xlsxDocumentParseService != null) {
|
||||
services.add(xlsxDocumentParseService);
|
||||
}
|
||||
if (pdfDocumentParseService != null) {
|
||||
services.add(pdfDocumentParseService);
|
||||
}
|
||||
if (defaultDocumentParseService != null) {
|
||||
services.add(defaultDocumentParseService);
|
||||
}
|
||||
return new ArrayList<DocumentParseService>(services);
|
||||
}
|
||||
|
||||
private ParseResult extractSingleResult(ParseResponse response, boolean resultFetchPhase) {
|
||||
|
||||
@@ -2,6 +2,9 @@ package tech.easyflow.ai.document.support;
|
||||
|
||||
import com.easyagents.document.core.entity.ParseFile;
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.PdfParseRequest;
|
||||
import com.easyagents.document.core.entity.PptxParseRequest;
|
||||
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||
import org.springframework.stereotype.Component;
|
||||
import tech.easyflow.ai.document.exception.DocumentParseBridgeException;
|
||||
import tech.easyflow.ai.document.model.DocumentParseScenario;
|
||||
@@ -31,12 +34,28 @@ public class DocumentParseRequestFactory {
|
||||
if (scenario == null) {
|
||||
throw DocumentParseBridgeException.requestBuildFailed("解析场景不能为空");
|
||||
}
|
||||
ParseRequest request = new ParseRequest();
|
||||
ParseRequest request = createTypedRequest(source);
|
||||
request.addFile(ParseFile.of(source.getFileName(), source.getContentBytes(), source.getContentType()));
|
||||
applyScenario(request, scenario);
|
||||
return request;
|
||||
}
|
||||
|
||||
private ParseRequest createTypedRequest(LoadedDocumentSource source) {
|
||||
DocumentParseSourceType sourceType = DocumentParseSourceType.resolve(source.getFileName(), source.getContentType());
|
||||
switch (sourceType) {
|
||||
case PDF:
|
||||
return new PdfParseRequest();
|
||||
case PPTX:
|
||||
return new PptxParseRequest();
|
||||
case XLSX:
|
||||
return new XlsxParseRequest();
|
||||
case DOCX:
|
||||
return new ParseRequest();
|
||||
default:
|
||||
throw DocumentParseBridgeException.requestBuildFailed("当前文件类型暂不支持桥接解析");
|
||||
}
|
||||
}
|
||||
|
||||
private void applyScenario(ParseRequest request, DocumentParseScenario scenario) {
|
||||
switch (scenario) {
|
||||
case WORKFLOW_TEXT:
|
||||
|
||||
@@ -69,6 +69,11 @@ public class DocumentParseResultMapper {
|
||||
status.setStatusUrl(taskStatus.getStatusUrl());
|
||||
status.setResultUrl(taskStatus.getResultUrl());
|
||||
status.setQueuedAhead(taskStatus.getQueuedAhead());
|
||||
status.setProgressPercent(taskStatus.getProgressPercent());
|
||||
status.setCurrentStage(taskStatus.getCurrentStage());
|
||||
status.setProcessedItems(taskStatus.getProcessedItems());
|
||||
status.setTotalItems(taskStatus.getTotalItems());
|
||||
status.setStatusMessage(taskStatus.getStatusMessage());
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -104,6 +109,11 @@ public class DocumentParseResultMapper {
|
||||
status.setStatusUrl(taskStatus.getStatusUrl());
|
||||
status.setResultUrl(taskStatus.getResultUrl());
|
||||
status.setQueuedAhead(taskStatus.getQueuedAhead());
|
||||
status.setProgressPercent(taskStatus.getProgressPercent());
|
||||
status.setCurrentStage(taskStatus.getCurrentStage());
|
||||
status.setProcessedItems(taskStatus.getProcessedItems());
|
||||
status.setTotalItems(taskStatus.getTotalItems());
|
||||
status.setStatusMessage(taskStatus.getStatusMessage());
|
||||
}
|
||||
|
||||
private String resolvePreferredText(ParseResult parseResult) {
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
package tech.easyflow.ai.document.support;
|
||||
|
||||
import org.springframework.util.StringUtils;
|
||||
import tech.easyflow.ai.utils.DocUtil;
|
||||
|
||||
/**
|
||||
* 统一文档解析桥接支持的源文件类型。
|
||||
*
|
||||
* @author Codex
|
||||
* @since 2026-04-17
|
||||
*/
|
||||
public enum DocumentParseSourceType {
|
||||
|
||||
PDF,
|
||||
DOCX,
|
||||
PPTX,
|
||||
XLSX,
|
||||
UNSUPPORTED;
|
||||
|
||||
/**
|
||||
* 根据文件名与内容类型推断文档类型。
|
||||
*
|
||||
* @param fileName 文件名
|
||||
* @param contentType MIME 类型
|
||||
* @return 文档类型
|
||||
*/
|
||||
public static DocumentParseSourceType resolve(String fileName, String contentType) {
|
||||
if (StringUtils.hasText(contentType)) {
|
||||
String normalizedContentType = contentType.toLowerCase();
|
||||
if (normalizedContentType.contains("pdf")) {
|
||||
return PDF;
|
||||
}
|
||||
if (normalizedContentType.contains("wordprocessingml.document")) {
|
||||
return DOCX;
|
||||
}
|
||||
if (normalizedContentType.contains("presentationml.presentation")) {
|
||||
return PPTX;
|
||||
}
|
||||
if (normalizedContentType.contains("spreadsheetml.sheet")) {
|
||||
return XLSX;
|
||||
}
|
||||
}
|
||||
if (!StringUtils.hasText(fileName) || !fileName.contains(".")) {
|
||||
return UNSUPPORTED;
|
||||
}
|
||||
String suffix = DocUtil.normalizeSuffix(DocUtil.getSuffix(fileName));
|
||||
if ("pdf".equals(suffix)) {
|
||||
return PDF;
|
||||
}
|
||||
if ("docx".equals(suffix)) {
|
||||
return DOCX;
|
||||
}
|
||||
if ("pptx".equals(suffix)) {
|
||||
return PPTX;
|
||||
}
|
||||
if ("xlsx".equals(suffix)) {
|
||||
return XLSX;
|
||||
}
|
||||
return UNSUPPORTED;
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断是否属于 Office 首版接入类型。
|
||||
*
|
||||
* @return 是否是本次 Office 类型
|
||||
*/
|
||||
public boolean isOffice() {
|
||||
return this == PPTX || this == XLSX;
|
||||
}
|
||||
}
|
||||
@@ -286,6 +286,7 @@ public final class DocumentImportDtos {
|
||||
private String chunkId;
|
||||
private String chunkType;
|
||||
private String content;
|
||||
private String renderMarkdown;
|
||||
private List<String> headingPath = new ArrayList<String>();
|
||||
private Integer partNo;
|
||||
private Integer partTotal;
|
||||
@@ -335,6 +336,14 @@ public final class DocumentImportDtos {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
public String getRenderMarkdown() {
|
||||
return renderMarkdown;
|
||||
}
|
||||
|
||||
public void setRenderMarkdown(String renderMarkdown) {
|
||||
this.renderMarkdown = renderMarkdown;
|
||||
}
|
||||
|
||||
public List<String> getHeadingPath() {
|
||||
return headingPath;
|
||||
}
|
||||
|
||||
@@ -22,4 +22,19 @@ public final class DocumentImportKeys {
|
||||
public static final String KEY_DOCUMENT_PARSE_METADATA = "parse.metadata";
|
||||
public static final String KEY_DOCUMENT_PARSE_WARNINGS = "parse.warnings";
|
||||
public static final String KEY_DOCUMENT_PROVIDER_TASK_ID = "parse.providerTaskId";
|
||||
public static final String KEY_DOCUMENT_PARSE_IMAGE_URLS = "parse.imageUrls";
|
||||
public static final String KEY_DOCUMENT_PARSE_IMAGE_COUNT = "parse.imageCount";
|
||||
public static final String KEY_DOCUMENT_PARSE_IMAGE_STORAGE_PREFIX = "parse.imageStoragePrefix";
|
||||
public static final String KEY_DOCUMENT_PARSE_PROGRESS_PERCENT = "parse.progressPercent";
|
||||
public static final String KEY_DOCUMENT_PARSE_CURRENT_STAGE = "parse.currentStage";
|
||||
public static final String KEY_DOCUMENT_PARSE_PROCESSED_ITEMS = "parse.processedItems";
|
||||
public static final String KEY_DOCUMENT_PARSE_TOTAL_ITEMS = "parse.totalItems";
|
||||
public static final String KEY_DOCUMENT_PARSE_STATUS_MESSAGE = "parse.statusMessage";
|
||||
public static final String KEY_DOCUMENT_RENDER_MARKDOWN = "renderMarkdown";
|
||||
public static final String KEY_DOCUMENT_PAGE_INDEX = "pageIndex";
|
||||
public static final String KEY_DOCUMENT_SHEET_NAME = "sheetName";
|
||||
public static final String KEY_DOCUMENT_ROW_START = "rowStart";
|
||||
public static final String KEY_DOCUMENT_ROW_END = "rowEnd";
|
||||
public static final String KEY_DOCUMENT_IMAGE_REFS = "imageRefs";
|
||||
public static final String KEY_DOCUMENT_PARSE_ARTIFACT_SUMMARY = "parseArtifactSummary";
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import org.springframework.stereotype.Service;
|
||||
import org.springframework.transaction.support.TransactionSynchronization;
|
||||
import org.springframework.transaction.support.TransactionSynchronizationManager;
|
||||
import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;
|
||||
import tech.easyflow.ai.documentimport.DocumentImportKeys;
|
||||
import tech.easyflow.ai.entity.Document;
|
||||
import tech.easyflow.ai.mapper.DocumentMapper;
|
||||
import tech.easyflow.common.web.exceptions.BusinessException;
|
||||
@@ -116,11 +117,21 @@ public class DocumentImportTaskStatusStreamService {
|
||||
payload.put("totalChunks", document.getTotalChunks());
|
||||
payload.put("completedChunks", document.getCompletedChunks());
|
||||
payload.put("failedChunks", document.getFailedChunks());
|
||||
payload.put("parseCurrentStage", readOptionAsString(document, DocumentImportKeys.KEY_DOCUMENT_PARSE_CURRENT_STAGE));
|
||||
payload.put("parseStatusMessage", readOptionAsString(document, DocumentImportKeys.KEY_DOCUMENT_PARSE_STATUS_MESSAGE));
|
||||
payload.put("lastTaskError", document.getLastTaskError());
|
||||
payload.put("taskModifiedAt", document.getTaskModifiedAt());
|
||||
return payload;
|
||||
}
|
||||
|
||||
private String readOptionAsString(Document document, String key) {
|
||||
if (document == null || document.getOptions() == null || key == null) {
|
||||
return null;
|
||||
}
|
||||
Object value = document.getOptions().get(key);
|
||||
return value == null ? null : String.valueOf(value);
|
||||
}
|
||||
|
||||
private void sendAsync(String topicKey, SseEmitter emitter, String eventName, Map<String, Object> payload) {
|
||||
sseThreadPool.execute(() -> {
|
||||
try {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -4,6 +4,8 @@ public class KnowledgeSearchResultItem {
|
||||
|
||||
private Integer sorting;
|
||||
private String content;
|
||||
private String renderMarkdown;
|
||||
private String sourceFileName;
|
||||
private Double score;
|
||||
private String hitSource;
|
||||
private Double vectorScore;
|
||||
@@ -25,6 +27,22 @@ public class KnowledgeSearchResultItem {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
public String getRenderMarkdown() {
|
||||
return renderMarkdown;
|
||||
}
|
||||
|
||||
public void setRenderMarkdown(String renderMarkdown) {
|
||||
this.renderMarkdown = renderMarkdown;
|
||||
}
|
||||
|
||||
public String getSourceFileName() {
|
||||
return sourceFileName;
|
||||
}
|
||||
|
||||
public void setSourceFileName(String sourceFileName) {
|
||||
this.sourceFileName = sourceFileName;
|
||||
}
|
||||
|
||||
public Double getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
@@ -32,6 +32,7 @@ import tech.easyflow.ai.entity.FaqItem;
|
||||
import tech.easyflow.ai.entity.Model;
|
||||
import tech.easyflow.ai.enums.DocumentProcessStatus;
|
||||
import tech.easyflow.ai.enums.PublishStatus;
|
||||
import tech.easyflow.ai.documentimport.DocumentImportKeys;
|
||||
import tech.easyflow.ai.mapper.DocumentChunkMapper;
|
||||
import tech.easyflow.ai.mapper.DocumentCollectionMapper;
|
||||
import tech.easyflow.ai.mapper.DocumentMapper;
|
||||
@@ -406,6 +407,14 @@ public class DocumentCollectionServiceImpl extends ServiceImpl<DocumentCollectio
|
||||
return false;
|
||||
}
|
||||
item.setContent(content);
|
||||
String renderMarkdown = hitSnapshot.findChunkRenderMarkdown(item.getId());
|
||||
if (StringUtil.hasText(renderMarkdown)) {
|
||||
item.addMetadata("renderMarkdown", renderMarkdown);
|
||||
}
|
||||
String sourceFileName = hitSnapshot.findSourceFileName(item.getId());
|
||||
if (StringUtil.hasText(sourceFileName)) {
|
||||
item.addMetadata("sourceFileName", sourceFileName);
|
||||
}
|
||||
return true;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
@@ -596,6 +605,30 @@ public class DocumentCollectionServiceImpl extends ServiceImpl<DocumentCollectio
|
||||
}
|
||||
return StringUtil.noText(documentChunk.getContent()) ? null : documentChunk.getContent();
|
||||
}
|
||||
|
||||
private String findChunkRenderMarkdown(Object chunkId) {
|
||||
DocumentChunk documentChunk = chunkMap.get(String.valueOf(chunkId));
|
||||
if (documentChunk == null || documentChunk.getDocumentId() == null || documentChunk.getOptions() == null) {
|
||||
return null;
|
||||
}
|
||||
if (!documentMap.containsKey(String.valueOf(documentChunk.getDocumentId()))) {
|
||||
return null;
|
||||
}
|
||||
Object renderMarkdown = documentChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN);
|
||||
return renderMarkdown == null ? null : String.valueOf(renderMarkdown);
|
||||
}
|
||||
|
||||
private String findSourceFileName(Object chunkId) {
|
||||
DocumentChunk documentChunk = chunkMap.get(String.valueOf(chunkId));
|
||||
if (documentChunk == null || documentChunk.getDocumentId() == null) {
|
||||
return null;
|
||||
}
|
||||
tech.easyflow.ai.entity.Document sourceDocument = documentMap.get(String.valueOf(documentChunk.getDocumentId()));
|
||||
if (sourceDocument == null || StringUtil.noText(sourceDocument.getTitle())) {
|
||||
return null;
|
||||
}
|
||||
return sourceDocument.getTitle();
|
||||
}
|
||||
}
|
||||
|
||||
private String buildFaqPromptContent(FaqItem faqItem, List<Map<String, String>> images) {
|
||||
|
||||
@@ -6,6 +6,9 @@ import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import com.easyagents.document.pdf.PdfDocumentParseService;
|
||||
import com.easyagents.document.pptx.PptxDocumentParseService;
|
||||
import com.easyagents.document.xlsx.XlsxDocumentParseService;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import tech.easyflow.ai.document.exception.DocumentParseBridgeException;
|
||||
@@ -37,8 +40,8 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
*/
|
||||
@Test
|
||||
public void shouldParseSuccessfully() {
|
||||
FakeDocumentParseService parseService = new FakeDocumentParseService();
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService);
|
||||
FakePdfDocumentParseService parseService = new FakePdfDocumentParseService();
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService, null, null, parseService);
|
||||
|
||||
DocumentParsedResult document = bridgeService.parse(buildSource(), DocumentParseScenario.WORKFLOW_TEXT);
|
||||
|
||||
@@ -52,8 +55,8 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
*/
|
||||
@Test
|
||||
public void shouldSupportAsyncFlow() {
|
||||
FakeDocumentParseService parseService = new FakeDocumentParseService();
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService);
|
||||
FakePdfDocumentParseService parseService = new FakePdfDocumentParseService();
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService, null, null, parseService);
|
||||
|
||||
DocumentParseTaskStatus taskStatus = bridgeService.submit(buildSource(), DocumentParseScenario.KNOWLEDGE_IMPORT);
|
||||
DocumentParseTaskStatus queriedStatus = bridgeService.queryTask("task-1");
|
||||
@@ -69,9 +72,9 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
*/
|
||||
@Test
|
||||
public void shouldQueryTaskInfoSuccessfully() {
|
||||
FakeDocumentParseService parseService = new FakeDocumentParseService();
|
||||
FakePdfDocumentParseService parseService = new FakePdfDocumentParseService();
|
||||
parseService.taskStatusValue = "completed";
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService);
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(parseService, null, null, parseService);
|
||||
|
||||
DocumentParseTaskInfo taskInfo = bridgeService.queryTaskInfo("task-1");
|
||||
|
||||
@@ -85,7 +88,7 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
*/
|
||||
@Test
|
||||
public void shouldThrowWhenServiceDisabled() {
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null);
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, null, null, null);
|
||||
|
||||
try {
|
||||
bridgeService.parse(buildSource(), DocumentParseScenario.WORKFLOW_TEXT);
|
||||
@@ -95,9 +98,29 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
}
|
||||
}
|
||||
|
||||
private DocumentParseBridgeServiceImpl buildBridgeService(DocumentParseService parseService) {
|
||||
@Test
|
||||
public void shouldRoutePptxToDedicatedService() {
|
||||
FakePptxDocumentParseService pptxService = new FakePptxDocumentParseService();
|
||||
FakePdfDocumentParseService defaultService = new FakePdfDocumentParseService();
|
||||
DocumentParseBridgeServiceImpl bridgeService = buildBridgeService(null, pptxService, null, defaultService);
|
||||
|
||||
DocumentParsedResult result = bridgeService.parse(buildSource("slides.pptx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"), DocumentParseScenario.KNOWLEDGE_IMPORT);
|
||||
|
||||
Assert.assertEquals("# pptx", result.getPreferredText());
|
||||
Assert.assertEquals(1, pptxService.parseCallCount);
|
||||
Assert.assertEquals(0, defaultService.parseCallCount);
|
||||
}
|
||||
|
||||
private DocumentParseBridgeServiceImpl buildBridgeService(PdfDocumentParseService pdfDocumentParseService,
|
||||
PptxDocumentParseService pptxDocumentParseService,
|
||||
XlsxDocumentParseService xlsxDocumentParseService,
|
||||
DocumentParseService parseService) {
|
||||
return new DocumentParseBridgeServiceImpl(
|
||||
parseService,
|
||||
pdfDocumentParseService,
|
||||
pptxDocumentParseService,
|
||||
xlsxDocumentParseService,
|
||||
new DocumentSourceLoader(new InMemoryFileStorageService()),
|
||||
new DocumentParseRequestFactory(),
|
||||
new DocumentParseResultMapper()
|
||||
@@ -105,8 +128,12 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
}
|
||||
|
||||
private DocumentSourceRef buildSource() {
|
||||
DocumentSourceRef sourceRef = DocumentSourceRef.ofBytes("demo.pdf", "pdf-data".getBytes(StandardCharsets.UTF_8));
|
||||
sourceRef.setContentType("application/pdf");
|
||||
return buildSource("demo.pdf", "application/pdf");
|
||||
}
|
||||
|
||||
private DocumentSourceRef buildSource(String fileName, String contentType) {
|
||||
DocumentSourceRef sourceRef = DocumentSourceRef.ofBytes(fileName, "pdf-data".getBytes(StandardCharsets.UTF_8));
|
||||
sourceRef.setContentType(contentType);
|
||||
sourceRef.setSize(8L);
|
||||
return sourceRef;
|
||||
}
|
||||
@@ -133,13 +160,15 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
}
|
||||
}
|
||||
|
||||
private static class FakeDocumentParseService implements DocumentParseService {
|
||||
private static class FakePdfDocumentParseService implements PdfDocumentParseService {
|
||||
|
||||
private ParseRequest lastParseRequest;
|
||||
private String taskStatusValue = "running";
|
||||
private int parseCallCount;
|
||||
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
parseCallCount++;
|
||||
this.lastParseRequest = request;
|
||||
return buildResponse();
|
||||
}
|
||||
@@ -187,4 +216,36 @@ public class DocumentParseBridgeServiceImplTest {
|
||||
return response;
|
||||
}
|
||||
}
|
||||
|
||||
private static class FakePptxDocumentParseService implements PptxDocumentParseService {
|
||||
|
||||
private int parseCallCount;
|
||||
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
parseCallCount++;
|
||||
ParseResult result = new ParseResult();
|
||||
result.setFileName("slides.pptx");
|
||||
result.setMarkdown("# pptx");
|
||||
result.setPlainText("pptx");
|
||||
ParseResponse response = new ParseResponse();
|
||||
response.setResults(Collections.singletonList(result));
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus submit(ParseRequest request) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package tech.easyflow.ai.document.support;
|
||||
|
||||
import com.easyagents.document.core.entity.ParseRequest;
|
||||
import com.easyagents.document.core.entity.PdfParseRequest;
|
||||
import com.easyagents.document.core.entity.PptxParseRequest;
|
||||
import com.easyagents.document.core.entity.XlsxParseRequest;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import tech.easyflow.ai.document.model.DocumentParseScenario;
|
||||
@@ -26,6 +29,7 @@ public class DocumentParseRequestFactoryTest {
|
||||
Assert.assertFalse(request.getReturnMiddleJson());
|
||||
Assert.assertFalse(request.getReturnContentList());
|
||||
Assert.assertFalse(request.getReturnImages());
|
||||
Assert.assertTrue(request instanceof PdfParseRequest);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -41,12 +45,33 @@ public class DocumentParseRequestFactoryTest {
|
||||
Assert.assertTrue(request.getReturnMiddleJson());
|
||||
Assert.assertTrue(request.getReturnContentList());
|
||||
Assert.assertTrue(request.getReturnImages());
|
||||
Assert.assertTrue(request instanceof PdfParseRequest);
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证 PPTX / XLSX 会构建对应的强类型请求。
|
||||
*/
|
||||
@Test
|
||||
public void shouldBuildOfficeTypedRequests() {
|
||||
DocumentParseRequestFactory factory = new DocumentParseRequestFactory();
|
||||
|
||||
ParseRequest pptxRequest = factory.build(buildSource("slides.pptx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"), DocumentParseScenario.KNOWLEDGE_IMPORT);
|
||||
ParseRequest xlsxRequest = factory.build(buildSource("table.xlsx",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), DocumentParseScenario.KNOWLEDGE_IMPORT);
|
||||
|
||||
Assert.assertTrue(pptxRequest instanceof PptxParseRequest);
|
||||
Assert.assertTrue(xlsxRequest instanceof XlsxParseRequest);
|
||||
}
|
||||
|
||||
private LoadedDocumentSource buildSource() {
|
||||
return buildSource("demo.pdf", "application/pdf");
|
||||
}
|
||||
|
||||
private LoadedDocumentSource buildSource(String fileName, String contentType) {
|
||||
LoadedDocumentSource source = new LoadedDocumentSource();
|
||||
source.setFileName("demo.pdf");
|
||||
source.setContentType("application/pdf");
|
||||
source.setFileName(fileName);
|
||||
source.setContentType(contentType);
|
||||
source.setContentBytes("pdf-data".getBytes());
|
||||
source.setSize(8L);
|
||||
return source;
|
||||
|
||||
@@ -4,6 +4,7 @@ import com.easyagents.document.core.entity.ParseArtifacts;
|
||||
import com.easyagents.document.core.entity.ParseResult;
|
||||
import com.easyagents.document.core.entity.ParseResponse;
|
||||
import com.easyagents.document.core.entity.ParseTaskInfo;
|
||||
import com.easyagents.document.core.entity.ParseTaskStatus;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import tech.easyflow.ai.document.model.DocumentParseTaskInfo;
|
||||
@@ -65,6 +66,8 @@ public class DocumentParseResultMapperTest {
|
||||
ParseTaskInfo taskInfo = new ParseTaskInfo();
|
||||
taskInfo.setTaskId("task-1");
|
||||
taskInfo.setStatus("completed");
|
||||
taskInfo.setProgressPercent(100);
|
||||
taskInfo.setCurrentStage("completed");
|
||||
|
||||
ParseResult result = new ParseResult();
|
||||
result.setFileName("demo.pdf");
|
||||
@@ -76,7 +79,33 @@ public class DocumentParseResultMapperTest {
|
||||
DocumentParseTaskInfo mapped = mapper.map(taskInfo);
|
||||
|
||||
Assert.assertEquals("task-1", mapped.getTaskId());
|
||||
Assert.assertEquals(Integer.valueOf(100), mapped.getProgressPercent());
|
||||
Assert.assertEquals("completed", mapped.getCurrentStage());
|
||||
Assert.assertNotNull(mapped.getResult());
|
||||
Assert.assertEquals("# title", mapped.getResult().getPreferredText());
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证异步进度字段被完整透传。
|
||||
*/
|
||||
@Test
|
||||
public void shouldMapTaskStatusProgressFields() {
|
||||
DocumentParseResultMapper mapper = new DocumentParseResultMapper();
|
||||
ParseTaskStatus status = new ParseTaskStatus();
|
||||
status.setTaskId("task-2");
|
||||
status.setStatus("running");
|
||||
status.setProgressPercent(45);
|
||||
status.setCurrentStage("ocr");
|
||||
status.setProcessedItems(9);
|
||||
status.setTotalItems(20);
|
||||
status.setStatusMessage("正在识别图片");
|
||||
|
||||
tech.easyflow.ai.document.model.DocumentParseTaskStatus mapped = mapper.map(status);
|
||||
|
||||
Assert.assertEquals(Integer.valueOf(45), mapped.getProgressPercent());
|
||||
Assert.assertEquals("ocr", mapped.getCurrentStage());
|
||||
Assert.assertEquals(Integer.valueOf(9), mapped.getProcessedItems());
|
||||
Assert.assertEquals(Integer.valueOf(20), mapped.getTotalItems());
|
||||
Assert.assertEquals("正在识别图片", mapped.getStatusMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,17 +1,33 @@
|
||||
package tech.easyflow.ai.documentimport.task;
|
||||
|
||||
import com.easyagents.document.core.entity.DocumentBlock;
|
||||
import com.easyagents.document.core.entity.DocumentImage;
|
||||
import com.easyagents.document.core.entity.DocumentTable;
|
||||
import com.easyagents.rag.ingestion.model.StrategyConfig;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import tech.easyflow.ai.document.model.DocumentParseArtifacts;
|
||||
import tech.easyflow.ai.document.model.DocumentParsedResult;
|
||||
import tech.easyflow.ai.documentimport.DocumentImportKeys;
|
||||
import tech.easyflow.ai.entity.DocumentChunk;
|
||||
import tech.easyflow.ai.entity.DocumentImportTask;
|
||||
import tech.easyflow.ai.enums.DocumentImportTaskStatus;
|
||||
import tech.easyflow.ai.enums.DocumentProcessStatus;
|
||||
import tech.easyflow.ai.mapper.DocumentMapper;
|
||||
import tech.easyflow.ai.service.DocumentImportTaskService;
|
||||
import tech.easyflow.common.filestorage.FileStorageService;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Method;
|
||||
import java.lang.reflect.Proxy;
|
||||
import java.math.BigInteger;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Base64;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
/**
|
||||
@@ -84,6 +100,258 @@ public class KnowledgeDocumentImportTaskAppServiceTest {
|
||||
Assert.assertEquals("新错误", updatedTask.getErrorSummary());
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证知识库导入会把解析图片上传到对象存储,并同步改写 Markdown 与结构化引用。
|
||||
*
|
||||
* @throws Exception 反射调用异常
|
||||
*/
|
||||
@Test
|
||||
public void normalizeParsedImagesForKnowledgeImportShouldUploadAndRewriteReferences() throws Exception {
|
||||
KnowledgeDocumentImportTaskAppService service = new KnowledgeDocumentImportTaskAppService();
|
||||
AtomicReference<String> savedPrePathRef = new AtomicReference<String>();
|
||||
AtomicReference<String> savedFilenameRef = new AtomicReference<String>();
|
||||
setField(service, "storageService", mockFileStorageService(savedPrePathRef, savedFilenameRef));
|
||||
|
||||
tech.easyflow.ai.entity.Document document = new tech.easyflow.ai.entity.Document();
|
||||
document.setId(BigInteger.valueOf(88));
|
||||
document.setTitle("产品说明书(终版).pdf");
|
||||
|
||||
DocumentParsedResult parsedResult = new DocumentParsedResult();
|
||||
parsedResult.setMarkdown("图例如下:\n");
|
||||
parsedResult.setPreferredText(parsedResult.getMarkdown());
|
||||
parsedResult.setPlainText(parsedResult.getMarkdown());
|
||||
|
||||
DocumentImage image = new DocumentImage();
|
||||
image.setName("sample-image.png");
|
||||
image.setSourcePath("images/sample-image.png");
|
||||
image.setMimeType("image/png");
|
||||
image.setDataUrl("data:image/png;base64," + Base64.getEncoder().encodeToString("demo".getBytes(StandardCharsets.UTF_8)));
|
||||
parsedResult.setImages(new ArrayList<DocumentImage>(List.of(image)));
|
||||
|
||||
DocumentBlock block = new DocumentBlock();
|
||||
block.setImagePath("images/sample-image.png");
|
||||
parsedResult.setBlocks(new ArrayList<DocumentBlock>(List.of(block)));
|
||||
|
||||
DocumentTable table = new DocumentTable();
|
||||
table.setImagePath("images/sample-image.png");
|
||||
parsedResult.setTables(new ArrayList<DocumentTable>(List.of(table)));
|
||||
|
||||
DocumentParseArtifacts artifacts = new DocumentParseArtifacts();
|
||||
List<Map<String, Object>> contentList = new ArrayList<Map<String, Object>>();
|
||||
Map<String, Object> contentItem = new LinkedHashMap<String, Object>();
|
||||
contentItem.put("img_path", "images/sample-image.png");
|
||||
contentList.add(contentItem);
|
||||
artifacts.setContentList(contentList);
|
||||
Map<String, Object> xlsxArtifact = new LinkedHashMap<String, Object>();
|
||||
List<Map<String, Object>> sheetImages = new ArrayList<Map<String, Object>>();
|
||||
sheetImages.add(new LinkedHashMap<String, Object>() {{
|
||||
put("sheetName", "Sheet1");
|
||||
put("sourcePaths", new ArrayList<String>(List.of("images/sample-image.png")));
|
||||
}});
|
||||
xlsxArtifact.put("sheetImages", sheetImages);
|
||||
artifacts.setExtraJsonArtifacts(new LinkedHashMap<String, Object>() {{
|
||||
put("xlsx", xlsxArtifact);
|
||||
}});
|
||||
parsedResult.setArtifacts(artifacts);
|
||||
|
||||
Method method = KnowledgeDocumentImportTaskAppService.class.getDeclaredMethod(
|
||||
"normalizeParsedImagesForKnowledgeImport",
|
||||
tech.easyflow.ai.entity.Document.class,
|
||||
DocumentParsedResult.class
|
||||
);
|
||||
method.setAccessible(true);
|
||||
DocumentParsedResult normalized = (DocumentParsedResult) method.invoke(service, document, parsedResult);
|
||||
|
||||
Assert.assertNotNull(normalized);
|
||||
Assert.assertEquals("knowledge-parse/88_产品说明书_终版/images", savedPrePathRef.get());
|
||||
Assert.assertEquals("sample-image.png", savedFilenameRef.get());
|
||||
|
||||
String expectedUrl = "http://localhost:39000/easyflow/attachment/knowledge-parse/88_产品说明书_终版/images/sample-image.png";
|
||||
Assert.assertTrue(normalized.getMarkdown().contains(expectedUrl));
|
||||
Assert.assertEquals(expectedUrl, normalized.getBlocks().get(0).getImagePath());
|
||||
Assert.assertEquals(expectedUrl, normalized.getTables().get(0).getImagePath());
|
||||
Assert.assertEquals(expectedUrl, normalized.getImages().get(0).getSourcePath());
|
||||
Assert.assertNull(normalized.getImages().get(0).getDataUrl());
|
||||
Object rewrittenContentList = normalized.getArtifacts().getContentList();
|
||||
Assert.assertTrue(rewrittenContentList instanceof List<?>);
|
||||
Assert.assertEquals(expectedUrl, ((Map<?, ?>) ((List<?>) rewrittenContentList).get(0)).get("img_path"));
|
||||
Object rewrittenSheetImages = ((Map<?, ?>) normalized.getArtifacts().getExtraJsonArtifacts().get("xlsx")).get("sheetImages");
|
||||
Assert.assertTrue(rewrittenSheetImages instanceof List<?>);
|
||||
Object sourcePaths = ((Map<?, ?>) ((List<?>) rewrittenSheetImages).get(0)).get("sourcePaths");
|
||||
Assert.assertEquals(expectedUrl, ((List<?>) sourcePaths).get(0));
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证 PPTX 会基于页级工件生成稳定的知识库分块。
|
||||
*
|
||||
* @throws Exception 反射调用异常
|
||||
*/
|
||||
@Test
|
||||
public void buildOfficeDocumentChunksShouldSplitPptxBySlide() throws Exception {
|
||||
KnowledgeDocumentImportTaskAppService service = new KnowledgeDocumentImportTaskAppService();
|
||||
tech.easyflow.ai.entity.Document document = new tech.easyflow.ai.entity.Document();
|
||||
document.setId(BigInteger.valueOf(101));
|
||||
document.setCollectionId(BigInteger.valueOf(201));
|
||||
document.setTitle("季度汇报.pptx");
|
||||
|
||||
Map<String, Object> parseArtifactSummary = new LinkedHashMap<String, Object>();
|
||||
List<Map<String, Object>> slides = new ArrayList<Map<String, Object>>();
|
||||
slides.add(new LinkedHashMap<String, Object>() {{
|
||||
put("slideIndex", 0);
|
||||
put("title", "封面");
|
||||
put("ocrMarkdown", "本页介绍季度目标。");
|
||||
put("imagePath", "https://example.com/slides/slide-001.png");
|
||||
put("imageName", "slide-001-page");
|
||||
}});
|
||||
slides.add(new LinkedHashMap<String, Object>() {{
|
||||
put("slideIndex", 1);
|
||||
put("title", "经营分析");
|
||||
put("ocrMarkdown", "收入同比增长 18%。");
|
||||
put("imagePath", "https://example.com/slides/slide-002.png");
|
||||
put("imageName", "slide-002-page");
|
||||
}});
|
||||
parseArtifactSummary.put("slides", slides);
|
||||
|
||||
Method method = KnowledgeDocumentImportTaskAppService.class.getDeclaredMethod(
|
||||
"buildOfficeDocumentChunks",
|
||||
tech.easyflow.ai.entity.Document.class,
|
||||
String.class,
|
||||
StrategyConfig.class,
|
||||
Map.class
|
||||
);
|
||||
method.setAccessible(true);
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
List<DocumentChunk> chunks = (List<DocumentChunk>) method.invoke(
|
||||
service,
|
||||
document,
|
||||
"pptx",
|
||||
null,
|
||||
parseArtifactSummary
|
||||
);
|
||||
|
||||
Assert.assertEquals(2, chunks.size());
|
||||
DocumentChunk firstChunk = chunks.get(0);
|
||||
Assert.assertTrue(firstChunk.getContent().contains("Slide 1"));
|
||||
Assert.assertTrue(firstChunk.getContent().contains("本页介绍季度目标"));
|
||||
Assert.assertEquals("https://example.com/slides/slide-001.png",
|
||||
((List<?>) firstChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_IMAGE_REFS)).get(0));
|
||||
Assert.assertEquals(1, firstChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_PAGE_INDEX));
|
||||
Assert.assertTrue(String.valueOf(firstChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN))
|
||||
.contains("slide-001.png"));
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证 XLSX 纯图片 Sheet 不会退化为空内容,并会输出稳定图片引用。
|
||||
*
|
||||
* @throws Exception 反射调用异常
|
||||
*/
|
||||
@Test
|
||||
public void buildOfficeDocumentChunksShouldKeepImageOnlyXlsxSheetReferences() throws Exception {
|
||||
KnowledgeDocumentImportTaskAppService service = new KnowledgeDocumentImportTaskAppService();
|
||||
tech.easyflow.ai.entity.Document document = new tech.easyflow.ai.entity.Document();
|
||||
document.setId(BigInteger.valueOf(102));
|
||||
document.setCollectionId(BigInteger.valueOf(202));
|
||||
document.setTitle("巡检记录.xlsx");
|
||||
|
||||
Map<String, Object> parseArtifactSummary = new LinkedHashMap<String, Object>();
|
||||
List<Map<String, Object>> sheets = new ArrayList<Map<String, Object>>();
|
||||
sheets.add(new LinkedHashMap<String, Object>() {{
|
||||
put("sheetName", "图片页");
|
||||
put("sheetIndex", 0);
|
||||
put("rows", new ArrayList<Map<String, Object>>());
|
||||
}});
|
||||
parseArtifactSummary.put("sheets", sheets);
|
||||
|
||||
List<Map<String, Object>> cellImages = new ArrayList<Map<String, Object>>();
|
||||
cellImages.add(new LinkedHashMap<String, Object>() {{
|
||||
put("sheetName", "图片页");
|
||||
put("referenceKey", "image-sheet-r2c2-001");
|
||||
put("sourcePath", "https://example.com/xlsx/sheet/image-001.jpeg");
|
||||
put("anchorCell", "B2");
|
||||
put("ocrText", "设备状态正常");
|
||||
put("fromRow", 1);
|
||||
}});
|
||||
parseArtifactSummary.put("cellImages", cellImages);
|
||||
|
||||
StrategyConfig strategyConfig = new StrategyConfig();
|
||||
strategyConfig.setRowsPerChunk(10);
|
||||
|
||||
Method method = KnowledgeDocumentImportTaskAppService.class.getDeclaredMethod(
|
||||
"buildOfficeDocumentChunks",
|
||||
tech.easyflow.ai.entity.Document.class,
|
||||
String.class,
|
||||
StrategyConfig.class,
|
||||
Map.class
|
||||
);
|
||||
method.setAccessible(true);
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
List<DocumentChunk> chunks = (List<DocumentChunk>) method.invoke(
|
||||
service,
|
||||
document,
|
||||
"xlsx",
|
||||
strategyConfig,
|
||||
parseArtifactSummary
|
||||
);
|
||||
|
||||
Assert.assertEquals(1, chunks.size());
|
||||
DocumentChunk onlyChunk = chunks.get(0);
|
||||
Assert.assertTrue(onlyChunk.getContent().contains("图片 OCR"));
|
||||
Assert.assertTrue(onlyChunk.getContent().contains("设备状态正常"));
|
||||
Assert.assertEquals("图片页", onlyChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_SHEET_NAME));
|
||||
Assert.assertEquals("https://example.com/xlsx/sheet/image-001.jpeg",
|
||||
((List<?>) onlyChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_IMAGE_REFS)).get(0));
|
||||
String renderMarkdown = String.valueOf(onlyChunk.getOptions().get(DocumentImportKeys.KEY_DOCUMENT_RENDER_MARKDOWN));
|
||||
Assert.assertTrue(renderMarkdown.contains("[IMG:image-sheet-r2c2-001]"));
|
||||
Assert.assertTrue(renderMarkdown.contains(""));
|
||||
}
|
||||
|
||||
/**
|
||||
* 验证空白 Sheet 不会被误判成纯图片分块。
|
||||
*
|
||||
* @throws Exception 反射调用异常
|
||||
*/
|
||||
@Test
|
||||
public void buildOfficeDocumentChunksShouldSkipBlankXlsxSheetWithoutImages() throws Exception {
|
||||
KnowledgeDocumentImportTaskAppService service = new KnowledgeDocumentImportTaskAppService();
|
||||
tech.easyflow.ai.entity.Document document = new tech.easyflow.ai.entity.Document();
|
||||
document.setId(BigInteger.valueOf(103));
|
||||
document.setCollectionId(BigInteger.valueOf(203));
|
||||
document.setTitle("空白工作簿.xlsx");
|
||||
|
||||
Map<String, Object> parseArtifactSummary = new LinkedHashMap<String, Object>();
|
||||
parseArtifactSummary.put("sheets", new ArrayList<Map<String, Object>>(List.of(new LinkedHashMap<String, Object>() {{
|
||||
put("sheetName", "空白页");
|
||||
put("sheetIndex", 0);
|
||||
put("rows", new ArrayList<Map<String, Object>>());
|
||||
}})));
|
||||
parseArtifactSummary.put("cellImages", new ArrayList<Map<String, Object>>());
|
||||
|
||||
StrategyConfig strategyConfig = new StrategyConfig();
|
||||
strategyConfig.setRowsPerChunk(10);
|
||||
|
||||
Method method = KnowledgeDocumentImportTaskAppService.class.getDeclaredMethod(
|
||||
"buildOfficeDocumentChunks",
|
||||
tech.easyflow.ai.entity.Document.class,
|
||||
String.class,
|
||||
StrategyConfig.class,
|
||||
Map.class
|
||||
);
|
||||
method.setAccessible(true);
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
List<DocumentChunk> chunks = (List<DocumentChunk>) method.invoke(
|
||||
service,
|
||||
document,
|
||||
"xlsx",
|
||||
strategyConfig,
|
||||
parseArtifactSummary
|
||||
);
|
||||
|
||||
Assert.assertTrue(chunks.isEmpty());
|
||||
}
|
||||
|
||||
private static DocumentMapper mockDocumentMapper(tech.easyflow.ai.entity.Document persistedDocument,
|
||||
AtomicReference<tech.easyflow.ai.entity.Document> updatedDocumentRef) {
|
||||
return (DocumentMapper) Proxy.newProxyInstance(
|
||||
@@ -116,6 +384,22 @@ public class KnowledgeDocumentImportTaskAppServiceTest {
|
||||
);
|
||||
}
|
||||
|
||||
private static FileStorageService mockFileStorageService(AtomicReference<String> savedPrePathRef,
|
||||
AtomicReference<String> savedFilenameRef) {
|
||||
return (FileStorageService) Proxy.newProxyInstance(
|
||||
FileStorageService.class.getClassLoader(),
|
||||
new Class<?>[]{FileStorageService.class},
|
||||
(proxy, method, args) -> {
|
||||
if ("save".equals(method.getName()) && args != null && args.length == 2 && args[0] instanceof MultipartFile file) {
|
||||
savedPrePathRef.set((String) args[1]);
|
||||
savedFilenameRef.set(file.getOriginalFilename());
|
||||
return "http://localhost:39000/easyflow/attachment/" + args[1] + "/" + file.getOriginalFilename();
|
||||
}
|
||||
return defaultValue(method.getReturnType());
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
private static void setField(Object target, String fieldName, Object value) throws Exception {
|
||||
Field field = KnowledgeDocumentImportTaskAppService.class.getDeclaredField(fieldName);
|
||||
field.setAccessible(true);
|
||||
|
||||
Reference in New Issue
Block a user