feat: 增强知识库分块策略流程

- 增加导入分析预览提交与预览态缓存键

- 支持知识库分块策略配置与分块预览

- 重构知识库导入与确认导入前端流程
This commit is contained in:
2026-03-29 17:27:12 +08:00
parent 22ceabff96
commit b6213d0933
11 changed files with 2078 additions and 600 deletions

View File

@@ -0,0 +1,555 @@
package tech.easyflow.ai.documentimport;
import com.easyagents.rag.core.RagChunk;
import com.easyagents.rag.ingestion.model.AnalysisResult;
import com.easyagents.rag.ingestion.model.StrategyConfig;
import tech.easyflow.ai.entity.Document;
import tech.easyflow.ai.entity.DocumentChunk;
import java.io.Serializable;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
public final class DocumentImportDtos {
private DocumentImportDtos() {
}
/**
 * Identifies a single uploaded file by its storage path and its display name.
 */
public static class FileItem implements Serializable {

    /** Path under which the uploaded file can be read back. */
    private String filePath;

    /** File name as supplied by the client. */
    private String fileName;

    /** @return the storage path of the file */
    public String getFilePath() {
        return this.filePath;
    }

    /** @param filePath the storage path of the file */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /** @return the file name */
    public String getFileName() {
        return this.fileName;
    }

    /** @param fileName the file name */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }
}
/**
 * Request payload for the import analysis step: the target knowledge base
 * and the uploaded files to analyze.
 */
public static class AnalyzeRequest implements Serializable {

    /** Identifier of the knowledge base the files are imported into. */
    private BigInteger knowledgeId;

    /** Files to analyze; starts out as an empty, mutable list. */
    private List<FileItem> files = new ArrayList<>();

    /** @return the knowledge base identifier */
    public BigInteger getKnowledgeId() {
        return this.knowledgeId;
    }

    /** @param knowledgeId the knowledge base identifier */
    public void setKnowledgeId(BigInteger knowledgeId) {
        this.knowledgeId = knowledgeId;
    }

    /** @return the files to analyze (may be {@code null} if set so by the caller) */
    public List<FileItem> getFiles() {
        return this.files;
    }

    /** @param files the files to analyze */
    public void setFiles(List<FileItem> files) {
        this.files = files;
    }
}
/**
 * One file of a preview request together with the chunking strategy
 * configuration requested for it.
 */
public static class PreviewFileRequest implements Serializable {

    /** Path under which the uploaded file can be read back. */
    private String filePath;

    /** File name as supplied by the client. */
    private String fileName;

    /** Requested strategy configuration; initialised from {@code StrategyConfig.defaults()}. */
    private StrategyConfig strategyConfig = StrategyConfig.defaults();

    /** @return the storage path of the file */
    public String getFilePath() {
        return this.filePath;
    }

    /** @param filePath the storage path of the file */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /** @return the file name */
    public String getFileName() {
        return this.fileName;
    }

    /** @param fileName the file name */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** @return the requested strategy configuration */
    public StrategyConfig getStrategyConfig() {
        return this.strategyConfig;
    }

    /** @param strategyConfig the requested strategy configuration */
    public void setStrategyConfig(StrategyConfig strategyConfig) {
        this.strategyConfig = strategyConfig;
    }
}
/**
 * Request payload for the chunk preview step: the target knowledge base and
 * the files (with their strategy configuration) to preview.
 */
public static class PreviewRequest implements Serializable {

    /** Identifier of the knowledge base the files are imported into. */
    private BigInteger knowledgeId;

    /** Files to preview; starts out as an empty, mutable list. */
    private List<PreviewFileRequest> files = new ArrayList<>();

    /** @return the knowledge base identifier */
    public BigInteger getKnowledgeId() {
        return this.knowledgeId;
    }

    /** @param knowledgeId the knowledge base identifier */
    public void setKnowledgeId(BigInteger knowledgeId) {
        this.knowledgeId = knowledgeId;
    }

    /** @return the files to preview */
    public List<PreviewFileRequest> getFiles() {
        return this.files;
    }

    /** @param files the files to preview */
    public void setFiles(List<PreviewFileRequest> files) {
        this.files = files;
    }
}
/**
 * Request payload for the commit step: the target knowledge base and the
 * identifiers of the preview sessions to commit.
 */
public static class CommitRequest implements Serializable {

    /** Identifier of the knowledge base the previews belong to. */
    private BigInteger knowledgeId;

    /** Preview session identifiers to commit; starts out as an empty, mutable list. */
    private List<String> previewSessionIds = new ArrayList<>();

    /** @return the knowledge base identifier */
    public BigInteger getKnowledgeId() {
        return this.knowledgeId;
    }

    /** @param knowledgeId the knowledge base identifier */
    public void setKnowledgeId(BigInteger knowledgeId) {
        this.knowledgeId = knowledgeId;
    }

    /** @return the preview session identifiers */
    public List<String> getPreviewSessionIds() {
        return this.previewSessionIds;
    }

    /** @param previewSessionIds the preview session identifiers */
    public void setPreviewSessionIds(List<String> previewSessionIds) {
        this.previewSessionIds = previewSessionIds;
    }
}
/**
 * Request payload carrying the chunking (splitter) settings of a knowledge
 * base: default and fallback strategy codes, the auto-recommend switch and
 * the per-strategy profiles.
 */
public static class SplitterProfileSaveRequest implements Serializable {

    /** Identifier of the knowledge base the settings apply to. */
    private BigInteger knowledgeId;

    /** Code of the default strategy. */
    private String defaultStrategyCode;

    /** Whether automatic strategy recommendation is enabled. */
    private Boolean autoRecommendEnabled;

    /** Code of the fallback strategy. */
    private String fallbackStrategyCode;

    /** Strategy profiles in insertion order; starts out as an empty, mutable map. */
    private Map<String, Object> strategyProfiles = new LinkedHashMap<>();

    /** @return the knowledge base identifier */
    public BigInteger getKnowledgeId() {
        return this.knowledgeId;
    }

    /** @param knowledgeId the knowledge base identifier */
    public void setKnowledgeId(BigInteger knowledgeId) {
        this.knowledgeId = knowledgeId;
    }

    /** @return the default strategy code */
    public String getDefaultStrategyCode() {
        return this.defaultStrategyCode;
    }

    /** @param defaultStrategyCode the default strategy code */
    public void setDefaultStrategyCode(String defaultStrategyCode) {
        this.defaultStrategyCode = defaultStrategyCode;
    }

    /** @return the auto-recommend switch, possibly {@code null} */
    public Boolean getAutoRecommendEnabled() {
        return this.autoRecommendEnabled;
    }

    /** @param autoRecommendEnabled the auto-recommend switch */
    public void setAutoRecommendEnabled(Boolean autoRecommendEnabled) {
        this.autoRecommendEnabled = autoRecommendEnabled;
    }

    /** @return the fallback strategy code */
    public String getFallbackStrategyCode() {
        return this.fallbackStrategyCode;
    }

    /** @param fallbackStrategyCode the fallback strategy code */
    public void setFallbackStrategyCode(String fallbackStrategyCode) {
        this.fallbackStrategyCode = fallbackStrategyCode;
    }

    /** @return the strategy profiles */
    public Map<String, Object> getStrategyProfiles() {
        return this.strategyProfiles;
    }

    /** @param strategyProfiles the strategy profiles */
    public void setStrategyProfiles(Map<String, Object> strategyProfiles) {
        this.strategyProfiles = strategyProfiles;
    }
}
/**
 * Analysis outcome for one file: the file identity, the analysis result and
 * the strategy configuration resolved for it.
 */
public static class AnalyzeItem implements Serializable {

    /** Path under which the uploaded file can be read back. */
    private String filePath;

    /** File name as supplied by the client. */
    private String fileName;

    /** Result of analyzing the file content. */
    private AnalysisResult analysis;

    /** Strategy configuration for the file; initialised from {@code StrategyConfig.defaults()}. */
    private StrategyConfig strategyConfig = StrategyConfig.defaults();

    /** @return the storage path of the file */
    public String getFilePath() {
        return this.filePath;
    }

    /** @param filePath the storage path of the file */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /** @return the file name */
    public String getFileName() {
        return this.fileName;
    }

    /** @param fileName the file name */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** @return the analysis result */
    public AnalysisResult getAnalysis() {
        return this.analysis;
    }

    /** @param analysis the analysis result */
    public void setAnalysis(AnalysisResult analysis) {
        this.analysis = analysis;
    }

    /** @return the strategy configuration */
    public StrategyConfig getStrategyConfig() {
        return this.strategyConfig;
    }

    /** @param strategyConfig the strategy configuration */
    public void setStrategyConfig(StrategyConfig strategyConfig) {
        this.strategyConfig = strategyConfig;
    }
}
/**
 * Response of the import analysis step: the per-file items and their count.
 */
public static class AnalyzeResponse implements Serializable {

    /** Number of analyzed items. */
    private Integer total;

    /** Per-file analysis items; starts out as an empty, mutable list. */
    private List<AnalyzeItem> items = new ArrayList<>();

    /** @return the number of analyzed items */
    public Integer getTotal() {
        return this.total;
    }

    /** @param total the number of analyzed items */
    public void setTotal(Integer total) {
        this.total = total;
    }

    /** @return the per-file analysis items */
    public List<AnalyzeItem> getItems() {
        return this.items;
    }

    /** @param items the per-file analysis items */
    public void setItems(List<AnalyzeItem> items) {
        this.items = items;
    }
}
/**
 * Preview outcome for one file: the preview session identifier, the applied
 * strategy, the analysis result and the generated chunks with their totals.
 */
public static class PreviewFileResult implements Serializable {

    /** Identifier of the cached preview session for this file. */
    private String previewSessionId;

    /** Path under which the uploaded file can be read back. */
    private String filePath;

    /** File name as supplied by the client. */
    private String fileName;

    /** Code of the strategy used for the preview. */
    private String strategyCode;

    /** Display label of the strategy used for the preview. */
    private String strategyLabel;

    /** Result of analyzing the file content. */
    private AnalysisResult analysis;

    /** Number of preview chunks. */
    private Integer totalChunks;

    /** Number of warnings across all preview chunks. */
    private Integer totalWarnings;

    /** Preview chunks; starts out as an empty, mutable list. */
    private List<RagChunk> chunks = new ArrayList<>();

    /** @return the preview session identifier */
    public String getPreviewSessionId() {
        return this.previewSessionId;
    }

    /** @param previewSessionId the preview session identifier */
    public void setPreviewSessionId(String previewSessionId) {
        this.previewSessionId = previewSessionId;
    }

    /** @return the storage path of the file */
    public String getFilePath() {
        return this.filePath;
    }

    /** @param filePath the storage path of the file */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /** @return the file name */
    public String getFileName() {
        return this.fileName;
    }

    /** @param fileName the file name */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** @return the strategy code */
    public String getStrategyCode() {
        return this.strategyCode;
    }

    /** @param strategyCode the strategy code */
    public void setStrategyCode(String strategyCode) {
        this.strategyCode = strategyCode;
    }

    /** @return the strategy label */
    public String getStrategyLabel() {
        return this.strategyLabel;
    }

    /** @param strategyLabel the strategy label */
    public void setStrategyLabel(String strategyLabel) {
        this.strategyLabel = strategyLabel;
    }

    /** @return the analysis result */
    public AnalysisResult getAnalysis() {
        return this.analysis;
    }

    /** @param analysis the analysis result */
    public void setAnalysis(AnalysisResult analysis) {
        this.analysis = analysis;
    }

    /** @return the number of preview chunks */
    public Integer getTotalChunks() {
        return this.totalChunks;
    }

    /** @param totalChunks the number of preview chunks */
    public void setTotalChunks(Integer totalChunks) {
        this.totalChunks = totalChunks;
    }

    /** @return the number of warnings */
    public Integer getTotalWarnings() {
        return this.totalWarnings;
    }

    /** @param totalWarnings the number of warnings */
    public void setTotalWarnings(Integer totalWarnings) {
        this.totalWarnings = totalWarnings;
    }

    /** @return the preview chunks */
    public List<RagChunk> getChunks() {
        return this.chunks;
    }

    /** @param chunks the preview chunks */
    public void setChunks(List<RagChunk> chunks) {
        this.chunks = chunks;
    }
}
/**
 * Response of the chunk preview step: per-file results plus file and chunk totals.
 */
public static class PreviewResponse implements Serializable {

    /** Number of previewed files. */
    private Integer totalFiles;

    /** Number of chunks across all previewed files. */
    private Integer totalChunks;

    /** Per-file preview results; starts out as an empty, mutable list. */
    private List<PreviewFileResult> items = new ArrayList<>();

    /** @return the number of previewed files */
    public Integer getTotalFiles() {
        return this.totalFiles;
    }

    /** @param totalFiles the number of previewed files */
    public void setTotalFiles(Integer totalFiles) {
        this.totalFiles = totalFiles;
    }

    /** @return the number of chunks across all files */
    public Integer getTotalChunks() {
        return this.totalChunks;
    }

    /** @param totalChunks the number of chunks across all files */
    public void setTotalChunks(Integer totalChunks) {
        this.totalChunks = totalChunks;
    }

    /** @return the per-file preview results */
    public List<PreviewFileResult> getItems() {
        return this.items;
    }

    /** @param items the per-file preview results */
    public void setItems(List<PreviewFileResult> items) {
        this.items = items;
    }
}
/**
 * Commit outcome for one preview session: success flag, failure reason and,
 * on success, the created document identifier and chunk count.
 */
public static class CommitFileResult implements Serializable {

    /** Identifier of the preview session that was committed. */
    private String previewSessionId;

    /** Name of the file the session belongs to. */
    private String fileName;

    /** Whether the commit succeeded. */
    private Boolean success;

    /** Failure reason when the commit did not succeed. */
    private String reason;

    /** Identifier of the stored document. */
    private BigInteger documentId;

    /** Number of chunks stored for the document. */
    private Integer chunkCount;

    /** @return the preview session identifier */
    public String getPreviewSessionId() {
        return this.previewSessionId;
    }

    /** @param previewSessionId the preview session identifier */
    public void setPreviewSessionId(String previewSessionId) {
        this.previewSessionId = previewSessionId;
    }

    /** @return the file name */
    public String getFileName() {
        return this.fileName;
    }

    /** @param fileName the file name */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** @return the success flag, possibly {@code null} */
    public Boolean getSuccess() {
        return this.success;
    }

    /** @param success the success flag */
    public void setSuccess(Boolean success) {
        this.success = success;
    }

    /** @return the failure reason */
    public String getReason() {
        return this.reason;
    }

    /** @param reason the failure reason */
    public void setReason(String reason) {
        this.reason = reason;
    }

    /** @return the document identifier */
    public BigInteger getDocumentId() {
        return this.documentId;
    }

    /** @param documentId the document identifier */
    public void setDocumentId(BigInteger documentId) {
        this.documentId = documentId;
    }

    /** @return the chunk count */
    public Integer getChunkCount() {
        return this.chunkCount;
    }

    /** @param chunkCount the chunk count */
    public void setChunkCount(Integer chunkCount) {
        this.chunkCount = chunkCount;
    }
}
/**
 * Response of the commit step: per-session results plus total, success and
 * error counts.
 */
public static class CommitResponse implements Serializable {

    /** Number of processed preview sessions. */
    private Integer totalFiles;

    /** Number of sessions committed successfully. */
    private Integer successCount;

    /** Number of sessions that failed to commit. */
    private Integer errorCount;

    /** Per-session commit results; starts out as an empty, mutable list. */
    private List<CommitFileResult> results = new ArrayList<>();

    /** @return the number of processed sessions */
    public Integer getTotalFiles() {
        return this.totalFiles;
    }

    /** @param totalFiles the number of processed sessions */
    public void setTotalFiles(Integer totalFiles) {
        this.totalFiles = totalFiles;
    }

    /** @return the number of successful commits */
    public Integer getSuccessCount() {
        return this.successCount;
    }

    /** @param successCount the number of successful commits */
    public void setSuccessCount(Integer successCount) {
        this.successCount = successCount;
    }

    /** @return the number of failed commits */
    public Integer getErrorCount() {
        return this.errorCount;
    }

    /** @param errorCount the number of failed commits */
    public void setErrorCount(Integer errorCount) {
        this.errorCount = errorCount;
    }

    /** @return the per-session commit results */
    public List<CommitFileResult> getResults() {
        return this.results;
    }

    /** @param results the per-session commit results */
    public void setResults(List<CommitFileResult> results) {
        this.results = results;
    }
}
/**
 * Server-side state of one file preview: the file identity, the resolved
 * strategy, the analysis result, the prepared document with its chunks and
 * the preview chunks returned to the client.
 *
 * <p>Instances are stored in a cache between the preview and commit steps, so
 * the class declares an explicit {@code serialVersionUID}. Without it the
 * compiler-derived UID changes whenever the class shape changes, which would
 * make entries written by an older build unreadable when Java serialization
 * is used.
 */
public static class PreviewSession implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Identifier under which this session is cached. */
    private String sessionId;

    /** Identifier of the knowledge base the preview was built for. */
    private BigInteger knowledgeId;

    /** Path under which the uploaded file can be read back. */
    private String filePath;

    /** File name as supplied by the client. */
    private String fileName;

    /** Source format reported by the analysis. */
    private String sourceFormat;

    /** Strategy configuration used to produce the chunks. */
    private StrategyConfig strategyConfig;

    /** Result of analyzing the file content. */
    private AnalysisResult analysis;

    /** Document entity prepared for persistence at commit time. */
    private Document document;

    /** Chunk entities prepared for persistence; starts out as an empty, mutable list. */
    private List<DocumentChunk> documentChunks = new ArrayList<>();

    /** Chunks shown in the preview; starts out as an empty, mutable list. */
    private List<RagChunk> previewChunks = new ArrayList<>();

    /** Creation time of the session. */
    private Date createdAt;

    /** @return the session identifier */
    public String getSessionId() {
        return this.sessionId;
    }

    /** @param sessionId the session identifier */
    public void setSessionId(String sessionId) {
        this.sessionId = sessionId;
    }

    /** @return the knowledge base identifier */
    public BigInteger getKnowledgeId() {
        return this.knowledgeId;
    }

    /** @param knowledgeId the knowledge base identifier */
    public void setKnowledgeId(BigInteger knowledgeId) {
        this.knowledgeId = knowledgeId;
    }

    /** @return the storage path of the file */
    public String getFilePath() {
        return this.filePath;
    }

    /** @param filePath the storage path of the file */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /** @return the file name */
    public String getFileName() {
        return this.fileName;
    }

    /** @param fileName the file name */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** @return the source format */
    public String getSourceFormat() {
        return this.sourceFormat;
    }

    /** @param sourceFormat the source format */
    public void setSourceFormat(String sourceFormat) {
        this.sourceFormat = sourceFormat;
    }

    /** @return the strategy configuration */
    public StrategyConfig getStrategyConfig() {
        return this.strategyConfig;
    }

    /** @param strategyConfig the strategy configuration */
    public void setStrategyConfig(StrategyConfig strategyConfig) {
        this.strategyConfig = strategyConfig;
    }

    /** @return the analysis result */
    public AnalysisResult getAnalysis() {
        return this.analysis;
    }

    /** @param analysis the analysis result */
    public void setAnalysis(AnalysisResult analysis) {
        this.analysis = analysis;
    }

    /** @return the prepared document entity */
    public Document getDocument() {
        return this.document;
    }

    /** @param document the prepared document entity */
    public void setDocument(Document document) {
        this.document = document;
    }

    /** @return the prepared chunk entities */
    public List<DocumentChunk> getDocumentChunks() {
        return this.documentChunks;
    }

    /** @param documentChunks the prepared chunk entities */
    public void setDocumentChunks(List<DocumentChunk> documentChunks) {
        this.documentChunks = documentChunks;
    }

    /** @return the preview chunks */
    public List<RagChunk> getPreviewChunks() {
        return this.previewChunks;
    }

    /** @param previewChunks the preview chunks */
    public void setPreviewChunks(List<RagChunk> previewChunks) {
        this.previewChunks = previewChunks;
    }

    /** @return the creation time */
    public Date getCreatedAt() {
        return this.createdAt;
    }

    /** @param createdAt the creation time */
    public void setCreatedAt(Date createdAt) {
        this.createdAt = createdAt;
    }
}
}

View File

@@ -0,0 +1,21 @@
package tech.easyflow.ai.documentimport;
/**
 * String constants used by the document import flow: the cache key prefix for
 * preview sessions and the option keys read from or written to option maps.
 */
public final class DocumentImportKeys {

    /** Prefix prepended to a preview session id to form its cache key. */
    public static final String CACHE_KEY_PREFIX = "easyflow:document:import:preview:";

    // ---- splitter option keys (KEY_SPLITTER_*) ----

    /** Option key of the default strategy code. */
    public static final String KEY_SPLITTER_DEFAULT_STRATEGY = "splitter.defaultStrategyCode";

    /** Option key of the auto-recommend switch. */
    public static final String KEY_SPLITTER_AUTO_RECOMMEND_ENABLED = "splitter.autoRecommendEnabled";

    /** Option key of the fallback strategy code. */
    public static final String KEY_SPLITTER_FALLBACK_STRATEGY = "splitter.fallbackStrategyCode";

    /** Option key of the per-strategy profile map. */
    public static final String KEY_SPLITTER_STRATEGY_PROFILES = "splitter.strategyProfiles";

    // ---- document option keys (KEY_DOCUMENT_*) ----

    /** Option key of the strategy code applied to a document. */
    public static final String KEY_DOCUMENT_STRATEGY_CODE = "splitter.strategyCode";

    /** Option key of the strategy label applied to a document. */
    public static final String KEY_DOCUMENT_STRATEGY_LABEL = "splitter.strategyLabel";

    /** Option key of the strategy configuration snapshot. */
    public static final String KEY_DOCUMENT_STRATEGY_SNAPSHOT = "splitter.strategySnapshot";

    /** Option key of the analysis summary. */
    public static final String KEY_DOCUMENT_ANALYSIS_SUMMARY = "splitter.analysisSummary";

    /** Option key of the source file extension. */
    public static final String KEY_DOCUMENT_SOURCE_FILE_EXT = "splitter.sourceFileExt";

    /** Option key of the preview version marker. */
    public static final String KEY_DOCUMENT_PREVIEW_VERSION = "splitter.previewVersion";

    /** Constants holder; not instantiable. */
    private DocumentImportKeys() {
    }
}

View File

@@ -0,0 +1,45 @@
package tech.easyflow.ai.documentimport;
import com.alicp.jetcache.Cache;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.stereotype.Service;
import tech.easyflow.common.web.exceptions.BusinessException;
import java.time.Duration;
import java.util.concurrent.TimeUnit;
import java.util.UUID;
/**
 * Keeps preview sessions in the injected cache between the preview and the
 * commit step of a document import.
 */
@Service
public class DocumentImportPreviewService {

    /** Lifetime passed to the cache when a session is stored. */
    private static final Duration SESSION_TTL = Duration.ofMinutes(30);

    private final Cache<String, Object> defaultCache;

    /**
     * @param defaultCache cache bean qualified as {@code defaultCache}
     */
    public DocumentImportPreviewService(@Qualifier("defaultCache") Cache<String, Object> defaultCache) {
        this.defaultCache = defaultCache;
    }

    /**
     * Assigns a fresh identifier (a random UUID without dashes) to the session
     * and stores it in the cache with {@link #SESSION_TTL}.
     *
     * @param session the session to store; its {@code sessionId} is overwritten
     * @return the newly assigned session identifier
     */
    public String put(DocumentImportDtos.PreviewSession session) {
        final String newSessionId = UUID.randomUUID().toString().replace("-", "");
        session.setSessionId(newSessionId);
        final String cacheKey = buildKey(newSessionId);
        defaultCache.put(cacheKey, session, SESSION_TTL.toMinutes(), TimeUnit.MINUTES);
        return newSessionId;
    }

    /**
     * Looks up a session by identifier.
     *
     * @param sessionId the session identifier
     * @return the cached session
     * @throws BusinessException if nothing is cached under the identifier, or
     *                           the cached value is not a preview session
     */
    public DocumentImportDtos.PreviewSession getRequired(String sessionId) {
        final Object cachedValue = defaultCache.get(buildKey(sessionId));
        if (cachedValue instanceof DocumentImportDtos.PreviewSession) {
            return (DocumentImportDtos.PreviewSession) cachedValue;
        }
        throw new BusinessException("预览会话已失效,请重新生成预览");
    }

    /**
     * Removes the session cached under the given identifier.
     *
     * @param sessionId the session identifier
     */
    public void remove(String sessionId) {
        final String cacheKey = buildKey(sessionId);
        defaultCache.remove(cacheKey);
    }

    /** Builds the cache key for a session identifier. */
    private String buildKey(String sessionId) {
        return DocumentImportKeys.CACHE_KEY_PREFIX + sessionId;
    }
}

View File

@@ -3,6 +3,7 @@ package tech.easyflow.ai.service;
import tech.easyflow.ai.entity.Document;
import com.mybatisflex.core.paginate.Page;
import com.mybatisflex.core.service.IService;
import tech.easyflow.ai.documentimport.DocumentImportDtos;
import tech.easyflow.ai.entity.DocumentChunk;
import tech.easyflow.ai.entity.DocumentCollectionSplitParams;
import tech.easyflow.common.domain.Result;
@@ -25,4 +26,10 @@ public interface DocumentService extends IService<Document> {
Result textSplit(DocumentCollectionSplitParams documentCollectionSplitParams);
Result saveTextResult(List<DocumentChunk> documentChunks, Document document);
/**
 * Analyzes the uploaded files of an import request.
 *
 * @param request the knowledge base identifier and the files to analyze
 * @return a result wrapping the per-file analysis items
 */
Result<DocumentImportDtos.AnalyzeResponse> analyzeImport(DocumentImportDtos.AnalyzeRequest request);
/**
 * Builds a chunk preview for each file of an import request.
 *
 * @param request the knowledge base identifier and the files (with strategy configuration) to preview
 * @return a result wrapping the per-file preview results
 */
Result<DocumentImportDtos.PreviewResponse> previewImport(DocumentImportDtos.PreviewRequest request);
/**
 * Commits previously generated previews, identified by their preview session ids.
 *
 * @param request the knowledge base identifier and the preview session ids to commit
 * @return a result wrapping the per-session commit results
 */
Result<DocumentImportDtos.CommitResponse> commitImport(DocumentImportDtos.CommitRequest request);
}

View File

@@ -12,6 +12,12 @@ import com.easyagents.core.model.embedding.EmbeddingOptions;
import com.easyagents.core.store.DocumentStore;
import com.easyagents.core.store.StoreOptions;
import com.easyagents.core.store.StoreResult;
import com.easyagents.rag.core.RagChunk;
import com.easyagents.rag.core.RagDefaults;
import com.easyagents.rag.core.RagStrategyCodes;
import com.easyagents.rag.ingestion.RagIngestionService;
import com.easyagents.rag.ingestion.model.AnalysisResult;
import com.easyagents.rag.ingestion.model.StrategyConfig;
import com.easyagents.search.engine.service.DocumentSearcher;
import com.mybatisflex.core.keygen.impl.FlexIDKeyGenerator;
import com.mybatisflex.core.paginate.Page;
@@ -24,6 +30,9 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import tech.easyflow.ai.config.SearcherFactory;
import tech.easyflow.ai.documentimport.DocumentImportDtos;
import tech.easyflow.ai.documentimport.DocumentImportKeys;
import tech.easyflow.ai.documentimport.DocumentImportPreviewService;
import tech.easyflow.ai.entity.*;
import tech.easyflow.ai.mapper.DocumentChunkMapper;
import tech.easyflow.ai.mapper.DocumentMapper;
@@ -42,6 +51,7 @@ import javax.annotation.Resource;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.math.BigDecimal;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
@@ -81,6 +91,12 @@ public class DocumentServiceImpl extends ServiceImpl<DocumentMapper, Document> i
@Autowired
private SearcherFactory searcherFactory;
@Autowired
private RagIngestionService ragIngestionService;
@Autowired
private DocumentImportPreviewService documentImportPreviewService;
@Override
public Page<Document> getDocumentList(String knowledgeId, int pageSize, int pageNum, String fileName) {
QueryWrapper queryWrapper=QueryWrapper.create()
@@ -250,23 +266,397 @@ public class DocumentServiceImpl extends ServiceImpl<DocumentMapper, Document> i
return Result.fail(1, "切割结果无有效文本,无法进行向量化");
}
Boolean result = storeDocument(document, validChunks);
if (result) {
this.getMapper().insert(document);
AtomicInteger sort = new AtomicInteger(1);
validChunks.forEach(item -> {
item.setDocumentCollectionId(document.getCollectionId());
item.setSorting(sort.get());
item.setDocumentId(document.getId());
sort.getAndIncrement();
documentChunkService.save(item);
});
StoreExecutionContext storeContext = prepareStoreContext(document);
storeDocumentChunks(storeContext, validChunks);
try {
persistDocumentWithChunks(document, validChunks);
updateKnowledgeAfterStore(storeContext);
return Result.ok();
} catch (Exception e) {
cleanupPersistedDocument(document);
rollbackStoredChunks(storeContext, validChunks);
Log.error("保存文档失败: documentId={}, title={}", document.getId(), document.getTitle(), e);
throw new BusinessException("保存失败:" + e.getMessage());
}
return Result.fail(1, "保存失败");
}
/**
 * Prepares a store context for the document, stores the chunks through it and
 * then runs the post-store knowledge update.
 *
 * @param entity         the document whose chunks are stored
 * @param documentChunks the chunks to store
 * @return always {@code true}; failures surface as exceptions from the helpers
 */
protected Boolean storeDocument(Document entity, List<DocumentChunk> documentChunks) {
    final StoreExecutionContext context = prepareStoreContext(entity);
    storeDocumentChunks(context, documentChunks);
    updateKnowledgeAfterStore(context);
    return Boolean.TRUE;
}
/**
 * Analyzes every file of the request and resolves the strategy configuration
 * for it from the knowledge base settings and the analysis result.
 *
 * @param request the knowledge base identifier and the files to analyze
 * @return a successful result wrapping one item per file plus the item count
 * @throws BusinessException if the knowledge base is rejected or no files were supplied
 */
@Override
public Result<DocumentImportDtos.AnalyzeResponse> analyzeImport(DocumentImportDtos.AnalyzeRequest request) {
    DocumentCollection knowledge = assertDocumentCollection(request.getKnowledgeId());
    List<DocumentImportDtos.FileItem> uploadedFiles = request.getFiles();
    if (uploadedFiles == null || uploadedFiles.isEmpty()) {
        throw new BusinessException("请先上传文件");
    }

    List<DocumentImportDtos.AnalyzeItem> analyzedItems = new ArrayList<>();
    for (DocumentImportDtos.FileItem uploaded : uploadedFiles) {
        AnalysisResult analysis = analyzeSingleFile(uploaded.getFilePath(), uploaded.getFileName());

        DocumentImportDtos.AnalyzeItem analyzed = new DocumentImportDtos.AnalyzeItem();
        analyzed.setFilePath(uploaded.getFilePath());
        analyzed.setFileName(uploaded.getFileName());
        analyzed.setAnalysis(analysis);
        // No request-level override at this stage, hence the null request config.
        analyzed.setStrategyConfig(resolveStrategyConfig(knowledge, null, analysis));
        analyzedItems.add(analyzed);
    }

    DocumentImportDtos.AnalyzeResponse body = new DocumentImportDtos.AnalyzeResponse();
    body.setItems(analyzedItems);
    body.setTotal(analyzedItems.size());
    return Result.ok(body);
}
/**
 * Builds a preview session for every file of the request, stores it in the
 * preview cache and reports the generated chunks.
 *
 * @param request the knowledge base identifier and the files to preview
 * @return a successful result wrapping one preview per file plus file and chunk totals
 * @throws BusinessException if the knowledge base is rejected or no files were supplied
 */
@Override
public Result<DocumentImportDtos.PreviewResponse> previewImport(DocumentImportDtos.PreviewRequest request) {
    DocumentCollection knowledge = assertDocumentCollection(request.getKnowledgeId());
    List<DocumentImportDtos.PreviewFileRequest> requestedFiles = request.getFiles();
    if (requestedFiles == null || requestedFiles.isEmpty()) {
        throw new BusinessException("请先上传文件");
    }

    List<DocumentImportDtos.PreviewFileResult> previews = new ArrayList<>();
    int chunkTotal = 0;
    for (DocumentImportDtos.PreviewFileRequest requested : requestedFiles) {
        DocumentImportDtos.PreviewSession session = buildPreviewSession(knowledge, requested);
        String sessionId = documentImportPreviewService.put(session);

        String strategyCode = session.getStrategyConfig().getStrategyCode();
        List<RagChunk> sessionChunks = session.getPreviewChunks();

        DocumentImportDtos.PreviewFileResult preview = new DocumentImportDtos.PreviewFileResult();
        preview.setPreviewSessionId(sessionId);
        preview.setFilePath(requested.getFilePath());
        preview.setFileName(requested.getFileName());
        preview.setStrategyCode(strategyCode);
        preview.setStrategyLabel(ragIngestionService.toStrategyLabel(strategyCode));
        preview.setAnalysis(session.getAnalysis());
        preview.setChunks(sessionChunks);
        preview.setTotalChunks(sessionChunks.size());
        preview.setTotalWarnings(countWarnings(sessionChunks));
        previews.add(preview);

        chunkTotal += sessionChunks.size();
    }

    DocumentImportDtos.PreviewResponse body = new DocumentImportDtos.PreviewResponse();
    body.setItems(previews);
    body.setTotalFiles(previews.size());
    body.setTotalChunks(chunkTotal);
    return Result.ok(body);
}
/**
 * Commits the preview sessions named in the request, one by one. A failure of
 * one session does not abort the others; it is logged and reported in that
 * session's result entry.
 *
 * @param request the knowledge base identifier and the preview session ids to commit
 * @return a successful result wrapping one entry per session plus total, success and error counts
 * @throws BusinessException if the knowledge base is rejected or no session ids were supplied
 */
@Override
public Result<DocumentImportDtos.CommitResponse> commitImport(DocumentImportDtos.CommitRequest request) {
    DocumentCollection knowledge = assertDocumentCollection(request.getKnowledgeId());
    List<String> previewSessionIds = request.getPreviewSessionIds();
    if (previewSessionIds == null || previewSessionIds.isEmpty()) {
        throw new BusinessException("请选择需要提交的预览结果");
    }
    List<DocumentImportDtos.CommitFileResult> results = new ArrayList<>(previewSessionIds.size());
    int successCount = 0;
    int errorCount = 0;
    for (String previewSessionId : previewSessionIds) {
        DocumentImportDtos.CommitFileResult result = new DocumentImportDtos.CommitFileResult();
        result.setPreviewSessionId(previewSessionId);
        // Declared outside the try so the file name can still be reported on failure.
        DocumentImportDtos.PreviewSession session = null;
        boolean committed = false;
        try {
            session = documentImportPreviewService.getRequired(previewSessionId);
            if (!Objects.equals(session.getKnowledgeId(), knowledge.getId())) {
                throw new BusinessException("预览会话与当前知识库不匹配");
            }
            commitSingleSession(session);
            result.setSuccess(true);
            result.setFileName(session.getFileName());
            result.setDocumentId(session.getDocument().getId());
            result.setChunkCount(session.getDocumentChunks().size());
            committed = true;
            successCount++;
        } catch (Exception e) {
            // Keep the batch going, but do not lose the stack trace.
            Log.error("提交导入预览失败: previewSessionId={}", previewSessionId, e);
            result.setSuccess(false);
            if (session != null) {
                result.setFileName(session.getFileName());
            }
            // Some exceptions (e.g. NullPointerException) carry no message.
            result.setReason(e.getMessage() != null ? e.getMessage() : e.toString());
            errorCount++;
        }
        if (committed) {
            // The document is already persisted; a cache eviction failure must not
            // turn a successful commit into a reported error.
            try {
                documentImportPreviewService.remove(previewSessionId);
            } catch (Exception e) {
                Log.error("清理预览会话失败: previewSessionId={}", previewSessionId, e);
            }
        }
        results.add(result);
    }
    DocumentImportDtos.CommitResponse response = new DocumentImportDtos.CommitResponse();
    response.setTotalFiles(results.size());
    response.setSuccessCount(successCount);
    response.setErrorCount(errorCount);
    response.setResults(results);
    return Result.ok(response);
}
/**
 * Commits one preview session: stamps the audit fields on the prepared
 * document, stores the chunks through the store context, then persists the
 * document with its chunks and runs the post-store knowledge update.
 *
 * <p>If persisting or the knowledge update fails, the failure is logged and
 * {@code cleanupPersistedDocument} and {@code rollbackStoredChunks} are invoked
 * before a {@link BusinessException} is thrown.
 *
 * @param session the preview session holding the prepared document and chunks
 * @throws BusinessException if persisting the document fails
 */
private void commitSingleSession(DocumentImportDtos.PreviewSession session) {
    Document document = session.getDocument();
    // One timestamp and one operator lookup, so created/modified agree exactly.
    Date now = new Date();
    BigInteger operatorId = BigInteger.valueOf(StpUtil.getLoginIdAsLong());
    document.setCreated(now);
    document.setModified(now);
    document.setCreatedBy(operatorId);
    document.setModifiedBy(operatorId);
    for (DocumentChunk chunk : session.getDocumentChunks()) {
        chunk.setDocumentId(document.getId());
        chunk.setDocumentCollectionId(document.getCollectionId());
    }
    StoreExecutionContext storeContext = prepareStoreContext(document);
    storeDocumentChunks(storeContext, session.getDocumentChunks());
    try {
        persistDocumentWithChunks(document, session.getDocumentChunks());
        updateKnowledgeAfterStore(storeContext);
    } catch (Exception e) {
        // Log before cleaning up: the rethrown exception carries only the message,
        // and a failing cleanup would otherwise hide the original cause entirely.
        Log.error("提交导入失败: documentId={}, title={}", document.getId(), document.getTitle(), e);
        cleanupPersistedDocument(document);
        rollbackStoredChunks(storeContext, session.getDocumentChunks());
        throw new BusinessException("提交导入失败:" + e.getMessage());
    }
}
/**
 * Analyzes one requested file, resolves its strategy configuration, splits it
 * into preview chunks and assembles the preview session, including the
 * document and chunk entities prepared for a later commit.
 *
 * @param knowledge   the target knowledge base
 * @param fileRequest the file and the strategy configuration requested for it
 * @return the assembled session (its session id is not set here)
 * @throws BusinessException if splitting produced no chunks
 */
private DocumentImportDtos.PreviewSession buildPreviewSession(DocumentCollection knowledge,
                                                              DocumentImportDtos.PreviewFileRequest fileRequest) {
    AnalysisResult analysis = analyzeSingleFile(fileRequest.getFilePath(), fileRequest.getFileName());
    StrategyConfig effectiveConfig = resolveStrategyConfig(knowledge, fileRequest.getStrategyConfig(), analysis);

    List<RagChunk> splitChunks = ragIngestionService.split(analysis, effectiveConfig);
    if (splitChunks.isEmpty()) {
        throw new BusinessException("未生成有效分块,请调整策略后重试");
    }

    // The same generator instance issues the document id and all chunk ids.
    FlexIDKeyGenerator idGenerator = new FlexIDKeyGenerator();
    Document previewDocument = buildPreviewDocument(idGenerator, knowledge, fileRequest, analysis, effectiveConfig);
    List<DocumentChunk> chunkEntities = buildDocumentChunks(idGenerator, previewDocument, splitChunks);

    DocumentImportDtos.PreviewSession previewSession = new DocumentImportDtos.PreviewSession();
    previewSession.setKnowledgeId(knowledge.getId());
    previewSession.setFilePath(fileRequest.getFilePath());
    previewSession.setFileName(fileRequest.getFileName());
    previewSession.setSourceFormat(analysis.getSourceFormat());
    previewSession.setStrategyConfig(effectiveConfig);
    previewSession.setAnalysis(analysis);
    previewSession.setDocument(previewDocument);
    previewSession.setDocumentChunks(chunkEntities);
    previewSession.setPreviewChunks(splitChunks);
    previewSession.setCreatedAt(new Date());
    return previewSession;
}
/**
 * Creates the document entity for a preview: assigns a generated id, copies
 * the file and analysis data and records the applied strategy in the options.
 *
 * @param flexIDKeyGenerator generator used for the document id
 * @param knowledge          the target knowledge base
 * @param fileRequest        the previewed file
 * @param analysis           the analysis result of the file
 * @param strategyConfig     the resolved strategy configuration
 * @return the populated, not yet persisted document
 */
private Document buildPreviewDocument(FlexIDKeyGenerator flexIDKeyGenerator,
                                      DocumentCollection knowledge,
                                      DocumentImportDtos.PreviewFileRequest fileRequest,
                                      AnalysisResult analysis,
                                      StrategyConfig strategyConfig) {
    Document previewDocument = new Document();
    Object generatedId = flexIDKeyGenerator.generate(previewDocument, null);
    previewDocument.setId(new BigInteger(String.valueOf(generatedId)));
    previewDocument.setCollectionId(knowledge.getId());
    previewDocument.setDocumentType(analysis.getSourceFormat());
    previewDocument.setDocumentPath(fileRequest.getFilePath());
    previewDocument.setTitle(fileRequest.getFileName());
    previewDocument.setContent(analysis.getNormalizedContent());
    previewDocument.setCreated(new Date());
    previewDocument.setModified(new Date());
    previewDocument.setModifiedBy(BigInteger.valueOf(StpUtil.getLoginIdAsLong()));

    // Insertion order is kept by the LinkedHashMap.
    Map<String, Object> documentOptions = new LinkedHashMap<>();
    documentOptions.put(DocumentImportKeys.KEY_DOCUMENT_STRATEGY_CODE,
            strategyConfig.getStrategyCode());
    documentOptions.put(DocumentImportKeys.KEY_DOCUMENT_STRATEGY_LABEL,
            ragIngestionService.toStrategyLabel(strategyConfig.getStrategyCode()));
    documentOptions.put(DocumentImportKeys.KEY_DOCUMENT_STRATEGY_SNAPSHOT,
            strategyConfigToMap(strategyConfig));
    documentOptions.put(DocumentImportKeys.KEY_DOCUMENT_ANALYSIS_SUMMARY,
            analysis.getFeatures());
    documentOptions.put(DocumentImportKeys.KEY_DOCUMENT_SOURCE_FILE_EXT,
            analysis.getSourceFormat());
    documentOptions.put(DocumentImportKeys.KEY_DOCUMENT_PREVIEW_VERSION, "v1");
    previewDocument.setOptions(documentOptions);
    return previewDocument;
}
/**
 * Converts preview chunks into chunk entities for the given document. Each
 * entity gets a generated id, the document and collection ids, the chunk
 * content, a 1-based sort position and an options map that starts from the
 * preview chunk's own options and adds its descriptive attributes.
 *
 * @param flexIDKeyGenerator generator used for the chunk ids
 * @param document           the document the chunks belong to
 * @param previewChunks      the preview chunks to convert
 * @return the chunk entities, in the order of {@code previewChunks}
 */
private List<DocumentChunk> buildDocumentChunks(FlexIDKeyGenerator flexIDKeyGenerator,
                                                Document document,
                                                List<RagChunk> previewChunks) {
    List<DocumentChunk> chunks = new ArrayList<>(previewChunks.size());
    for (int i = 0; i < previewChunks.size(); i++) {
        RagChunk previewChunk = previewChunks.get(i);
        DocumentChunk chunk = new DocumentChunk();
        chunk.setId(new BigInteger(String.valueOf(flexIDKeyGenerator.generate(chunk, null))));
        chunk.setDocumentId(document.getId());
        chunk.setDocumentCollectionId(document.getCollectionId());
        chunk.setContent(previewChunk.getContent());
        chunk.setSorting(i + 1);
        Map<String, Object> options = new LinkedHashMap<>();
        // The LinkedHashMap copy constructor throws NullPointerException on a null
        // source, so a chunk without an options map is treated as having none.
        if (previewChunk.getOptions() != null) {
            options.putAll(previewChunk.getOptions());
        }
        options.put("chunkType", previewChunk.getChunkType());
        options.put("sourceLabel", previewChunk.getSourceLabel());
        options.put("headingPath", previewChunk.getHeadingPath());
        options.put("charCount", previewChunk.getCharCount());
        options.put("tokenEstimate", previewChunk.getTokenEstimate());
        options.put("qaQuestion", previewChunk.getQuestion());
        options.put("qaAnswer", previewChunk.getAnswer());
        options.put("partNo", previewChunk.getPartNo());
        options.put("partTotal", previewChunk.getPartTotal());
        options.put("warnings", previewChunk.getWarnings());
        chunk.setOptions(options);
        chunks.add(chunk);
    }
    return chunks;
}
/**
 * Validates the file type, reads the file content as text and passes it to
 * the ingestion service for analysis.
 *
 * @param filePath storage path of the file
 * @param fileName name of the file
 * @return the analysis result
 * @throws BusinessException if the file type is not supported or the file cannot be read
 */
private AnalysisResult analyzeSingleFile(String filePath, String fileName) {
    final String extension = normalizeFileExtension(fileName, filePath);
    // Reject unsupported types before touching storage.
    assertSupportedImportFile(extension);
    final String text = readFileContent(filePath, fileName);
    return ragIngestionService.analyze(text, extension);
}
/**
 * Opens the stored file and extracts its text.
 *
 * @param filePath storage path of the file
 * @param fileName name of the file, forwarded to the text extractor
 * @return the extracted text
 * @throws BusinessException if an {@link IOException} occurs while reading
 */
private String readFileContent(String filePath, String fileName) {
    try (InputStream source = storageService.readStream(filePath)) {
        String extractedText = File2TextUtil.readFromStream(source, fileName, null);
        return extractedText;
    } catch (IOException ioFailure) {
        Log.error("读取导入文件失败: filePath={}, fileName={}", filePath, fileName, ioFailure);
        throw new BusinessException("文件解析失败:" + ioFailure.getMessage());
    }
}
/**
 * Ensures the extension is one of {@code pdf}, {@code docx}, {@code txt} or
 * {@code md}. The comparison is exact, so callers pass a lower-cased value.
 *
 * @param fileExt the file extension to check; {@code null} is rejected
 * @throws BusinessException if the extension is not supported
 */
private void assertSupportedImportFile(String fileExt) {
    boolean supported = "pdf".equals(fileExt)
            || "docx".equals(fileExt)
            || "txt".equals(fileExt)
            || "md".equals(fileExt);
    if (supported) {
        return;
    }
    throw new BusinessException("当前仅支持 pdf/docx/txt/md 文档导入");
}
/**
 * Derives the lower-cased file extension, preferring the file name and using
 * the file path when the name has no text.
 *
 * @param fileName name of the file
 * @param filePath storage path of the file
 * @return the lower-cased extension, or an empty string when none is found
 */
private String normalizeFileExtension(String fileName, String filePath) {
    String candidate = fileName;
    if (!StringUtil.hasText(fileName)) {
        candidate = filePath;
    }
    String extension = FileUtil.getFileTypeByExtension(candidate);
    if (extension == null) {
        return "";
    }
    return extension.toLowerCase(Locale.ROOT);
}
/**
 * Loads the knowledge base and verifies it can receive document uploads.
 *
 * @param knowledgeId identifier of the knowledge base
 * @return the loaded knowledge base
 * @throws BusinessException if it does not exist or is an FAQ collection
 */
private DocumentCollection assertDocumentCollection(BigInteger knowledgeId) {
    final DocumentCollection collection = knowledgeService.getById(knowledgeId);
    if (collection == null) {
        throw new BusinessException("知识库不存在");
    } else if (collection.isFaqCollection()) {
        throw new BusinessException("FAQ知识库不支持文档上传");
    }
    return collection;
}
/**
 * Resolves the effective strategy configuration for one file from the
 * knowledge base options, an optional request-level configuration and the
 * analysis result.
 *
 * <p>The strategy code is chosen first (request code, else the code of the
 * default profile, with AUTO/blank replaced by the recommendation or the
 * configured default, and finally the fallback). Parameters are then layered:
 * base config, then the profile of the chosen strategy, then the request
 * config. Non-positive chunk size, negative overlap and non-positive markdown
 * level are replaced by the {@code RagDefaults} values.
 *
 * @param knowledge      the knowledge base whose options are read
 * @param requestConfig  request-level configuration, may be {@code null}
 * @param analysisResult analysis result providing the recommended strategy code
 * @return the resolved configuration
 */
private StrategyConfig resolveStrategyConfig(DocumentCollection knowledge,
StrategyConfig requestConfig,
AnalysisResult analysisResult) {
// A knowledge base without options behaves like one with an empty options map.
Map<String, Object> options = knowledge.getOptions() == null
? Collections.emptyMap()
: knowledge.getOptions();
String recommended = analysisResult.getRecommendedStrategyCode();
String defaultStrategyCode = asString(options.get(DocumentImportKeys.KEY_SPLITTER_DEFAULT_STRATEGY));
String fallbackStrategyCode = asString(options.get(DocumentImportKeys.KEY_SPLITTER_FALLBACK_STRATEGY));
// Second argument is presumably the value used when the option is absent — TODO confirm against asBoolean.
Boolean autoRecommendEnabled = asBoolean(options.get(DocumentImportKeys.KEY_SPLITTER_AUTO_RECOMMEND_ENABLED), true);
// Base layer: the profile stored for the default strategy, else library defaults.
StrategyConfig config = readProfileConfig(options, defaultStrategyCode);
if (config == null) {
config = StrategyConfig.defaults();
}
// An explicit strategy code on the request wins over the base layer's code.
String requestedStrategyCode = requestConfig == null ? null : requestConfig.getStrategyCode();
String strategyCode = StringUtil.hasText(requestedStrategyCode)
? requestedStrategyCode
: config.getStrategyCode();
// Blank or AUTO: use the recommendation when auto-recommend is on; otherwise
// the configured default, falling back to the recommendation if none is set.
if (!StringUtil.hasText(strategyCode) || RagStrategyCodes.AUTO.equals(strategyCode)) {
strategyCode = Boolean.TRUE.equals(autoRecommendEnabled)
? recommended
: (StringUtil.hasText(defaultStrategyCode) ? defaultStrategyCode : recommended);
}
// Still blank (e.g. no recommendation): configured fallback, else paragraph-length.
if (!StringUtil.hasText(strategyCode)) {
strategyCode = StringUtil.hasText(fallbackStrategyCode)
? fallbackStrategyCode
: RagStrategyCodes.PARAGRAPH_LENGTH;
}
// Overlay the stored profile of the strategy that was actually chosen.
StrategyConfig profileConfig = readProfileConfig(options, strategyCode);
if (profileConfig != null) {
mergeStrategyConfig(config, profileConfig);
}
// NOTE(review): every non-null field of the request config overrides the
// profile here. If the request object was pre-populated from
// StrategyConfig.defaults(), those defaults win over the stored profile —
// confirm this is intended.
if (requestConfig != null) {
mergeStrategyConfig(config, requestConfig);
}
// The merges above may have copied a different code (e.g. AUTO); re-apply the resolved one.
config.setStrategyCode(strategyCode);
if (config.getChunkSize() == null || config.getChunkSize() <= 0) {
config.setChunkSize(RagDefaults.CHUNK_SIZE);
}
// NOTE(review): overlap is not checked against chunk size here — verify the splitter tolerates overlap >= chunkSize.
if (config.getOverlapSize() == null || config.getOverlapSize() < 0) {
config.setOverlapSize(RagDefaults.OVERLAP_SIZE);
}
if (config.getMdSplitterLevel() == null || config.getMdSplitterLevel() <= 0) {
config.setMdSplitterLevel(RagDefaults.MD_SPLITTER_LEVEL);
}
return config;
}
/**
 * Reads the stored profile of one strategy from the knowledge base options.
 *
 * @param options      the knowledge base options
 * @param strategyCode the strategy whose profile is wanted
 * @return a configuration built from the profile, or {@code null} when the
 *         code has no text, the profiles entry is not a map, or the strategy
 *         has no map-valued profile
 */
@SuppressWarnings("unchecked")
private StrategyConfig readProfileConfig(Map<String, Object> options, String strategyCode) {
    if (!StringUtil.hasText(strategyCode)) {
        return null;
    }
    Object allProfiles = options.get(DocumentImportKeys.KEY_SPLITTER_STRATEGY_PROFILES);
    // Values come from an untyped options map, hence the instanceof checks before each cast.
    Object profileEntry = allProfiles instanceof Map
            ? ((Map<String, Object>) allProfiles).get(strategyCode)
            : null;
    if (!(profileEntry instanceof Map)) {
        return null;
    }
    Map<String, Object> profileValues = (Map<String, Object>) profileEntry;

    StrategyConfig profile = StrategyConfig.defaults();
    profile.setStrategyCode(strategyCode);
    // Each numeric field passes the current value of the defaults as second argument.
    profile.setChunkSize(asInteger(profileValues.get("chunkSize"), profile.getChunkSize()));
    profile.setOverlapSize(asInteger(profileValues.get("overlapSize"), profile.getOverlapSize()));
    profile.setRegex(asString(profileValues.get("regex")));
    profile.setRowsPerChunk(asInteger(profileValues.get("rowsPerChunk"), profile.getRowsPerChunk()));
    profile.setMdSplitterLevel(asInteger(profileValues.get("mdSplitterLevel"), profile.getMdSplitterLevel()));
    return profile;
}
/**
 * Overlays the populated fields of {@code source} onto {@code target}.
 *
 * <p>A field is copied only when it carries a value: numeric fields when non-null, textual
 * fields when they contain text. Fields that are absent in {@code source} leave
 * {@code target} untouched.
 *
 * @param target configuration that receives the values
 * @param source configuration to copy from; {@code null} is a no-op
 */
private void mergeStrategyConfig(StrategyConfig target, StrategyConfig source) {
    if (source != null) {
        if (StringUtil.hasText(source.getStrategyCode())) {
            target.setStrategyCode(source.getStrategyCode());
        }

        Integer chunkSize = source.getChunkSize();
        if (chunkSize != null) {
            target.setChunkSize(chunkSize);
        }

        Integer overlapSize = source.getOverlapSize();
        if (overlapSize != null) {
            target.setOverlapSize(overlapSize);
        }

        if (StringUtil.hasText(source.getRegex())) {
            target.setRegex(source.getRegex());
        }

        Integer rowsPerChunk = source.getRowsPerChunk();
        if (rowsPerChunk != null) {
            target.setRowsPerChunk(rowsPerChunk);
        }

        Integer mdSplitterLevel = source.getMdSplitterLevel();
        if (mdSplitterLevel != null) {
            target.setMdSplitterLevel(mdSplitterLevel);
        }
    }
}
/**
 * Flattens a {@link StrategyConfig} into an insertion-ordered map.
 *
 * <p>Every field is written, including those whose value is {@code null}.
 *
 * @param strategyConfig configuration to flatten; must not be {@code null}
 * @return a new mutable map with the keys {@code strategyCode}, {@code chunkSize},
 *         {@code overlapSize}, {@code regex}, {@code rowsPerChunk} and {@code mdSplitterLevel}
 */
private Map<String, Object> strategyConfigToMap(StrategyConfig strategyConfig) {
    final Map<String, Object> fields = new LinkedHashMap<>();

    fields.put("strategyCode", strategyConfig.getStrategyCode());
    fields.put("chunkSize", strategyConfig.getChunkSize());
    fields.put("overlapSize", strategyConfig.getOverlapSize());
    fields.put("regex", strategyConfig.getRegex());
    fields.put("rowsPerChunk", strategyConfig.getRowsPerChunk());
    fields.put("mdSplitterLevel", strategyConfig.getMdSplitterLevel());

    return fields;
}
/**
 * Sums the number of warnings attached to the given chunks.
 *
 * @param chunks chunks to inspect; a chunk whose warning list is {@code null} counts as zero
 * @return total number of warnings across all chunks
 */
private int countWarnings(List<RagChunk> chunks) {
    return chunks.stream()
            .mapToInt(chunk -> chunk.getWarnings() == null ? 0 : chunk.getWarnings().size())
            .sum();
}
/**
 * Resolves the collaborators needed to store a document's chunks and bundles them into a
 * {@link StoreExecutionContext}: the owning knowledge base, its embedding model, the
 * document store, the store options and, when the search engine is enabled, a searcher.
 *
 * @param entity document whose collection id identifies the knowledge base
 * @return the assembled store context
 * @throws BusinessException if the knowledge base does not exist, is an FAQ collection,
 *         its document store cannot be created, or no model is configured for it
 */
// NOTE(review): this span is rendered diff output - it contains "@@" hunk headers and both the
// removed and the added variant of some lines, so parts of the method body are not visible here.
private StoreExecutionContext prepareStoreContext(Document entity) {
DocumentCollection knowledge = knowledgeService.getById(entity.getCollectionId());
if (knowledge == null) {
throw new BusinessException("知识库不存在");
@@ -274,23 +664,22 @@ public class DocumentServiceImpl extends ServiceImpl<DocumentMapper, Document> i
if (knowledge.isFaqCollection()) {
throw new BusinessException("FAQ知识库不支持文档上传");
}
DocumentStore documentStore = null;
DocumentStore documentStore;
try {
documentStore = knowledge.toDocumentStore();
} catch (Exception e) {
Log.error(e.getMessage());
Log.error("向量库配置错误: knowledgeId={}", knowledge.getId(), e);
throw new BusinessException("向量数据库配置错误");
}
if (documentStore == null) {
throw new BusinessException("向量数据库配置错误");
}
// Set the embedding model
Model model = modelService.getModelInstance(knowledge.getVectorEmbedModelId());
if (model == null) {
throw new BusinessException("该知识库未配置大模型");
}
// Set the embedding model
EmbeddingModel embeddingModel = model.toEmbeddingModel();
documentStore.setEmbeddingModel(embeddingModel);
@@ -300,46 +689,152 @@ public class DocumentServiceImpl extends ServiceImpl<DocumentMapper, Document> i
embeddingOptions.setDimensions(knowledge.getDimensionOfVectorModel());
options.setEmbeddingOptions(embeddingOptions);
options.setIndexName(options.getCollectionName());
// The searcher stays null unless the knowledge base has the search engine enabled.
DocumentSearcher searcher = null;
if (knowledge.isSearchEngineEnabled()) {
searcher = searcherFactory.getSearcher((String) knowledge.getOptionsByKey(KEY_SEARCH_ENGINE_TYPE));
}
return new StoreExecutionContext(knowledge, model, embeddingModel, documentStore, options, searcher);
}
/**
 * Writes the given chunks to the document store and, when a searcher is present in the
 * context, adds them to the search engine as well.
 *
 * <p>Each chunk is converted to a {@code com.easyagents.core.document.Document} carrying
 * the chunk id and content.
 *
 * @param storeContext   context holding the document store, store options and optional searcher
 * @param documentChunks chunks to store
 * @throws BusinessException if the store call throws or reports an unsuccessful result
 */
// NOTE(review): this span is rendered diff output - removed and added lines are interleaved
// (e.g. two declarations of "result", two store calls), so it does not compile as shown.
private void storeDocumentChunks(StoreExecutionContext storeContext, List<DocumentChunk> documentChunks) {
List<com.easyagents.core.document.Document> documents = new ArrayList<>();
documentChunks.forEach(item -> {
com.easyagents.core.document.Document document = new com.easyagents.core.document.Document();
document.setId(item.getId());
document.setContent(item.getContent());
documents.add(document);
}
);
StoreResult result = null;
for (DocumentChunk item : documentChunks) {
com.easyagents.core.document.Document document = new com.easyagents.core.document.Document();
document.setId(item.getId());
document.setContent(item.getContent());
documents.add(document);
}
StoreResult result;
try {
result = documentStore.store(documents, options);
result = storeContext.documentStore.store(documents, storeContext.options);
} catch (Exception e) {
Log.error("Vector store failed: knowledgeId={}, collection={}, chunkCount={}",
knowledge.getId(), options.getCollectionName(), documents.size(), e);
storeContext.knowledge.getId(),
storeContext.options.getCollectionName(),
documents.size(),
e);
throw new BusinessException("向量过程中发生错误,错误信息为:" + e.getMessage());
}
if (result == null || !result.isSuccess()) {
Log.error("DocumentStore.store failed: " + result);
if (result == null || !result.isSuccess()) {
Log.error("DocumentStore.store failed: {}", result);
throw new BusinessException("DocumentStore.store failed");
}
if (knowledge.isSearchEngineEnabled()) {
// Obtain the search engine
DocumentSearcher searcher = searcherFactory.getSearcher((String) knowledge.getOptionsByKey(KEY_SEARCH_ENGINE_TYPE));
// Add to the search engine
documents.forEach(searcher::addDocument);
if (storeContext.searcher != null) {
for (com.easyagents.core.document.Document document : documents) {
storeContext.searcher.addDocument(document);
}
}
}
/**
 * Best-effort compensation that removes previously stored chunks from the document store
 * and, when a searcher is present, from the search engine.
 *
 * <p>This method never throws: every failure is logged and swallowed so that a rollback
 * problem cannot mask the error that triggered the rollback. The document-store delete and
 * each search-engine delete are isolated from one another, so a failure in one step does
 * not prevent the remaining cleanup from being attempted.
 *
 * @param storeContext   context holding the document store, store options and optional searcher
 * @param documentChunks chunks to remove; {@code null}, empty lists, {@code null} elements and
 *                       chunks without an id are ignored
 */
private void rollbackStoredChunks(StoreExecutionContext storeContext, List<DocumentChunk> documentChunks) {
    if (documentChunks == null || documentChunks.isEmpty()) {
        return;
    }

    List<BigInteger> chunkIds = new ArrayList<>(documentChunks.size());
    for (DocumentChunk chunk : documentChunks) {
        if (chunk != null && chunk.getId() != null) {
            chunkIds.add(chunk.getId());
        }
    }
    if (chunkIds.isEmpty()) {
        return;
    }

    try {
        storeContext.documentStore.delete(chunkIds, storeContext.options);
    } catch (Exception e) {
        Log.error("回滚向量文档失败: knowledgeId={}", storeContext.knowledge.getId(), e);
    }

    if (storeContext.searcher == null) {
        return;
    }
    // Delete per id inside its own try/catch so one failing id does not leave the rest behind.
    for (BigInteger chunkId : chunkIds) {
        try {
            storeContext.searcher.deleteDocument(chunkId);
        } catch (Exception e) {
            Log.error("回滚搜索引擎文档失败: knowledgeId={}, chunkId={}",
                    storeContext.knowledge.getId(), chunkId, e);
        }
    }
}
/**
 * Updates the knowledge base record after chunks have been stored.
 *
 * <p>Sets the {@code KEY_CAN_UPDATE_EMBEDDING_MODEL} option to {@code false} and, when the
 * knowledge base has no vector dimension recorded yet, persists the dimension reported by
 * {@code Model.getEmbeddingDimension} for the context's embedding model.
 *
 * @param storeContext context holding the knowledge base and its embedding model
 */
// NOTE(review): this span is rendered diff output - removed and added lines are interleaved
// (references to "entity", "knowledge" and "embeddingModel" belong to the removed variant).
private void updateKnowledgeAfterStore(StoreExecutionContext storeContext) {
DocumentCollection documentCollection = new DocumentCollection();
documentCollection.setId(entity.getCollectionId());
Map<String, Object> knowledgeOptions = knowledge.getOptions();
documentCollection.setId(storeContext.knowledge.getId());
// Work on a copy of the options (or a fresh map when none exist) before adding the flag.
Map<String, Object> knowledgeOptions = storeContext.knowledge.getOptions() == null
? new HashMap<>()
: new HashMap<>(storeContext.knowledge.getOptions());
knowledgeOptions.put(KEY_CAN_UPDATE_EMBEDDING_MODEL, false);
documentCollection.setOptions(knowledgeOptions);
knowledgeService.updateById(documentCollection);
if (knowledge.getDimensionOfVectorModel() == null) {
int dimension = Model.getEmbeddingDimension(embeddingModel);
knowledge.setDimensionOfVectorModel(dimension);
knowledgeService.updateById(knowledge);
if (storeContext.knowledge.getDimensionOfVectorModel() == null) {
int dimension = Model.getEmbeddingDimension(storeContext.embeddingModel);
// A separate update object carrying only the id and the dimension is sent to updateById.
DocumentCollection update = new DocumentCollection();
update.setId(storeContext.knowledge.getId());
update.setDimensionOfVectorModel(dimension);
knowledgeService.updateById(update);
}
}
/**
 * Inserts the document row and then saves each chunk linked to it.
 *
 * <p>Every chunk receives the document's collection id, the document id and a 1-based
 * sorting value that follows the order of {@code chunks}.
 *
 * @param document document to insert
 * @param chunks   chunks belonging to the document, in display order
 */
private void persistDocumentWithChunks(Document document, List<DocumentChunk> chunks) {
    this.getMapper().insert(document);

    int position = 0;
    for (DocumentChunk chunk : chunks) {
        position++;
        chunk.setDocumentCollectionId(document.getCollectionId());
        chunk.setDocumentId(document.getId());
        chunk.setSorting(position);
        documentChunkService.save(chunk);
    }
}
/**
 * Deletes a persisted document together with its chunk rows.
 *
 * <p>Chunks are removed first (matched by document id), then the document row itself.
 *
 * @param document document to remove; ignored when {@code null} or when it has no id
 */
private void cleanupPersistedDocument(Document document) {
    boolean persisted = document != null && document.getId() != null;
    if (persisted) {
        QueryWrapper chunksOfDocument = QueryWrapper.create()
                .eq(DocumentChunk::getDocumentId, document.getId());
        documentChunkMapper.deleteByQuery(chunksOfDocument);
        this.getMapper().deleteById(document.getId());
    }
}
/**
 * Converts an arbitrary value to its string form.
 *
 * @param value value to convert
 * @return {@code null} when {@code value} is {@code null}, otherwise {@code String.valueOf(value)}
 */
private String asString(Object value) {
    if (value == null) {
        return null;
    }
    return String.valueOf(value);
}
/**
 * Leniently converts a loosely-typed value to an {@link Integer}.
 *
 * <p>Numbers are narrowed with {@code intValue()}. Strings are trimmed and parsed as a
 * decimal integer. Anything that cannot be interpreted - {@code null}, blank or
 * non-numeric text, out-of-range numbers in text form, or other types - yields
 * {@code defaultValue} instead of raising an exception, so a single malformed stored
 * setting cannot abort the caller.
 *
 * @param value        value to convert; may be {@code null}
 * @param defaultValue value returned when {@code value} cannot be converted; may be {@code null}
 * @return the converted integer, or {@code defaultValue}
 */
private Integer asInteger(Object value, Integer defaultValue) {
    if (value instanceof Number) {
        return ((Number) value).intValue();
    }
    if (!(value instanceof String)) {
        // Covers null and every unsupported type.
        return defaultValue;
    }
    String text = ((String) value).trim();
    if (text.isEmpty()) {
        return defaultValue;
    }
    try {
        return Integer.parseInt(text);
    } catch (NumberFormatException ignored) {
        // Malformed numeric text is treated the same as a missing value.
        return defaultValue;
    }
}
/**
 * Converts a loosely-typed value to a {@link Boolean}.
 *
 * <p>Booleans are returned unchanged, numbers are {@code true} when their {@code intValue()}
 * is non-zero, and any other non-null value is parsed from its string form with
 * {@code Boolean.parseBoolean}.
 *
 * @param value        value to convert; may be {@code null}
 * @param defaultValue result used when {@code value} is {@code null}
 * @return the converted boolean
 */
private Boolean asBoolean(Object value, boolean defaultValue) {
    if (value instanceof Boolean) {
        return (Boolean) value;
    }
    if (value instanceof Number) {
        int numeric = ((Number) value).intValue();
        return numeric != 0;
    }
    return value == null ? defaultValue : Boolean.parseBoolean(String.valueOf(value));
}
/**
 * Holder for the collaborators used while storing a document's chunks. All fields are
 * final and assigned once in the constructor.
 */
// NOTE(review): the closing brace of this class is not visible in this diff rendering.
private static class StoreExecutionContext {
// Knowledge base the chunks belong to.
private final DocumentCollection knowledge;
// Model instance from which the embedding model was obtained.
private final Model model;
// Embedding model attached to the document store.
private final EmbeddingModel embeddingModel;
// Document store that chunks are written to and deleted from.
private final DocumentStore documentStore;
// Options passed along with document store calls.
private final StoreOptions options;
// Search engine adapter; null when the knowledge base has no search engine enabled.
private final DocumentSearcher searcher;
/**
 * Captures the given collaborators as-is; no validation or copying is performed.
 */
private StoreExecutionContext(DocumentCollection knowledge,
Model model,
EmbeddingModel embeddingModel,
DocumentStore documentStore,
StoreOptions options,
DocumentSearcher searcher) {
this.knowledge = knowledge;
this.model = model;
this.embeddingModel = embeddingModel;
this.documentStore = documentStore;
this.options = options;
this.searcher = searcher;
}
return true;
}
public DocumentSplitter getDocumentSplitter(DocumentCollectionSplitParams params) {