diff --git a/easy-agents-bom/pom.xml b/easy-agents-bom/pom.xml
index 6d6f348..99b7079 100644
--- a/easy-agents-bom/pom.xml
+++ b/easy-agents-bom/pom.xml
@@ -56,6 +56,30 @@
+
+ com.easyagents
+ easy-agents-rag-core
+
+
+
+ com.easyagents
+ easy-agents-rag-ingestion
+
+
+
+ com.easyagents
+ easy-agents-rag-ocr
+
+
+
+ com.easyagents
+ easy-agents-rag-enhance
+
+
+
+ com.easyagents
+ easy-agents-rag-retrieval
+
diff --git a/easy-agents-rag/TECH-PLAN.md b/easy-agents-rag/TECH-PLAN.md
new file mode 100644
index 0000000..edc6e84
--- /dev/null
+++ b/easy-agents-rag/TECH-PLAN.md
@@ -0,0 +1,102 @@
+# easy-agents-rag 技术规划
+
+## 目标
+
+`easy-agents-rag` 用于承载 Easy-Agents 的 RAG 领域能力,逐步将知识入库、文档预处理、分块、索引增强、检索增强等能力从业务工程中抽离,形成可复用的框架层模块。
+
+当前阶段先完成模块骨架建设,并优先承接本次文档导入链路中的预处理与分块能力迁移。
+
+## 模块规划
+
+### `easy-agents-rag-core`
+
+定位:RAG 域共享契约层。
+
+负责内容:
+- 通用常量与元数据 key
+- 结构类型、策略类型、chunk 类型定义
+- 少量稳定共享模型与接口
+
+不负责内容:
+- 具体 OCR 实现
+- 具体分块实现
+- 具体召回编排
+
+### `easy-agents-rag-ingestion`
+
+定位:入库前处理链路。
+
+负责内容:
+- 文本标准化与清洗
+- 文档结构分析
+- 拆分策略推荐
+- 文档分块与 chunk 元信息补全
+- 入库前质量控制
+
+当前迁移优先承接:
+- 文档结构分析
+- 章节/问答/段落分块
+- 自动推荐拆分策略
+
+### `easy-agents-rag-ocr`
+
+定位:OCR 与版面恢复能力。
+
+负责内容:
+- 图片/PDF OCR
+- 页面版面解析
+- 标题、段落、表格等结构恢复
+- PDF 到结构化文本或 Markdown 的转换
+
+### `easy-agents-rag-enhance`
+
+定位:索引前增强能力。
+
+负责内容:
+- 图增强
+- RAPTOR
+- parent-child chunk
+- window chunk
+- 摘要、关键词、标签等增强信息生成
+- 索引前的知识单元增强
+
+### `easy-agents-rag-retrieval`
+
+定位:查询侧增强与召回编排。
+
+负责内容:
+- query rewrite / expansion
+- hybrid recall 编排
+- metadata filter 策略
+- graph recall
+- rerank 编排
+- chunk merge / window expand / context assemble
+
+## 当前迁移范围
+
+本次优先迁移到 `easy-agents-rag-ingestion` 的能力:
+- 文档结构分析
+- 拆分策略推荐
+- 标题型 / QA 型 / 段落型分块
+
+本次不迁移的能力:
+- 业务侧预览会话
+- 控制器与接口 DTO
+- 业务库持久化
+- 前端导入页面
+
+这些能力继续留在业务工程,由业务层依赖 `easy-agents-rag` 提供的能力完成编排。
+
+## 后续演进
+
+后续演进顺序建议如下:
+
+1. 完成 `rag-ingestion` 首批能力迁移并稳定对外接口
+2. 补充 `rag-ocr`,接入 OCR 与版面恢复
+3. 补充 `rag-enhance`,支持图增强、RAPTOR、索引增强
+4. 补充 `rag-retrieval`,统一查询增强与召回后处理
+
+整体原则:
+- `easy-agents-core` 保持基础抽象
+- `easy-agents-rag` 聚合 RAG 领域实现
+- 业务工程只保留编排、持久化与产品层逻辑
diff --git a/easy-agents-rag/easy-agents-rag-core/pom.xml b/easy-agents-rag/easy-agents-rag-core/pom.xml
new file mode 100644
index 0000000..02187a8
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-core/pom.xml
@@ -0,0 +1,28 @@
+
+
+ 4.0.0
+
+
+ com.easyagents
+ easy-agents-rag
+ ${revision}
+
+
+ easy-agents-rag-core
+ easy-agents-rag-core
+
+
+ 8
+ 8
+ UTF-8
+
+
+
+
+ com.easyagents
+ easy-agents-core
+
+
+
diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunk.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunk.java
new file mode 100644
index 0000000..2621e44
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunk.java
@@ -0,0 +1,128 @@
+package com.easyagents.rag.core;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Mutable bean describing one chunk of a document inside the RAG domain.
+ *
+ * <p>The collection-valued properties ({@code headingPath}, {@code warnings},
+ * {@code options}) are never {@code null}: they start out empty and the setters
+ * replace a {@code null} argument with a fresh empty collection, so callers can
+ * always do {@code chunk.getOptions().put(...)} safely.
+ *
+ * <p>Instances are not thread-safe.
+ */
+public class RagChunk implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /** Identifier of the chunk. */
+    private String chunkId;
+    /** Kind of chunk; see {@code RagChunkTypes} for the known values. */
+    private String chunkType;
+    /** Human-readable label of where the chunk came from. */
+    private String sourceLabel;
+    /** Titles of the headings enclosing this chunk. */
+    private List<String> headingPath = new ArrayList<>();
+    /** Text of the chunk. */
+    private String content;
+    /** Question text, for question/answer chunks. */
+    private String question;
+    /** Answer text, for question/answer chunks. */
+    private String answer;
+    /** Number of characters in {@link #content}. */
+    private Integer charCount;
+    /** Estimated number of tokens in {@link #content}. */
+    private Integer tokenEstimate;
+    /** 1-based position of this chunk among the parts of one logical unit. */
+    private Integer partNo = 1;
+    /** Total number of parts the logical unit was split into. */
+    private Integer partTotal = 1;
+    /** Warnings attached to the chunk. NOTE(review): element type assumed to be String. */
+    private List<String> warnings = new ArrayList<>();
+    /** Free-form metadata; see {@code RagMetadataKeys} for the shared key names. */
+    private Map<String, Object> options = new LinkedHashMap<>();
+
+    public String getChunkId() {
+        return chunkId;
+    }
+
+    public void setChunkId(String chunkId) {
+        this.chunkId = chunkId;
+    }
+
+    public String getChunkType() {
+        return chunkType;
+    }
+
+    public void setChunkType(String chunkType) {
+        this.chunkType = chunkType;
+    }
+
+    public String getSourceLabel() {
+        return sourceLabel;
+    }
+
+    public void setSourceLabel(String sourceLabel) {
+        this.sourceLabel = sourceLabel;
+    }
+
+    /** @return the heading path, never {@code null} */
+    public List<String> getHeadingPath() {
+        return headingPath;
+    }
+
+    /** @param headingPath new heading path; {@code null} is stored as an empty list */
+    public void setHeadingPath(List<String> headingPath) {
+        this.headingPath = headingPath == null ? new ArrayList<String>() : headingPath;
+    }
+
+    public String getContent() {
+        return content;
+    }
+
+    public void setContent(String content) {
+        this.content = content;
+    }
+
+    public String getQuestion() {
+        return question;
+    }
+
+    public void setQuestion(String question) {
+        this.question = question;
+    }
+
+    public String getAnswer() {
+        return answer;
+    }
+
+    public void setAnswer(String answer) {
+        this.answer = answer;
+    }
+
+    public Integer getCharCount() {
+        return charCount;
+    }
+
+    public void setCharCount(Integer charCount) {
+        this.charCount = charCount;
+    }
+
+    public Integer getTokenEstimate() {
+        return tokenEstimate;
+    }
+
+    public void setTokenEstimate(Integer tokenEstimate) {
+        this.tokenEstimate = tokenEstimate;
+    }
+
+    public Integer getPartNo() {
+        return partNo;
+    }
+
+    public void setPartNo(Integer partNo) {
+        this.partNo = partNo;
+    }
+
+    public Integer getPartTotal() {
+        return partTotal;
+    }
+
+    public void setPartTotal(Integer partTotal) {
+        this.partTotal = partTotal;
+    }
+
+    /** @return the warnings, never {@code null} */
+    public List<String> getWarnings() {
+        return warnings;
+    }
+
+    /** @param warnings new warnings; {@code null} is stored as an empty list */
+    public void setWarnings(List<String> warnings) {
+        this.warnings = warnings == null ? new ArrayList<String>() : warnings;
+    }
+
+    /** @return the metadata map, never {@code null} */
+    public Map<String, Object> getOptions() {
+        return options;
+    }
+
+    /** @param options new metadata map; {@code null} is stored as an empty map */
+    public void setOptions(Map<String, Object> options) {
+        this.options = options == null ? new LinkedHashMap<String, Object>() : options;
+    }
+}
diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunkTypes.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunkTypes.java
new file mode 100644
index 0000000..39a60e6
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunkTypes.java
@@ -0,0 +1,11 @@
+package com.easyagents.rag.core;
+
+/**
+ * Values used for {@code RagChunk#getChunkType()}.
+ */
+public final class RagChunkTypes {
+
+    /** Chunk cut along a document heading. */
+    public static final String SECTION = "section";
+
+    /** Chunk holding one question together with its answer. */
+    public static final String QA_PAIR = "qa_pair";
+
+    /** Chunk cut from running text without heading information. */
+    public static final String PARAGRAPH = "paragraph";
+
+    private RagChunkTypes() {
+        // constants holder, not meant to be instantiated
+    }
+}
diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagDefaults.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagDefaults.java
new file mode 100644
index 0000000..a78c377
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagDefaults.java
@@ -0,0 +1,12 @@
+package com.easyagents.rag.core;
+
+/**
+ * Default numeric settings shared by the RAG modules.
+ */
+public final class RagDefaults {
+
+    /** Default maximum chunk length, in characters. */
+    public static final int CHUNK_SIZE = 512;
+
+    /** Default overlap between neighbouring chunks, in characters. */
+    public static final int OVERLAP_SIZE = 128;
+
+    /** Default Markdown splitter level. NOTE(review): presumably the heading depth to split at; confirm with the consumer. */
+    public static final int MD_SPLITTER_LEVEL = 2;
+
+    /** Default number of rows per chunk. NOTE(review): presumably for tabular sources; confirm with the consumer. */
+    public static final int ROWS_PER_CHUNK = 1;
+
+    private RagDefaults() {
+        // constants holder, not meant to be instantiated
+    }
+}
diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java
new file mode 100644
index 0000000..db86c63
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java
@@ -0,0 +1,20 @@
+package com.easyagents.rag.core;
+
+/**
+ * Key names for chunk metadata entries (for example the {@code options} map of
+ * {@code RagChunk}).
+ */
+public final class RagMetadataKeys {
+
+    // --- chunk identity / origin ---
+    public static final String CHUNK_TYPE = "chunkType";
+    public static final String SOURCE_LABEL = "sourceLabel";
+    public static final String HEADING_PATH = "headingPath";
+    public static final String PAGE_NO = "pageNo";
+
+    // --- size information ---
+    public static final String CHAR_COUNT = "charCount";
+    public static final String TOKEN_ESTIMATE = "tokenEstimate";
+
+    // --- question/answer chunks ---
+    public static final String QA_QUESTION = "qaQuestion";
+    public static final String QA_ANSWER = "qaAnswer";
+    public static final String QA_GROUP_ID = "qaGroupId";
+
+    // --- multi-part chunks ---
+    public static final String PART_NO = "partNo";
+    public static final String PART_TOTAL = "partTotal";
+
+    // --- diagnostics ---
+    public static final String WARNINGS = "warnings";
+
+    private RagMetadataKeys() {
+        // constants holder, not meant to be instantiated
+    }
+}
diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStrategyCodes.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStrategyCodes.java
new file mode 100644
index 0000000..aac676f
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStrategyCodes.java
@@ -0,0 +1,14 @@
+package com.easyagents.rag.core;
+
+/**
+ * Codes that select how a document is split into chunks.
+ */
+public final class RagStrategyCodes {
+
+    /** Let the framework pick the strategy recommended by the analysis. */
+    public static final String AUTO = "AUTO";
+
+    /** Split along Markdown headings. */
+    public static final String MARKDOWN_SECTION = "MARKDOWN_SECTION";
+
+    /** Split along outline-style (numbered / chapter) headings. */
+    public static final String OUTLINE_SECTION = "OUTLINE_SECTION";
+
+    /** Split into question/answer pairs. */
+    public static final String QA_PAIR = "QA_PAIR";
+
+    /** Split running text by length. */
+    public static final String PARAGRAPH_LENGTH = "PARAGRAPH_LENGTH";
+
+    /** Split on a caller-supplied regular expression. */
+    public static final String CUSTOM_REGEX = "CUSTOM_REGEX";
+
+    private RagStrategyCodes() {
+        // constants holder, not meant to be instantiated
+    }
+}
diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStructureTypes.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStructureTypes.java
new file mode 100644
index 0000000..eaba9b0
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStructureTypes.java
@@ -0,0 +1,12 @@
+package com.easyagents.rag.core;
+
+/**
+ * Names of the document structure kinds recognised in the RAG domain.
+ */
+public final class RagStructureTypes {
+
+    /** Document organised by Markdown headings. */
+    public static final String MARKDOWN_HEADING = "markdown_heading";
+
+    /** Document organised by outline-style headings. */
+    public static final String OUTLINE_SECTION = "outline_section";
+
+    /** Document organised as question/answer pairs. */
+    public static final String QA_PAIR = "qa_pair";
+
+    /** Document made of plain paragraphs. */
+    public static final String PLAIN_PARAGRAPH = "plain_paragraph";
+
+    private RagStructureTypes() {
+        // constants holder, not meant to be instantiated
+    }
+}
diff --git a/easy-agents-rag/easy-agents-rag-enhance/pom.xml b/easy-agents-rag/easy-agents-rag-enhance/pom.xml
new file mode 100644
index 0000000..3112175
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-enhance/pom.xml
@@ -0,0 +1,36 @@
+
+
+ 4.0.0
+
+
+ com.easyagents
+ easy-agents-rag
+ ${revision}
+
+
+ easy-agents-rag-enhance
+ easy-agents-rag-enhance
+
+
+ 8
+ 8
+ UTF-8
+
+
+
+
+ com.easyagents
+ easy-agents-core
+
+
+ com.easyagents
+ easy-agents-rag-core
+
+
+ com.easyagents
+ easy-agents-rag-ingestion
+
+
+
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/pom.xml b/easy-agents-rag/easy-agents-rag-ingestion/pom.xml
new file mode 100644
index 0000000..2427b5e
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/pom.xml
@@ -0,0 +1,37 @@
+
+
+ 4.0.0
+
+
+ com.easyagents
+ easy-agents-rag
+ ${revision}
+
+
+ easy-agents-rag-ingestion
+ easy-agents-rag-ingestion
+
+
+ 8
+ 8
+ UTF-8
+
+
+
+
+ com.easyagents
+ easy-agents-core
+
+
+ com.easyagents
+ easy-agents-rag-core
+
+
+ junit
+ junit
+ test
+
+
+
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/DefaultRagIngestionService.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/DefaultRagIngestionService.java
new file mode 100644
index 0000000..bf1ab88
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/DefaultRagIngestionService.java
@@ -0,0 +1,41 @@
+package com.easyagents.rag.ingestion;
+
+import com.easyagents.rag.core.RagChunk;
+import com.easyagents.rag.ingestion.analysis.DocumentStructureAnalyzer;
+import com.easyagents.rag.ingestion.chunk.RagSplitStrategyRegistry;
+import com.easyagents.rag.ingestion.model.AnalysisResult;
+import com.easyagents.rag.ingestion.model.StrategyConfig;
+import com.easyagents.rag.ingestion.recommend.SplitStrategyRecommender;
+
+import java.util.List;
+
+/**
+ * {@link RagIngestionService} that delegates every operation to the three
+ * collaborators handed to its constructor.
+ */
+public class DefaultRagIngestionService implements RagIngestionService {
+
+    private final DocumentStructureAnalyzer documentStructureAnalyzer;
+    private final SplitStrategyRecommender splitStrategyRecommender;
+    private final RagSplitStrategyRegistry ragSplitStrategyRegistry;
+
+    /**
+     * @param documentStructureAnalyzer analyzer used by {@link #analyze}
+     * @param splitStrategyRecommender  recommender used by {@link #analyze} and {@link #toStrategyLabel}
+     * @param ragSplitStrategyRegistry  registry used by {@link #split}
+     */
+    public DefaultRagIngestionService(DocumentStructureAnalyzer documentStructureAnalyzer,
+                                      SplitStrategyRecommender splitStrategyRecommender,
+                                      RagSplitStrategyRegistry ragSplitStrategyRegistry) {
+        this.documentStructureAnalyzer = documentStructureAnalyzer;
+        this.splitStrategyRecommender = splitStrategyRecommender;
+        this.ragSplitStrategyRegistry = ragSplitStrategyRegistry;
+    }
+
+    /**
+     * Runs the structure analyzer on the raw content and passes its result
+     * through the recommender, whose return value is handed back.
+     */
+    @Override
+    public AnalysisResult analyze(String rawContent, String sourceFormat) {
+        AnalysisResult result = documentStructureAnalyzer.analyze(rawContent, sourceFormat);
+        return splitStrategyRecommender.recommend(result);
+    }
+
+    /** Splits the analyzed document through the strategy registry. */
+    @Override
+    public List<RagChunk> split(AnalysisResult analysis, StrategyConfig config) {
+        return ragSplitStrategyRegistry.split(analysis, config);
+    }
+
+    /** Asks the recommender for the label of {@code strategyCode}. */
+    @Override
+    public String toStrategyLabel(String strategyCode) {
+        return splitStrategyRecommender.toStrategyLabel(strategyCode);
+    }
+}
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/RagIngestionService.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/RagIngestionService.java
new file mode 100644
index 0000000..6efe60a
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/RagIngestionService.java
@@ -0,0 +1,16 @@
+package com.easyagents.rag.ingestion;
+
+import com.easyagents.rag.core.RagChunk;
+import com.easyagents.rag.ingestion.model.AnalysisResult;
+import com.easyagents.rag.ingestion.model.StrategyConfig;
+
+import java.util.List;
+
+/**
+ * Entry point of the ingestion module: analyze a document, split it into
+ * chunks, and translate strategy codes into labels.
+ */
+public interface RagIngestionService {
+
+    /**
+     * Analyzes a document.
+     *
+     * @param rawContent   document text as extracted from the source
+     * @param sourceFormat format of the source document
+     * @return the analysis of the document
+     */
+    AnalysisResult analyze(String rawContent, String sourceFormat);
+
+    /**
+     * Splits an analyzed document into chunks.
+     *
+     * @param analysis result of a previous {@link #analyze} call
+     * @param config   split settings
+     * @return the chunks
+     */
+    List<RagChunk> split(AnalysisResult analysis, StrategyConfig config);
+
+    /**
+     * @param strategyCode a split strategy code
+     * @return the label for that code
+     */
+    String toStrategyLabel(String strategyCode);
+}
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/analysis/DocumentStructureAnalyzer.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/analysis/DocumentStructureAnalyzer.java
new file mode 100644
index 0000000..3baa447
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/analysis/DocumentStructureAnalyzer.java
@@ -0,0 +1,234 @@
+package com.easyagents.rag.ingestion.analysis;
+
+import com.easyagents.core.util.StringUtil;
+import com.easyagents.rag.ingestion.model.AnalysisResult;
+
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class DocumentStructureAnalyzer {
+
+ private static final Pattern MARKDOWN_HEADING = Pattern.compile("^#{1,6}\\s+\\S+.*$");
+ private static final Pattern CHINESE_CHAPTER = Pattern.compile("^第[一二三四五六七八九十百零两0-9]+[章节篇部分卷]\\s*.*$");
+ private static final Pattern CHINESE_SECTION = Pattern.compile("^[一二三四五六七八九十百零]+[、..]\\s*\\S+.*$");
+ private static final Pattern CHINESE_SUBSECTION = Pattern.compile("^[((][一二三四五六七八九十百零0-9]+[))]\\s*\\S+.*$");
+ private static final Pattern NUMERIC_SECTION = Pattern.compile("^[0-9]+(\\.[0-9]+){0,4}\\s+\\S+.*$");
+ private static final Pattern ENGLISH_SECTION = Pattern.compile("^(Chapter|Section|Part)\\s+[0-9IVXLC]+([.:\\-\\s].*)?$", Pattern.CASE_INSENSITIVE);
+ private static final Pattern ENGLISH_ROMAN = Pattern.compile("^[IVXLC]+[.、)\\s-]+\\S+.*$");
+ private static final Pattern QUESTION_LINE = Pattern.compile("^(Q|QUESTION|问|问题|FAQ\\s*[0-9]+)\\s*[.::-]\\s*.+$", Pattern.CASE_INSENSITIVE);
+ private static final Pattern ANSWER_LINE = Pattern.compile("^(A|ANSWER|答|答案)\\s*[.::-]\\s*.+$", Pattern.CASE_INSENSITIVE);
+ private static final Pattern PAGE_NUMBER = Pattern.compile("^(第?\\s*\\d+\\s*页|page\\s+\\d+|\\d+)\\s*$", Pattern.CASE_INSENSITIVE);
+ private static final Pattern TOC_LINE = Pattern.compile("^.{2,80}[.·•…]{2,}\\s*\\d+\\s*$");
+
+ public AnalysisResult analyze(String rawContent, String sourceFormat) {
+ String normalizedContent = normalize(rawContent);
+ List lines = toLines(normalizedContent);
+
+ int markdownHeadingCount = 0;
+ int outlineHeadingCount = 0;
+ int qaQuestionCount = 0;
+ int qaAnswerCount = 0;
+ int pairedQaCount = 0;
+ int tocLineCount = 0;
+ int shortLineCount = 0;
+ Set markdownLevels = new HashSet();
+
+ for (int i = 0; i < lines.size(); i++) {
+ String line = lines.get(i);
+ if (MARKDOWN_HEADING.matcher(line).matches()) {
+ markdownHeadingCount++;
+ markdownLevels.add(Integer.valueOf(countMarkdownLevel(line)));
+ }
+ if (isOutlineHeading(line)) {
+ outlineHeadingCount++;
+ }
+ if (QUESTION_LINE.matcher(line).matches()) {
+ qaQuestionCount++;
+ if (hasAnswerNearby(lines, i)) {
+ pairedQaCount++;
+ }
+ }
+ if (ANSWER_LINE.matcher(line).matches()) {
+ qaAnswerCount++;
+ }
+ if (TOC_LINE.matcher(line).matches()) {
+ tocLineCount++;
+ }
+ if (line.length() <= 20) {
+ shortLineCount++;
+ }
+ }
+
+ int paragraphCount = 0;
+ int longParagraphCount = 0;
+ List paragraphs = splitParagraphs(normalizedContent);
+ for (String paragraph : paragraphs) {
+ if (StringUtil.hasText(paragraph)) {
+ paragraphCount++;
+ if (paragraph.length() > 800) {
+ longParagraphCount++;
+ }
+ }
+ }
+
+ Map features = new LinkedHashMap();
+ features.put("lineCount", Integer.valueOf(lines.size()));
+ features.put("paragraphCount", Integer.valueOf(paragraphCount));
+ features.put("markdownHeadingCount", Integer.valueOf(markdownHeadingCount));
+ features.put("markdownLevelVariety", Integer.valueOf(markdownLevels.size()));
+ features.put("outlineHeadingCount", Integer.valueOf(outlineHeadingCount));
+ features.put("qaQuestionCount", Integer.valueOf(qaQuestionCount));
+ features.put("qaAnswerCount", Integer.valueOf(qaAnswerCount));
+ features.put("pairedQaCount", Integer.valueOf(pairedQaCount));
+ features.put("tocLineCount", Integer.valueOf(tocLineCount));
+ features.put("shortLineRatio", lines.isEmpty() ? Double.valueOf(0D) : Double.valueOf((double) shortLineCount / (double) lines.size()));
+ features.put("longParagraphCount", Integer.valueOf(longParagraphCount));
+
+ AnalysisResult result = new AnalysisResult();
+ result.setSourceFormat(sourceFormat == null ? "" : sourceFormat.toLowerCase(Locale.ROOT));
+ result.setNormalizedContent(normalizedContent);
+ result.setFeatures(features);
+ return result;
+ }
+
+ private String normalize(String rawContent) {
+ if (!StringUtil.hasText(rawContent)) {
+ return "";
+ }
+
+ String content = rawContent
+ .replace("\uFEFF", "")
+ .replace("\u200B", "")
+ .replace("\r\n", "\n")
+ .replace('\r', '\n')
+ .replace('\u00A0', ' ')
+ .replace(':', ':');
+
+ List originalLines = toLines(content);
+ Map lineCounts = new HashMap();
+ for (String line : originalLines) {
+ if (line.length() >= 4 && line.length() <= 60) {
+ Integer count = lineCounts.get(line);
+ lineCounts.put(line, count == null ? Integer.valueOf(1) : Integer.valueOf(count.intValue() + 1));
+ }
+ }
+
+ List filteredLines = new ArrayList();
+ for (String line : originalLines) {
+ if (!StringUtil.hasText(line)) {
+ filteredLines.add("");
+ continue;
+ }
+ if (PAGE_NUMBER.matcher(line).matches()) {
+ continue;
+ }
+ Integer repeated = lineCounts.get(line);
+ if (repeated != null && repeated.intValue() >= 3 && line.length() <= 40) {
+ continue;
+ }
+ filteredLines.add(line);
+ }
+
+ List mergedLines = new ArrayList();
+ for (String line : filteredLines) {
+ if (mergedLines.isEmpty()) {
+ mergedLines.add(line);
+ continue;
+ }
+ String previous = mergedLines.get(mergedLines.size() - 1);
+ if (!StringUtil.hasText(previous) || !StringUtil.hasText(line)) {
+ mergedLines.add(line);
+ continue;
+ }
+ if (shouldMerge(previous, line)) {
+ mergedLines.set(mergedLines.size() - 1, previous + joinToken(previous, line) + line);
+ } else {
+ mergedLines.add(line);
+ }
+ }
+
+ return String.join("\n", mergedLines)
+ .replaceAll("[ \\t]{2,}", " ")
+ .replaceAll("\\n{3,}", "\n\n")
+ .trim();
+ }
+
+ private boolean shouldMerge(String previous, String current) {
+ if (isHeading(previous) || isHeading(current)) {
+ return false;
+ }
+ if (QUESTION_LINE.matcher(current).matches() || ANSWER_LINE.matcher(current).matches()) {
+ return false;
+ }
+ if (TOC_LINE.matcher(previous).matches() || TOC_LINE.matcher(current).matches()) {
+ return false;
+ }
+ char previousChar = previous.charAt(previous.length() - 1);
+ if ("。!?.!?:;:;".indexOf(previousChar) >= 0) {
+ return false;
+ }
+ return current.length() < 80;
+ }
+
+ private String joinToken(String previous, String current) {
+ char last = previous.charAt(previous.length() - 1);
+ char first = current.charAt(0);
+ if (Character.isLetterOrDigit(last) && Character.isLetterOrDigit(first)) {
+ return " ";
+ }
+ return "";
+ }
+
+ private boolean hasAnswerNearby(List lines, int index) {
+ int end = Math.min(lines.size(), index + 4);
+ for (int i = index + 1; i < end; i++) {
+ if (ANSWER_LINE.matcher(lines.get(i)).matches()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private boolean isHeading(String line) {
+ return MARKDOWN_HEADING.matcher(line).matches() || isOutlineHeading(line);
+ }
+
+ private boolean isOutlineHeading(String line) {
+ return CHINESE_CHAPTER.matcher(line).matches()
+ || CHINESE_SECTION.matcher(line).matches()
+ || CHINESE_SUBSECTION.matcher(line).matches()
+ || NUMERIC_SECTION.matcher(line).matches()
+ || ENGLISH_SECTION.matcher(line).matches()
+ || ENGLISH_ROMAN.matcher(line).matches();
+ }
+
+ private int countMarkdownLevel(String line) {
+ Matcher matcher = Pattern.compile("^(#{1,6})\\s+").matcher(line);
+ if (!matcher.find()) {
+ return 0;
+ }
+ return matcher.group(1).length();
+ }
+
+ private List splitParagraphs(String normalizedContent) {
+ String[] parts = normalizedContent.split("\\n\\s*\\n");
+ List paragraphs = new ArrayList();
+ for (String part : parts) {
+ String paragraph = part.trim();
+ if (StringUtil.hasText(paragraph)) {
+ paragraphs.add(paragraph);
+ }
+ }
+ return paragraphs;
+ }
+
+ private List toLines(String content) {
+ String[] rawLines = content.split("\\n");
+ List lines = new ArrayList(rawLines.length);
+ for (String rawLine : rawLines) {
+ lines.add(rawLine == null ? "" : rawLine.trim());
+ }
+ return lines;
+ }
+}
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java
new file mode 100644
index 0000000..0be40ac
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java
@@ -0,0 +1,388 @@
+package com.easyagents.rag.ingestion.chunk;
+
+import com.easyagents.core.document.Document;
+import com.easyagents.core.document.DocumentSplitter;
+import com.easyagents.core.document.splitter.RegexDocumentSplitter;
+import com.easyagents.core.document.splitter.SimpleDocumentSplitter;
+import com.easyagents.core.util.StringUtil;
+import com.easyagents.rag.core.*;
+import com.easyagents.rag.ingestion.model.AnalysisResult;
+import com.easyagents.rag.ingestion.model.StrategyConfig;
+
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class RagSplitStrategyRegistry {
+
+ // Markdown heading line: group 1 is the run of '#' (1 to 6), group 2 the rest of the line.
+ private static final Pattern MARKDOWN_HEADING = Pattern.compile("^(#{1,6})\\s+(.*)$");
+ // Question line ("Q", "QUESTION", "问", "问题" or "FAQ <n>" followed by a separator); group 2 is the question text.
+ private static final Pattern QUESTION_PREFIX = Pattern.compile("^(Q|QUESTION|问|问题|FAQ\\s*[0-9]+)\\s*[.::-]\\s*(.+)$", Pattern.CASE_INSENSITIVE);
+ // Answer line ("A", "ANSWER", "答" or "答案" followed by a separator); group 2 is the answer text.
+ private static final Pattern ANSWER_PREFIX = Pattern.compile("^(A|ANSWER|答|答案)\\s*[.::-]\\s*(.+)$", Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Splits the normalized content of {@code analysisResult} with the strategy
+     * named in {@code strategyConfig}.
+     *
+     * <p>A blank code or {@code AUTO} means "use the strategy recommended by the
+     * analysis". Any code that is not Markdown, outline, QA or custom regex
+     * (including {@code PARAGRAPH_LENGTH} and {@code null}) falls back to
+     * length-based paragraph splitting.
+     *
+     * @param analysisResult analysis carrying the normalized content and the recommended strategy
+     * @param strategyConfig split settings; must not be {@code null}
+     * @return the chunks produced by the selected strategy
+     */
+    public List<RagChunk> split(AnalysisResult analysisResult, StrategyConfig strategyConfig) {
+        String strategyCode = strategyConfig.getStrategyCode();
+        if (!StringUtil.hasText(strategyCode) || RagStrategyCodes.AUTO.equals(strategyCode)) {
+            strategyCode = analysisResult.getRecommendedStrategyCode();
+        }
+        String normalizedContent = analysisResult.getNormalizedContent();
+        if (RagStrategyCodes.MARKDOWN_SECTION.equals(strategyCode)) {
+            return buildMarkdownChunks(normalizedContent, strategyConfig);
+        }
+        if (RagStrategyCodes.OUTLINE_SECTION.equals(strategyCode)) {
+            return buildOutlineChunks(normalizedContent, strategyConfig);
+        }
+        if (RagStrategyCodes.QA_PAIR.equals(strategyCode)) {
+            return buildQaChunks(normalizedContent, strategyConfig);
+        }
+        if (RagStrategyCodes.CUSTOM_REGEX.equals(strategyCode)) {
+            return buildRegexChunks(normalizedContent, strategyConfig);
+        }
+        return buildParagraphChunks(normalizedContent, strategyConfig);
+    }
+
+    /**
+     * Groups the lines of {@code content} into sections, one per Markdown
+     * heading, and turns the sections into chunks.
+     *
+     * <p>Lines that precede the first heading are collected in a section
+     * labelled "未命名段落".
+     */
+    private List<RagChunk> buildMarkdownChunks(String content, StrategyConfig strategyConfig) {
+        List<SectionChunk> sections = new ArrayList<>();
+        // Open headings; the last element is the deepest one.
+        Deque<HeadingLevel> stack = new ArrayDeque<>();
+        SectionChunk current = null;
+        for (String rawLine : content.split("\\n")) {
+            String line = rawLine.trim();
+            Matcher matcher = MARKDOWN_HEADING.matcher(line);
+            if (matcher.matches()) {
+                if (current != null) {
+                    sections.add(current);
+                }
+                int level = matcher.group(1).length();
+                String title = matcher.group(2).trim();
+                // close every open heading at the same or a deeper level
+                while (!stack.isEmpty() && stack.peekLast().level >= level) {
+                    stack.removeLast();
+                }
+                stack.addLast(new HeadingLevel(level, title));
+                current = new SectionChunk(copyPath(stack), title);
+                current.lines.add(line);
+            } else {
+                if (current == null) {
+                    current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
+                }
+                current.lines.add(rawLine);
+            }
+        }
+        if (current != null) {
+            sections.add(current);
+        }
+        return finalizeSectionChunks(sections, strategyConfig);
+    }
+
+    /**
+     * Groups the lines of {@code content} into sections, one per outline
+     * heading recognised by {@code OutlineHeading.parse}, and turns the
+     * sections into chunks.
+     *
+     * <p>Lines that precede the first heading are collected in a section
+     * labelled "未命名段落".
+     */
+    private List<RagChunk> buildOutlineChunks(String content, StrategyConfig strategyConfig) {
+        List<SectionChunk> sections = new ArrayList<>();
+        // Open headings; the last element is the deepest one.
+        Deque<HeadingLevel> stack = new ArrayDeque<>();
+        SectionChunk current = null;
+        for (String rawLine : content.split("\\n")) {
+            String line = rawLine.trim();
+            OutlineHeading heading = OutlineHeading.parse(line);
+            if (heading != null) {
+                if (current != null) {
+                    sections.add(current);
+                }
+                // close every open heading at the same or a deeper level
+                while (!stack.isEmpty() && stack.peekLast().level >= heading.level) {
+                    stack.removeLast();
+                }
+                stack.addLast(new HeadingLevel(heading.level, heading.title));
+                current = new SectionChunk(copyPath(stack), heading.title);
+                current.lines.add(line);
+            } else {
+                if (current == null) {
+                    current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
+                }
+                current.lines.add(rawLine);
+            }
+        }
+        if (current != null) {
+            sections.add(current);
+        }
+        return finalizeSectionChunks(sections, strategyConfig);
+    }
+
+    /**
+     * Converts collected sections into chunks of type {@code SECTION}.
+     *
+     * <p>A section whose text is blank, or consists of nothing but its own
+     * label, is skipped. A section longer than the effective chunk size is cut
+     * into several parts that share label and heading path and are numbered
+     * through {@code partNo}/{@code partTotal}.
+     */
+    private List<RagChunk> finalizeSectionChunks(List<SectionChunk> sections, StrategyConfig strategyConfig) {
+        List<RagChunk> result = new ArrayList<>();
+        int chunkSize = safeChunkSize(strategyConfig);
+        int index = 1;
+        for (SectionChunk section : sections) {
+            String content = joinAndTrim(section.lines);
+            if (!StringUtil.hasText(content) || content.equals(section.sourceLabel)) {
+                continue;
+            }
+            if (content.length() <= chunkSize) {
+                result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, content, index++, 1, 1));
+                continue;
+            }
+            List<String> subContents = splitLongContent(content, chunkSize);
+            int total = subContents.size();
+            for (int i = 0; i < total; i++) {
+                result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, subContents.get(i), index++, i + 1, total));
+            }
+        }
+        return postProcess(result);
+    }
+
+    /**
+     * Builds question/answer chunks from {@code content}.
+     *
+     * <p>A question line starts a new pair and flushes the previous one. An
+     * answer line adds to the answer of the current pair. Any other non-blank
+     * line continues the answer when one has started, otherwise the question;
+     * lines seen before the first question are dropped. Blank lines are ignored.
+     */
+    private List<RagChunk> buildQaChunks(String content, StrategyConfig strategyConfig) {
+        List<RagChunk> result = new ArrayList<>();
+        String currentQuestion = null;
+        StringBuilder answerBuilder = new StringBuilder();
+        StringBuilder questionBuilder = new StringBuilder();
+        int qaIndex = 1;
+
+        for (String rawLine : content.split("\\n")) {
+            String line = rawLine.trim();
+            if (!StringUtil.hasText(line)) {
+                continue;
+            }
+            Matcher questionMatcher = QUESTION_PREFIX.matcher(line);
+            if (questionMatcher.matches()) {
+                qaIndex = flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
+                currentQuestion = questionMatcher.group(2).trim();
+                questionBuilder = new StringBuilder(currentQuestion);
+                answerBuilder = new StringBuilder();
+                continue;
+            }
+            Matcher answerMatcher = ANSWER_PREFIX.matcher(line);
+            if (answerMatcher.matches()) {
+                if (answerBuilder.length() > 0) {
+                    answerBuilder.append('\n');
+                }
+                answerBuilder.append(answerMatcher.group(2).trim());
+                continue;
+            }
+            if (answerBuilder.length() > 0) {
+                answerBuilder.append('\n').append(line);
+            } else if (questionBuilder.length() > 0) {
+                questionBuilder.append('\n').append(line);
+            }
+        }
+        flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
+        return postProcess(result);
+    }
+
+    /**
+     * Appends the chunk(s) of the pending question/answer pair to {@code result}.
+     *
+     * <p>Nothing is added when there is no current question or no answer text.
+     * Every emitted part carries the complete question and answer in its
+     * {@code question}/{@code answer} fields and the same {@code QA_GROUP_ID}.
+     *
+     * @return the index for the next pair: {@code qaIndex + 1} when a pair was
+     *         emitted, {@code qaIndex} otherwise
+     */
+    private int flushQaChunk(List<RagChunk> result,
+                             String currentQuestion,
+                             StringBuilder questionBuilder,
+                             StringBuilder answerBuilder,
+                             int qaIndex,
+                             StrategyConfig strategyConfig) {
+        if (!StringUtil.hasText(currentQuestion)) {
+            return qaIndex;
+        }
+        if (!StringUtil.hasText(answerBuilder.toString())) {
+            return qaIndex;
+        }
+        String question = questionBuilder.toString().trim();
+        String answer = answerBuilder.toString().trim();
+        String baseContent = "问题:" + question + "\n答案:" + answer;
+
+        List<String> subContents;
+        if (baseContent.length() > safeChunkSize(strategyConfig)) {
+            subContents = splitLongContent(baseContent, strategyConfig.getChunkSize());
+        } else {
+            subContents = Collections.singletonList(baseContent);
+        }
+
+        int total = subContents.size();
+        for (int i = 0; i < total; i++) {
+            RagChunk chunk = createChunk(RagChunkTypes.QA_PAIR, "Q" + qaIndex + " " + question, Collections.emptyList(), subContents.get(i), result.size() + 1, i + 1, total);
+            chunk.setQuestion(question);
+            chunk.setAnswer(answer);
+            chunk.getOptions().put(RagMetadataKeys.QA_GROUP_ID, "qa-" + qaIndex);
+            result.add(chunk);
+        }
+        return qaIndex + 1;
+    }
+
+    /**
+     * Splits {@code content} with a {@code SimpleDocumentSplitter} configured
+     * with the effective chunk size and overlap, and wraps every resulting
+     * document in a {@code PARAGRAPH} chunk labelled "分块 n".
+     */
+    private List<RagChunk> buildParagraphChunks(String content, StrategyConfig strategyConfig) {
+        List<RagChunk> result = new ArrayList<>();
+        // NOTE(review): the overlap default (128) can exceed a small explicit chunk size;
+        // confirm that SimpleDocumentSplitter copes with overlap >= chunk size.
+        DocumentSplitter splitter = new SimpleDocumentSplitter(safeChunkSize(strategyConfig), safeOverlap(strategyConfig));
+        List<Document> docs = splitter.split(new Document(content));
+        int index = 1;
+        for (Document doc : docs) {
+            result.add(createChunk(RagChunkTypes.PARAGRAPH, "分块 " + index, Collections.emptyList(), doc.getContent(), index, 1, 1));
+            index++;
+        }
+        return postProcess(result);
+    }
+
+    /**
+     * Splits {@code content} with a {@code RegexDocumentSplitter} and wraps
+     * every resulting document in a {@code PARAGRAPH} chunk labelled
+     * "正则分块 n". When the configuration carries no regex, blank lines are
+     * used as the separator.
+     */
+    private List<RagChunk> buildRegexChunks(String content, StrategyConfig strategyConfig) {
+        // NOTE(review): the regex comes from the caller; an invalid or pathological
+        // expression is presumably surfaced by RegexDocumentSplitter — confirm.
+        String regex = StringUtil.hasText(strategyConfig.getRegex()) ? strategyConfig.getRegex() : "\\n\\s*\\n";
+        DocumentSplitter splitter = new RegexDocumentSplitter(regex);
+        List<Document> docs = splitter.split(new Document(content));
+        List<RagChunk> result = new ArrayList<>();
+        int index = 1;
+        for (Document doc : docs) {
+            result.add(createChunk(RagChunkTypes.PARAGRAPH, "正则分块 " + index, Collections.emptyList(), doc.getContent(), index, 1, 1));
+            index++;
+        }
+        return postProcess(result);
+    }
+
+    /**
+     * Cuts {@code content} into parts of at most {@code chunkSize} characters.
+     *
+     * <p>Paragraphs (separated by blank lines) are packed greedily: a part is
+     * closed as soon as adding the next paragraph plus the two-character
+     * separator would exceed the size. A paragraph that is itself longer than
+     * the size is first sliced into pieces of roughly equal length, so no part
+     * exceeds the limit even when the text contains no blank line at all.
+     *
+     * @param content   text to cut
+     * @param chunkSize maximum part length; {@code null} or a non-positive value
+     *                  selects {@code RagDefaults.CHUNK_SIZE}
+     * @return the parts in document order; never empty (blank input is returned as its only element)
+     */
+    private List<String> splitLongContent(String content, Integer chunkSize) {
+        int size = chunkSize == null || chunkSize <= 0 ? RagDefaults.CHUNK_SIZE : chunkSize;
+        List<String> parts = new ArrayList<>();
+        StringBuilder current = new StringBuilder();
+        for (String paragraph : content.split("\\n\\s*\\n")) {
+            String text = paragraph.trim();
+            if (!StringUtil.hasText(text)) {
+                continue;
+            }
+            for (String unit : sliceToSize(text, size)) {
+                if (current.length() > 0 && current.length() + unit.length() + 2 > size) {
+                    parts.add(current.toString().trim());
+                    current = new StringBuilder();
+                }
+                if (current.length() > 0) {
+                    current.append("\n\n");
+                }
+                current.append(unit);
+            }
+        }
+        if (current.length() > 0) {
+            parts.add(current.toString().trim());
+        }
+        if (parts.isEmpty()) {
+            parts.add(content);
+        }
+        return parts;
+    }
+
+    /**
+     * Returns {@code text} unchanged when it fits into {@code size}; otherwise
+     * slices it into consecutive pieces of at most {@code size} characters.
+     * Pieces are balanced in length so that no tiny remainder is left over, and
+     * a cut is moved one character back rather than separating a surrogate pair.
+     */
+    private List<String> sliceToSize(String text, int size) {
+        if (text.length() <= size) {
+            return Collections.singletonList(text);
+        }
+        List<String> slices = new ArrayList<>();
+        int length = text.length();
+        int start = 0;
+        while (length - start > size) {
+            int remaining = length - start;
+            int piecesLeft = (remaining + size - 1) / size;
+            // ceil(remaining / piecesLeft) is never larger than size
+            int end = start + (remaining + piecesLeft - 1) / piecesLeft;
+            if (end - start > 1 && Character.isHighSurrogate(text.charAt(end - 1))) {
+                end--;
+            }
+            String piece = text.substring(start, end);
+            if (StringUtil.hasText(piece)) {
+                slices.add(piece);
+            }
+            start = end;
+        }
+        String tail = text.substring(start);
+        if (StringUtil.hasText(tail)) {
+            slices.add(tail);
+        }
+        return slices;
+    }
+
+    /**
+     * Filters and finalizes a chunk list.
+     *
+     * <p>Dropped: chunks with blank content, non-QA chunks shorter than 10
+     * characters, and chunks whose whitespace-collapsed content was already
+     * seen. Surviving chunks get a sequential id ("chunk-1", "chunk-2", ...),
+     * their character count, and a token estimate of one token per four
+     * characters (at least 1).
+     *
+     * <p>{@code partNo}/{@code partTotal} of the surviving chunks are left as
+     * they were, even when sibling parts have been dropped.
+     */
+    private List<RagChunk> postProcess(List<RagChunk> chunks) {
+        List<RagChunk> result = new ArrayList<>();
+        Set<String> dedup = new HashSet<>();
+        int index = 1;
+        for (RagChunk chunk : chunks) {
+            String content = chunk.getContent() == null ? "" : chunk.getContent().trim();
+            if (!StringUtil.hasText(content)) {
+                continue;
+            }
+            if (content.length() < 10 && !RagChunkTypes.QA_PAIR.equals(chunk.getChunkType())) {
+                continue;
+            }
+            String dedupKey = content.replaceAll("\\s+", " ");
+            if (!dedup.add(dedupKey)) {
+                continue;
+            }
+            chunk.setChunkId("chunk-" + index);
+            chunk.setCharCount(content.length());
+            chunk.setTokenEstimate(Math.max(1, content.length() / 4));
+            result.add(chunk);
+            index++;
+        }
+        return result;
+    }
+
+    /**
+     * Creates a chunk with a provisional id of "chunk-" + {@code index}.
+     *
+     * <p>The heading path is copied, and also stored under
+     * {@code RagMetadataKeys.HEADING_PATH} when it is not empty. For
+     * {@code SECTION} chunks the label is stored under
+     * {@code RagMetadataKeys.SOURCE_LABEL} as well.
+     *
+     * @param content     chunk text, stored trimmed; {@code null} is stored as {@code null}
+     * @param headingPath enclosing heading titles; {@code null} is treated as empty
+     */
+    private RagChunk createChunk(String chunkType,
+                                 String sourceLabel,
+                                 List<String> headingPath,
+                                 String content,
+                                 int index,
+                                 int partNo,
+                                 int partTotal) {
+        List<String> path = headingPath == null ? Collections.<String>emptyList() : headingPath;
+        RagChunk chunk = new RagChunk();
+        chunk.setChunkId("chunk-" + index);
+        chunk.setChunkType(chunkType);
+        chunk.setSourceLabel(sourceLabel);
+        chunk.setHeadingPath(new ArrayList<>(path));
+        // postProcess tolerates null content, so do not fail on it here
+        chunk.setContent(content == null ? null : content.trim());
+        chunk.setPartNo(partNo);
+        chunk.setPartTotal(partTotal);
+        if (!path.isEmpty()) {
+            chunk.getOptions().put(RagMetadataKeys.HEADING_PATH, new ArrayList<>(path));
+        }
+        if (RagChunkTypes.SECTION.equals(chunkType)) {
+            chunk.getOptions().put(RagMetadataKeys.SOURCE_LABEL, sourceLabel);
+        }
+        return chunk;
+    }
+
+ private int safeChunkSize(StrategyConfig strategyConfig) { // Returns the configured chunk size, or RagDefaults.CHUNK_SIZE when it is null or not positive.
+ Integer chunkSize = strategyConfig.getChunkSize();
+ return chunkSize == null || chunkSize.intValue() <= 0 ? RagDefaults.CHUNK_SIZE : chunkSize.intValue();
+ }
+
+ private int safeOverlap(StrategyConfig strategyConfig) { // Returns the configured overlap, or RagDefaults.OVERLAP_SIZE when it is null or negative (0 is accepted).
+ Integer overlapSize = strategyConfig.getOverlapSize();
+ return overlapSize == null || overlapSize.intValue() < 0 ? RagDefaults.OVERLAP_SIZE : overlapSize.intValue();
+ }
+
+ private String joinAndTrim(List lines) { // Joins lines with "\n", trims the result and limits consecutive newlines to two.
+ String value = String.join("\n", lines).trim();
+ return value.replaceAll("\\n{3,}", "\n\n"); // runs of three or more newlines become exactly one blank line
+ }
+
+ private List copyPath(Deque stack) { // Returns the titles of the HeadingLevel entries in stack as a new list, in reverse iteration order.
+ List path = new ArrayList();
+ for (HeadingLevel item : stack) {
+ path.add(0, item.title); // insert at the front: the element iterated last ends up first in the path
+ }
+ return path;
+ }
+
+ private static class SectionChunk { // Holder for one section: its heading path, its source label and the lines collected for it.
+ private final List headingPath; // stored by reference, not copied
+ private final String sourceLabel;
+ private final List lines = new ArrayList(); // starts empty; the constructor does not fill it
+
+ private SectionChunk(List headingPath, String sourceLabel) {
+ this.headingPath = headingPath;
+ this.sourceLabel = sourceLabel;
+ }
+ }
+
+ private static class HeadingLevel { // Immutable pair of a heading level and its title; copyPath reads the title.
+ private final int level;
+ private final String title;
+
+ private HeadingLevel(int level, String title) {
+ this.level = level;
+ this.title = title;
+ }
+ }
+
+ private static class OutlineHeading { // Heading recognized from a plain-text line: a nesting level plus the trimmed line as title.
+ private static final Pattern CHINESE_CHAPTER = Pattern.compile("^第[一二三四五六七八九十百零两0-9]+[章节篇部分卷]\\s*(.*)$"); // "第" + numeral + unit character, optional title
+ private static final Pattern CHINESE_SECTION = Pattern.compile("^([一二三四五六七八九十百零]+[、..])\\s*(\\S+.*)$"); // Chinese numeral + "、" or "." + title
+ private static final Pattern CHINESE_SUBSECTION = Pattern.compile("^[((]([一二三四五六七八九十百零0-9]+)[))]\\s*(\\S+.*)$"); // parenthesized numeral + title
+ private static final Pattern NUMERIC_SECTION = Pattern.compile("^([0-9]+(?:\\.[0-9]+){0,4})\\s+(\\S+.*)$"); // dotted number with 1 to 5 components, whitespace, title
+ private static final Pattern ENGLISH_SECTION = Pattern.compile("^(Chapter|Section|Part)\\s+([0-9IVXLC]+)(.*)$", Pattern.CASE_INSENSITIVE); // keyword in any letter case + number
+ private static final Pattern ENGLISH_ROMAN = Pattern.compile("^([IVXLC]+)[.、)\\s-]+(\\S+.*)$"); // NOTE(review): also matches ordinary lines starting with a capital I/V/X/L/C plus a space (e.g. "I agree ...") — confirm this is acceptable
+
+ private final int level;
+ private final String title;
+
+ private OutlineHeading(int level, String title) {
+ this.level = level;
+ this.title = title;
+ }
+
+ private static OutlineHeading parse(String line) { // Returns the heading for line, or null when no pattern matches; patterns are tried in order and the first match wins.
+ Matcher matcher = CHINESE_CHAPTER.matcher(line); // matching uses the untrimmed line, so leading whitespace prevents a match
+ if (matcher.matches()) {
+ return new OutlineHeading(1, line.trim());
+ }
+ matcher = CHINESE_SECTION.matcher(line);
+ if (matcher.matches()) {
+ return new OutlineHeading(2, line.trim());
+ }
+ matcher = CHINESE_SUBSECTION.matcher(line);
+ if (matcher.matches()) {
+ return new OutlineHeading(3, line.trim());
+ }
+ matcher = NUMERIC_SECTION.matcher(line);
+ if (matcher.matches()) {
+ String code = matcher.group(1);
+ int level = code.split("\\.").length; // level = number of dot-separated components ("1.2" -> 2)
+ return new OutlineHeading(level, line.trim());
+ }
+ matcher = ENGLISH_SECTION.matcher(line);
+ if (matcher.matches()) {
+ String prefix = matcher.group(1).toLowerCase(java.util.Locale.ROOT); // Locale.ROOT: the default-locale overload lowercases "SECTION" with a dotless i under a Turkish locale and would miss the comparison below
+ int level = "chapter".equals(prefix) ? 1 : ("section".equals(prefix) ? 2 : 1); // chapter -> 1, section -> 2, part -> 1
+ return new OutlineHeading(level, line.trim());
+ }
+ matcher = ENGLISH_ROMAN.matcher(line);
+ if (matcher.matches()) {
+ return new OutlineHeading(2, line.trim());
+ }
+ return null; // not a recognized heading
+ }
+ }
+}
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/AnalysisResult.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/AnalysisResult.java
new file mode 100644
index 0000000..c66928d
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/AnalysisResult.java
@@ -0,0 +1,92 @@
+package com.easyagents.rag.ingestion.model;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+ public class AnalysisResult implements Serializable { // Mutable bean carrying document analysis data and the split-strategy recommendation derived from it.
+
+ private String sourceFormat; // SplitStrategyRecommender compares the lower-cased value with "md", "pdf" and "docx"
+ private String normalizedContent;
+ private String recommendedStructureType; // set by SplitStrategyRecommender.recommend
+ private String recommendedStrategyCode; // set by SplitStrategyRecommender.recommend
+ private String recommendedStrategyLabel; // display label for recommendedStrategyCode
+ private Double confidence; // SplitStrategyRecommender stores a value rounded to two decimals
+ private List reasons = new ArrayList(); // defaults to an empty list
+ private List candidateStrategies = new ArrayList(); // defaults to an empty list
+ private Map features = new LinkedHashMap(); // feature name -> value; SplitStrategyRecommender reads Number entries such as "markdownHeadingCount"
+
+ public String getSourceFormat() {
+ return sourceFormat;
+ }
+
+ public void setSourceFormat(String sourceFormat) {
+ this.sourceFormat = sourceFormat;
+ }
+
+ public String getNormalizedContent() {
+ return normalizedContent;
+ }
+
+ public void setNormalizedContent(String normalizedContent) {
+ this.normalizedContent = normalizedContent;
+ }
+
+ public String getRecommendedStructureType() {
+ return recommendedStructureType;
+ }
+
+ public void setRecommendedStructureType(String recommendedStructureType) {
+ this.recommendedStructureType = recommendedStructureType;
+ }
+
+ public String getRecommendedStrategyCode() {
+ return recommendedStrategyCode;
+ }
+
+ public void setRecommendedStrategyCode(String recommendedStrategyCode) {
+ this.recommendedStrategyCode = recommendedStrategyCode;
+ }
+
+ public String getRecommendedStrategyLabel() {
+ return recommendedStrategyLabel;
+ }
+
+ public void setRecommendedStrategyLabel(String recommendedStrategyLabel) {
+ this.recommendedStrategyLabel = recommendedStrategyLabel;
+ }
+
+ public Double getConfidence() {
+ return confidence;
+ }
+
+ public void setConfidence(Double confidence) {
+ this.confidence = confidence;
+ }
+
+ public List getReasons() { // returns the internal list itself, not a copy
+ return reasons;
+ }
+
+ public void setReasons(List reasons) { // stores the given reference as-is; passing null makes getReasons() return null
+ this.reasons = reasons;
+ }
+
+ public List getCandidateStrategies() { // returns the internal list itself, not a copy
+ return candidateStrategies;
+ }
+
+ public void setCandidateStrategies(List candidateStrategies) { // stores the given reference as-is
+ this.candidateStrategies = candidateStrategies;
+ }
+
+ public Map getFeatures() { // returns the internal map itself, not a copy
+ return features;
+ }
+
+ public void setFeatures(Map features) { // stores the given reference as-is
+ this.features = features;
+ }
+}
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/CandidateStrategy.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/CandidateStrategy.java
new file mode 100644
index 0000000..0f085b8
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/CandidateStrategy.java
@@ -0,0 +1,43 @@
+package com.easyagents.rag.ingestion.model;
+
+import java.io.Serializable;
+
+ public class CandidateStrategy implements Serializable { // Mutable bean for one ranked strategy: its code, display label and score.
+
+ private String strategyCode;
+ private String strategyLabel;
+ private Double score; // SplitStrategyRecommender stores the strategy's raw score rounded to two decimals
+
+ public CandidateStrategy() { // no-arg constructor; all fields stay null
+ }
+
+ public CandidateStrategy(String strategyCode, String strategyLabel, Double score) { // assigns the three fields as given
+ this.strategyCode = strategyCode;
+ this.strategyLabel = strategyLabel;
+ this.score = score;
+ }
+
+ public String getStrategyCode() {
+ return strategyCode;
+ }
+
+ public void setStrategyCode(String strategyCode) {
+ this.strategyCode = strategyCode;
+ }
+
+ public String getStrategyLabel() {
+ return strategyLabel;
+ }
+
+ public void setStrategyLabel(String strategyLabel) {
+ this.strategyLabel = strategyLabel;
+ }
+
+ public Double getScore() {
+ return score;
+ }
+
+ public void setScore(Double score) {
+ this.score = score;
+ }
+}
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/StrategyConfig.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/StrategyConfig.java
new file mode 100644
index 0000000..b4d4da2
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/StrategyConfig.java
@@ -0,0 +1,79 @@
+package com.easyagents.rag.ingestion.model;
+
+import com.easyagents.rag.core.RagDefaults;
+import com.easyagents.rag.core.RagStrategyCodes;
+
+import java.io.Serializable;
+
+ public class StrategyConfig implements Serializable { // Mutable bean holding split-strategy settings; every field except regex starts from a RagDefaults/RagStrategyCodes constant.
+
+ private String strategyCode = RagStrategyCodes.AUTO; // defaults to AUTO
+ private Integer chunkSize = RagDefaults.CHUNK_SIZE;
+ private Integer overlapSize = RagDefaults.OVERLAP_SIZE;
+ private String regex; // no default; null until set
+ private Integer rowsPerChunk = RagDefaults.ROWS_PER_CHUNK;
+ private Integer mdSplitterLevel = RagDefaults.MD_SPLITTER_LEVEL;
+
+ public static StrategyConfig defaults() { // returns a new instance with the field defaults above
+ return new StrategyConfig();
+ }
+
+ public StrategyConfig copy() { // returns a new instance with all six fields copied from this one
+ StrategyConfig copy = new StrategyConfig();
+ copy.setStrategyCode(this.strategyCode);
+ copy.setChunkSize(this.chunkSize);
+ copy.setOverlapSize(this.overlapSize);
+ copy.setRegex(this.regex);
+ copy.setRowsPerChunk(this.rowsPerChunk);
+ copy.setMdSplitterLevel(this.mdSplitterLevel);
+ return copy;
+ }
+
+ public String getStrategyCode() {
+ return strategyCode;
+ }
+
+ public void setStrategyCode(String strategyCode) {
+ this.strategyCode = strategyCode;
+ }
+
+ public Integer getChunkSize() {
+ return chunkSize;
+ }
+
+ public void setChunkSize(Integer chunkSize) { // no validation here; null is stored as given
+ this.chunkSize = chunkSize;
+ }
+
+ public Integer getOverlapSize() {
+ return overlapSize;
+ }
+
+ public void setOverlapSize(Integer overlapSize) { // no validation here; null is stored as given
+ this.overlapSize = overlapSize;
+ }
+
+ public String getRegex() {
+ return regex;
+ }
+
+ public void setRegex(String regex) {
+ this.regex = regex;
+ }
+
+ public Integer getRowsPerChunk() {
+ return rowsPerChunk;
+ }
+
+ public void setRowsPerChunk(Integer rowsPerChunk) {
+ this.rowsPerChunk = rowsPerChunk;
+ }
+
+ public Integer getMdSplitterLevel() {
+ return mdSplitterLevel;
+ }
+
+ public void setMdSplitterLevel(Integer mdSplitterLevel) {
+ this.mdSplitterLevel = mdSplitterLevel;
+ }
+}
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/recommend/SplitStrategyRecommender.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/recommend/SplitStrategyRecommender.java
new file mode 100644
index 0000000..8c70c19
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/recommend/SplitStrategyRecommender.java
@@ -0,0 +1,133 @@
+package com.easyagents.rag.ingestion.recommend;
+
+import com.easyagents.rag.core.RagStrategyCodes;
+import com.easyagents.rag.core.RagStructureTypes;
+import com.easyagents.rag.ingestion.model.AnalysisResult;
+import com.easyagents.rag.ingestion.model.CandidateStrategy;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+import java.util.*;
+
+ public class SplitStrategyRecommender { // Scores four split strategies from an AnalysisResult's features and records the recommendation on that result.
+
+ public AnalysisResult recommend(AnalysisResult analysisResult) { // Fills the recommendation fields of analysisResult in place and returns the same instance.
+ Map features = analysisResult.getFeatures(); // must not be null: entries are read with features.get(...)
+ String sourceFormat = safeLowercase(analysisResult.getSourceFormat());
+
+ double markdownScore = number(features.get("markdownHeadingCount")) * 12 // weighted sum of Markdown heading signals
+ + number(features.get("markdownLevelVariety")) * 8
+ + ("md".equals(sourceFormat) ? 20 : 0); // bonus when the source format is "md"
+ double outlineScore = number(features.get("outlineHeadingCount")) * 10
+ + (("pdf".equals(sourceFormat) || "docx".equals(sourceFormat)) ? 5 : 0) // small bonus for pdf/docx sources
+ - number(features.get("tocLineCount")) * 4; // penalty per tocLineCount; this score can become negative
+ double qaScore = number(features.get("qaQuestionCount")) * 10
+ + number(features.get("qaAnswerCount")) * 10
+ + number(features.get("pairedQaCount")) * 18; // paired Q/A entries weigh the most
+ double plainScore = 18 // constant base, so the best score is always at least 18
+ + number(features.get("paragraphCount")) * 2
+ + number(features.get("longParagraphCount")) * 3;
+
+ Map scoreMap = new LinkedHashMap(); // insertion-ordered: strategy code -> raw score
+ scoreMap.put(RagStrategyCodes.MARKDOWN_SECTION, Double.valueOf(markdownScore));
+ scoreMap.put(RagStrategyCodes.OUTLINE_SECTION, Double.valueOf(outlineScore));
+ scoreMap.put(RagStrategyCodes.QA_PAIR, Double.valueOf(qaScore));
+ scoreMap.put(RagStrategyCodes.PARAGRAPH_LENGTH, Double.valueOf(plainScore));
+
+ List> ranking = new ArrayList>(scoreMap.entrySet());
+ ranking.sort((left, right) -> Double.compare(right.getValue().doubleValue(), left.getValue().doubleValue())); // descending by score; List.sort is stable, so ties keep insertion order
+
+ Map.Entry best = ranking.get(0);
+ Map.Entry second = ranking.size() > 1 ? ranking.get(1) : best;
+ double confidence = computeConfidence(best.getValue().doubleValue(), second.getValue().doubleValue());
+
+ String recommendedStrategy = confidence < 0.45D ? RagStrategyCodes.PARAGRAPH_LENGTH : best.getKey(); // below 0.45 the top-ranked strategy is replaced by PARAGRAPH_LENGTH
+ analysisResult.setRecommendedStrategyCode(recommendedStrategy);
+ analysisResult.setRecommendedStrategyLabel(toStrategyLabel(recommendedStrategy));
+ analysisResult.setRecommendedStructureType(toStructureType(recommendedStrategy));
+ analysisResult.setConfidence(Double.valueOf(scale(confidence))); // stored rounded to two decimals
+ analysisResult.setReasons(buildReasons(features, recommendedStrategy, confidence)); // replaces any reasons already on the result
+
+ List candidates = new ArrayList();
+ for (Map.Entry entry : ranking) { // all four strategies, best first, with their rounded raw scores
+ candidates.add(new CandidateStrategy(entry.getKey(), toStrategyLabel(entry.getKey()), Double.valueOf(scale(entry.getValue().doubleValue()))));
+ }
+ analysisResult.setCandidateStrategies(candidates);
+ return analysisResult;
+ }
+
+ public String toStructureType(String strategyCode) { // Maps a strategy code to a RagStructureTypes constant; any unlisted code maps to PLAIN_PARAGRAPH.
+ if (RagStrategyCodes.MARKDOWN_SECTION.equals(strategyCode)) {
+ return RagStructureTypes.MARKDOWN_HEADING;
+ }
+ if (RagStrategyCodes.OUTLINE_SECTION.equals(strategyCode)) {
+ return RagStructureTypes.OUTLINE_SECTION;
+ }
+ if (RagStrategyCodes.QA_PAIR.equals(strategyCode)) {
+ return RagStructureTypes.QA_PAIR;
+ }
+ return RagStructureTypes.PLAIN_PARAGRAPH;
+ }
+
+ public String toStrategyLabel(String strategyCode) { // Maps a strategy code to its display label; any unlisted code gets the paragraph-length label.
+ if (RagStrategyCodes.MARKDOWN_SECTION.equals(strategyCode)) {
+ return "Markdown 标题拆分";
+ }
+ if (RagStrategyCodes.OUTLINE_SECTION.equals(strategyCode)) {
+ return "章节标题拆分";
+ }
+ if (RagStrategyCodes.QA_PAIR.equals(strategyCode)) {
+ return "问答对拆分";
+ }
+ if (RagStrategyCodes.CUSTOM_REGEX.equals(strategyCode)) {
+ return "自定义正则拆分";
+ }
+ if (RagStrategyCodes.AUTO.equals(strategyCode)) {
+ return "自动推荐";
+ }
+ return "自然段长度拆分";
+ }
+
+ private List buildReasons(Map features, String strategyCode, double confidence) { // Builds two explanation strings for the chosen strategy.
+ List reasons = new ArrayList();
+ if (RagStrategyCodes.MARKDOWN_SECTION.equals(strategyCode)) {
+ reasons.add("检测到 Markdown 标题结构,适合按标题层级拆分");
+ reasons.add("标题层级数:" + number(features.get("markdownLevelVariety")) + ",标题数量:" + number(features.get("markdownHeadingCount"))); // counts are concatenated as doubles, e.g. "3.0"
+ return reasons;
+ }
+ if (RagStrategyCodes.OUTLINE_SECTION.equals(strategyCode)) {
+ reasons.add("检测到中英文标题/章节编号,适合按章节拆分");
+ reasons.add("章节标题数量:" + number(features.get("outlineHeadingCount")));
+ return reasons;
+ }
+ if (RagStrategyCodes.QA_PAIR.equals(strategyCode)) {
+ reasons.add("检测到问答结构,适合按一问一答拆分");
+ reasons.add("问题数量:" + number(features.get("qaQuestionCount")) + ",成对问答数量:" + number(features.get("pairedQaCount")));
+ return reasons;
+ }
+ reasons.add("结构特征不够集中,回退为自然段长度拆分"); // used for every other code, including PARAGRAPH_LENGTH when it ranked first on its own
+ reasons.add("推荐置信度:" + scale(confidence));
+ return reasons;
+ }
+
+ private double computeConfidence(double bestScore, double secondScore) { // Combines the absolute best score and its lead over the runner-up into a value in [0.25, 1].
+ double delta = Math.max(0D, bestScore - secondScore); // lead over the runner-up, never negative
+ double base = Math.min(1D, bestScore / 100D); // best score relative to 100, capped at 1
+ return Math.min(1D, Math.max(0.25D, base * 0.6D + Math.min(0.4D, delta / 50D))); // base contributes up to 0.6, lead up to 0.4
+ }
+
+ private double number(Object value) { // Returns value as a double when it is a Number, otherwise 0 (also for null).
+ if (value instanceof Number) {
+ return ((Number) value).doubleValue();
+ }
+ return 0D;
+ }
+
+ private String safeLowercase(String value) { // Null-safe, locale-independent lower-casing; null becomes "".
+ return value == null ? "" : value.toLowerCase(Locale.ROOT);
+ }
+
+ private double scale(double value) { // Rounds to two decimals using HALF_UP.
+ return BigDecimal.valueOf(value).setScale(2, RoundingMode.HALF_UP).doubleValue();
+ }
+}
diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java b/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java
new file mode 100644
index 0000000..b5f0e5a
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java
@@ -0,0 +1,80 @@
+package com.easyagents.rag.ingestion;
+
+import com.easyagents.rag.core.RagChunk;
+import com.easyagents.rag.core.RagChunkTypes;
+import com.easyagents.rag.core.RagStrategyCodes;
+import com.easyagents.rag.ingestion.analysis.DocumentStructureAnalyzer;
+import com.easyagents.rag.ingestion.chunk.RagSplitStrategyRegistry;
+import com.easyagents.rag.ingestion.model.AnalysisResult;
+import com.easyagents.rag.ingestion.model.StrategyConfig;
+import com.easyagents.rag.ingestion.recommend.SplitStrategyRecommender;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.List;
+
+ public class RagIngestionPipelineTest { // JUnit 4 tests that run analyzer -> recommender (-> registry split) on small inline documents.
+
+ private final DocumentStructureAnalyzer analyzer = new DocumentStructureAnalyzer();
+ private final SplitStrategyRecommender recommender = new SplitStrategyRecommender();
+ private final RagSplitStrategyRegistry registry = new RagSplitStrategyRegistry();
+
+ @Test
+ public void shouldRecommendMarkdownStrategy() { // a Markdown document with three headings should be recommended MARKDOWN_SECTION
+ String markdown = "# Quick Start\n"
+ + "Welcome\n\n"
+ + "## Install\n"
+ + "Run npm install\n\n"
+ + "## Usage\n"
+ + "Run pnpm dev";
+
+ AnalysisResult analysis = recommender.recommend(analyzer.analyze(markdown, "md"));
+
+ Assert.assertEquals(RagStrategyCodes.MARKDOWN_SECTION, analysis.getRecommendedStrategyCode());
+ Assert.assertTrue(analysis.getConfidence().doubleValue() > 0.4D);
+ }
+
+ @Test
+ public void shouldRecommendQaStrategyForEnglishAndChinese() { // mixed "Q:/A:" and "问:/答:" pairs should be recommended QA_PAIR
+ String qa = "Q: How to reset password?\n"
+ + "A: Open admin page and click reset.\n\n"
+ + "问:默认密码是什么?\n"
+ + "答:由系统配置统一决定。";
+
+ AnalysisResult analysis = recommender.recommend(analyzer.analyze(qa, "txt"));
+
+ Assert.assertEquals(RagStrategyCodes.QA_PAIR, analysis.getRecommendedStrategyCode());
+ }
+
+ @Test
+ public void shouldSplitOutlineDocumentByHeadingPath() { // forces OUTLINE_SECTION and checks chunk count, first label and heading-path depth
+ String outline = "第1章 总则\n适用范围说明。\n\n1.1 目标\n定义系统目标。\n\n1.2 范围\n定义系统范围。";
+ AnalysisResult analysis = recommender.recommend(analyzer.analyze(outline, "docx"));
+ StrategyConfig config = StrategyConfig.defaults();
+ config.setStrategyCode(RagStrategyCodes.OUTLINE_SECTION); // explicit strategy instead of the default AUTO
+
+ List chunks = registry.split(analysis, config);
+
+ Assert.assertEquals(3, chunks.size()); // one chunk per heading: chapter, 1.1, 1.2
+ Assert.assertEquals("第1章 总则", chunks.get(0).getSourceLabel());
+ Assert.assertEquals(2, chunks.get(1).getHeadingPath().size()); // second chunk's heading path has two entries
+ }
+
+ @Test
+ public void shouldSplitQaDocumentByPair() { // forces QA_PAIR and checks that each Q/A pair becomes one chunk
+ String qa = "Q: How to reset password?\n"
+ + "A: Open admin page and click reset.\n\n"
+ + "问:默认密码是什么?\n"
+ + "答:由系统配置统一决定。";
+ AnalysisResult analysis = recommender.recommend(analyzer.analyze(qa, "txt"));
+ StrategyConfig config = StrategyConfig.defaults();
+ config.setStrategyCode(RagStrategyCodes.QA_PAIR);
+
+ List chunks = registry.split(analysis, config);
+
+ Assert.assertEquals(2, chunks.size());
+ Assert.assertEquals(RagChunkTypes.QA_PAIR, chunks.get(0).getChunkType());
+ Assert.assertTrue(chunks.get(0).getContent().contains("问题")); // NOTE(review): presumably the splitter prefixes content with a "问题" label, since the English input does not contain it — confirm
+ Assert.assertTrue(chunks.get(1).getAnswer().contains("系统配置"));
+ }
+}
diff --git a/easy-agents-rag/easy-agents-rag-ocr/pom.xml b/easy-agents-rag/easy-agents-rag-ocr/pom.xml
new file mode 100644
index 0000000..c4b861b
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-ocr/pom.xml
@@ -0,0 +1,32 @@
+
+
+ 4.0.0
+
+
+ com.easyagents
+ easy-agents-rag
+ ${revision}
+
+
+ easy-agents-rag-ocr
+ easy-agents-rag-ocr
+
+
+ 8
+ 8
+ UTF-8
+
+
+
+
+ com.easyagents
+ easy-agents-core
+
+
+ com.easyagents
+ easy-agents-rag-core
+
+
+
diff --git a/easy-agents-rag/easy-agents-rag-retrieval/pom.xml b/easy-agents-rag/easy-agents-rag-retrieval/pom.xml
new file mode 100644
index 0000000..d7ebb38
--- /dev/null
+++ b/easy-agents-rag/easy-agents-rag-retrieval/pom.xml
@@ -0,0 +1,36 @@
+
+
+ 4.0.0
+
+
+ com.easyagents
+ easy-agents-rag
+ ${revision}
+
+
+ easy-agents-rag-retrieval
+ easy-agents-rag-retrieval
+
+
+ 8
+ 8
+ UTF-8
+
+
+
+
+ com.easyagents
+ easy-agents-core
+
+
+ com.easyagents
+ easy-agents-rag-core
+
+
+ com.easyagents
+ easy-agents-rag-enhance
+
+
+
diff --git a/easy-agents-rag/pom.xml b/easy-agents-rag/pom.xml
new file mode 100644
index 0000000..fbd265d
--- /dev/null
+++ b/easy-agents-rag/pom.xml
@@ -0,0 +1,24 @@
+
+
+ 4.0.0
+
+
+ com.easyagents
+ easy-agents
+ ${revision}
+
+
+ easy-agents-rag
+ pom
+ easy-agents-rag
+
+
+ easy-agents-rag-core
+ easy-agents-rag-ingestion
+ easy-agents-rag-ocr
+ easy-agents-rag-enhance
+ easy-agents-rag-retrieval
+
+
diff --git a/easy-agents-spring-boot-starter/pom.xml b/easy-agents-spring-boot-starter/pom.xml
index 12f30e9..4a06e63 100644
--- a/easy-agents-spring-boot-starter/pom.xml
+++ b/easy-agents-spring-boot-starter/pom.xml
@@ -51,6 +51,15 @@
easy-agents-bom
+
+ com.easyagents
+ easy-agents-rag-core
+
+
+
+ com.easyagents
+ easy-agents-rag-ingestion
+
org.springframework.boot
diff --git a/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/rag/ingestion/RagIngestionAutoConfiguration.java b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/rag/ingestion/RagIngestionAutoConfiguration.java
new file mode 100644
index 0000000..80bf7ea
--- /dev/null
+++ b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/rag/ingestion/RagIngestionAutoConfiguration.java
@@ -0,0 +1,42 @@
+package com.easyagents.spring.boot.rag.ingestion;
+
+import com.easyagents.rag.ingestion.DefaultRagIngestionService;
+import com.easyagents.rag.ingestion.RagIngestionService;
+import com.easyagents.rag.ingestion.analysis.DocumentStructureAnalyzer;
+import com.easyagents.rag.ingestion.chunk.RagSplitStrategyRegistry;
+import com.easyagents.rag.ingestion.recommend.SplitStrategyRecommender;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+
+ @ConditionalOnClass(RagIngestionService.class) // configuration applies only when RagIngestionService is on the classpath
+ @Configuration(proxyBeanMethods = false) // no CGLIB proxying: collaborators arrive as method parameters instead of inter-bean method calls
+ public class RagIngestionAutoConfiguration { // Spring Boot auto-configuration that registers the RAG ingestion beans.
+
+ @Bean
+ @ConditionalOnMissingBean // skipped when the application already defines a DocumentStructureAnalyzer bean
+ public DocumentStructureAnalyzer documentStructureAnalyzer() {
+ return new DocumentStructureAnalyzer();
+ }
+
+ @Bean
+ @ConditionalOnMissingBean // skipped when the application already defines a SplitStrategyRecommender bean
+ public SplitStrategyRecommender splitStrategyRecommender() {
+ return new SplitStrategyRecommender();
+ }
+
+ @Bean
+ @ConditionalOnMissingBean // skipped when the application already defines a RagSplitStrategyRegistry bean
+ public RagSplitStrategyRegistry ragSplitStrategyRegistry() {
+ return new RagSplitStrategyRegistry();
+ }
+
+ @Bean
+ @ConditionalOnMissingBean // skipped when the application already defines a RagIngestionService bean
+ public RagIngestionService ragIngestionService(DocumentStructureAnalyzer documentStructureAnalyzer, // the three collaborators are injected from the context
+ SplitStrategyRecommender splitStrategyRecommender,
+ RagSplitStrategyRegistry ragSplitStrategyRegistry) {
+ return new DefaultRagIngestionService(documentStructureAnalyzer, splitStrategyRecommender, ragSplitStrategyRegistry);
+ }
+}
diff --git a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring.factories b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring.factories
index 663545c..fa01615 100644
--- a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring.factories
+++ b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring.factories
@@ -1,10 +1,11 @@
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
- com.easyagents.spring.boot.chatModel.chatglm.ChatglmAutoConfiguration,\
- com.easyagents.spring.boot.chatModel.openai.OpenAIAutoConfiguration,\
- com.easyagents.spring.boot.chatModel.qwen.QwenAutoConfiguration,\
- com.easyagents.spring.boot.chatModel.spark.SparkAutoConfiguration,\
+ com.easyagents.spring.boot.llm.openai.OpenAIAutoConfiguration,\
+ com.easyagents.spring.boot.llm.qwen.QwenAutoConfiguration,\
com.easyagents.spring.boot.store.aliyun.AliyunAutoConfiguration,\
com.easyagents.spring.boot.store.qcloud.QCloudStoreAutoConfiguration,\
- com.easyagents.spring.boot.chatModel.ollama.OllamaAutoConfiguration,\
- com.easyagents.spring.boot.chatModel.deepseek.DeepSeekAutoConfiguration,\
- com.easyagents.spring.boot.store.chroma.ChromaAutoConfiguration
+ com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration,\
+ com.easyagents.spring.boot.llm.deepseek.DeepSeekAutoConfiguration,\
+ com.easyagents.spring.boot.store.chroma.ChromaAutoConfiguration,\
+ com.easyagents.spring.boot.store.elasticsearch.ElasticSearchAutoConfiguration,\
+ com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration,\
+ com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration
diff --git a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports
index 102f3d6..bea3e22 100644
--- a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports
+++ b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports
@@ -1,8 +1,10 @@
-com.easyagents.spring.boot.chatModel.chatglm.ChatglmAutoConfiguration
-com.easyagents.spring.boot.chatModel.openai.OpenAIAutoConfiguration
-com.easyagents.spring.boot.chatModel.qwen.QwenAutoConfiguration
-com.easyagents.spring.boot.chatModel.spark.SparkAutoConfiguration
+com.easyagents.spring.boot.llm.openai.OpenAIAutoConfiguration
+com.easyagents.spring.boot.llm.qwen.QwenAutoConfiguration
com.easyagents.spring.boot.store.aliyun.AliyunAutoConfiguration
com.easyagents.spring.boot.store.qcloud.QCloudStoreAutoConfiguration
-com.easyagents.spring.boot.chatModel.ollama.OllamaAutoConfiguration
+com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration
+com.easyagents.spring.boot.llm.deepseek.DeepSeekAutoConfiguration
com.easyagents.spring.boot.store.chroma.ChromaAutoConfiguration
+com.easyagents.spring.boot.store.elasticsearch.ElasticSearchAutoConfiguration
+com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration
+com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration
diff --git a/pom.xml b/pom.xml
index 948387e..a957978 100644
--- a/pom.xml
+++ b/pom.xml
@@ -16,6 +16,7 @@
easy-agents-bom
easy-agents-core
+ easy-agents-rag
easy-agents-chat
easy-agents-store
easy-agents-spring-boot-starter
@@ -118,6 +119,36 @@
${revision}
+
+ com.easyagents
+ easy-agents-rag-core
+ ${revision}
+
+
+
+ com.easyagents
+ easy-agents-rag-ingestion
+ ${revision}
+
+
+
+ com.easyagents
+ easy-agents-rag-ocr
+ ${revision}
+
+
+
+ com.easyagents
+ easy-agents-rag-enhance
+ ${revision}
+
+
+
+ com.easyagents
+ easy-agents-rag-retrieval
+ ${revision}
+
+
com.easyagents
easy-agents-bom