From 941995d1b84a857b18ef119c5912ab38834f00b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=AD=90=E9=BB=98?= <925456043@qq.com> Date: Sun, 29 Mar 2026 17:28:12 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20RAG=E5=88=86=E5=9D=97=E7=AD=96=E7=95=A5?= =?UTF-8?q?=E5=A2=9E=E5=BC=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- easy-agents-bom/pom.xml | 24 ++ easy-agents-rag/TECH-PLAN.md | 102 +++++ easy-agents-rag/easy-agents-rag-core/pom.xml | 28 ++ .../com/easyagents/rag/core/RagChunk.java | 128 ++++++ .../easyagents/rag/core/RagChunkTypes.java | 11 + .../com/easyagents/rag/core/RagDefaults.java | 12 + .../easyagents/rag/core/RagMetadataKeys.java | 20 + .../easyagents/rag/core/RagStrategyCodes.java | 14 + .../rag/core/RagStructureTypes.java | 12 + .../easy-agents-rag-enhance/pom.xml | 36 ++ .../easy-agents-rag-ingestion/pom.xml | 37 ++ .../ingestion/DefaultRagIngestionService.java | 41 ++ .../rag/ingestion/RagIngestionService.java | 16 + .../analysis/DocumentStructureAnalyzer.java | 234 +++++++++++ .../chunk/RagSplitStrategyRegistry.java | 388 ++++++++++++++++++ .../rag/ingestion/model/AnalysisResult.java | 92 +++++ .../ingestion/model/CandidateStrategy.java | 43 ++ .../rag/ingestion/model/StrategyConfig.java | 79 ++++ .../recommend/SplitStrategyRecommender.java | 133 ++++++ .../ingestion/RagIngestionPipelineTest.java | 80 ++++ easy-agents-rag/easy-agents-rag-ocr/pom.xml | 32 ++ .../easy-agents-rag-retrieval/pom.xml | 36 ++ easy-agents-rag/pom.xml | 24 ++ easy-agents-spring-boot-starter/pom.xml | 9 + .../RagIngestionAutoConfiguration.java | 42 ++ .../main/resources/META-INF/spring.factories | 15 +- ...ot.autoconfigure.AutoConfiguration.imports | 12 +- pom.xml | 31 ++ 28 files changed, 1719 insertions(+), 12 deletions(-) create mode 100644 easy-agents-rag/TECH-PLAN.md create mode 100644 easy-agents-rag/easy-agents-rag-core/pom.xml create mode 100644 
easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunk.java create mode 100644 easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunkTypes.java create mode 100644 easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagDefaults.java create mode 100644 easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java create mode 100644 easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStrategyCodes.java create mode 100644 easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStructureTypes.java create mode 100644 easy-agents-rag/easy-agents-rag-enhance/pom.xml create mode 100644 easy-agents-rag/easy-agents-rag-ingestion/pom.xml create mode 100644 easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/DefaultRagIngestionService.java create mode 100644 easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/RagIngestionService.java create mode 100644 easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/analysis/DocumentStructureAnalyzer.java create mode 100644 easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java create mode 100644 easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/AnalysisResult.java create mode 100644 easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/CandidateStrategy.java create mode 100644 easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/StrategyConfig.java create mode 100644 easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/recommend/SplitStrategyRecommender.java create mode 100644 
easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java create mode 100644 easy-agents-rag/easy-agents-rag-ocr/pom.xml create mode 100644 easy-agents-rag/easy-agents-rag-retrieval/pom.xml create mode 100644 easy-agents-rag/pom.xml create mode 100644 easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/rag/ingestion/RagIngestionAutoConfiguration.java diff --git a/easy-agents-bom/pom.xml b/easy-agents-bom/pom.xml index 6d6f348..99b7079 100644 --- a/easy-agents-bom/pom.xml +++ b/easy-agents-bom/pom.xml @@ -56,6 +56,30 @@ + + com.easyagents + easy-agents-rag-core + + + + com.easyagents + easy-agents-rag-ingestion + + + + com.easyagents + easy-agents-rag-ocr + + + + com.easyagents + easy-agents-rag-enhance + + + + com.easyagents + easy-agents-rag-retrieval + diff --git a/easy-agents-rag/TECH-PLAN.md b/easy-agents-rag/TECH-PLAN.md new file mode 100644 index 0000000..edc6e84 --- /dev/null +++ b/easy-agents-rag/TECH-PLAN.md @@ -0,0 +1,102 @@ +# easy-agents-rag 技术规划 + +## 目标 + +`easy-agents-rag` 用于承载 Easy-Agents 的 RAG 领域能力,逐步将知识入库、文档预处理、分块、索引增强、检索增强等能力从业务工程中抽离,形成可复用的框架层模块。 + +当前阶段先完成模块骨架建设,并优先承接本次文档导入链路中的预处理与分块能力迁移。 + +## 模块规划 + +### `easy-agents-rag-core` + +定位:RAG 域共享契约层。 + +负责内容: +- 通用常量与元数据 key +- 结构类型、策略类型、chunk 类型定义 +- 少量稳定共享模型与接口 + +不负责内容: +- 具体 OCR 实现 +- 具体分块实现 +- 具体召回编排 + +### `easy-agents-rag-ingestion` + +定位:入库前处理链路。 + +负责内容: +- 文本标准化与清洗 +- 文档结构分析 +- 拆分策略推荐 +- 文档分块与 chunk 元信息补全 +- 入库前质量控制 + +当前迁移优先承接: +- 文档结构分析 +- 章节/问答/段落分块 +- 自动推荐拆分策略 + +### `easy-agents-rag-ocr` + +定位:OCR 与版面恢复能力。 + +负责内容: +- 图片/PDF OCR +- 页面版面解析 +- 标题、段落、表格等结构恢复 +- PDF 到结构化文本或 Markdown 的转换 + +### `easy-agents-rag-enhance` + +定位:索引前增强能力。 + +负责内容: +- 图增强 +- RAPTOR +- parent-child chunk +- window chunk +- 摘要、关键词、标签等增强信息生成 +- 索引前的知识单元增强 + +### `easy-agents-rag-retrieval` + +定位:查询侧增强与召回编排。 + +负责内容: +- query rewrite / expansion +- hybrid recall 编排 +- metadata filter 策略 +- graph recall +- rerank 编排 +- chunk merge / window 
expand / context assemble + +## 当前迁移范围 + +本次优先迁移到 `easy-agents-rag-ingestion` 的能力: +- 文档结构分析 +- 拆分策略推荐 +- 标题型 / QA 型 / 段落型分块 + +本次不迁移的能力: +- 业务侧预览会话 +- 控制器与接口 DTO +- 业务库持久化 +- 前端导入页面 + +这些能力继续留在业务工程,由业务层依赖 `easy-agents-rag` 提供的能力完成编排。 + +## 后续演进 + +后续演进顺序建议如下: + +1. 完成 `rag-ingestion` 首批能力迁移并稳定对外接口 +2. 补充 `rag-ocr`,接入 OCR 与版面恢复 +3. 补充 `rag-enhance`,支持图增强、RAPTOR、索引增强 +4. 补充 `rag-retrieval`,统一查询增强与召回后处理 + +整体原则: +- `easy-agents-core` 保持基础抽象 +- `easy-agents-rag` 聚合 RAG 领域实现 +- 业务工程只保留编排、持久化与产品层逻辑 diff --git a/easy-agents-rag/easy-agents-rag-core/pom.xml b/easy-agents-rag/easy-agents-rag-core/pom.xml new file mode 100644 index 0000000..02187a8 --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-core/pom.xml @@ -0,0 +1,28 @@ + + + 4.0.0 + + + com.easyagents + easy-agents-rag + ${revision} + + + easy-agents-rag-core + easy-agents-rag-core + + + 8 + 8 + UTF-8 + + + + + com.easyagents + easy-agents-core + + + diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunk.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunk.java new file mode 100644 index 0000000..2621e44 --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunk.java @@ -0,0 +1,128 @@ +package com.easyagents.rag.core; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +public class RagChunk implements Serializable { + + private String chunkId; + private String chunkType; + private String sourceLabel; + private List headingPath = new ArrayList(); + private String content; + private String question; + private String answer; + private Integer charCount; + private Integer tokenEstimate; + private Integer partNo = 1; + private Integer partTotal = 1; + private List warnings = new ArrayList(); + private Map options = new LinkedHashMap(); + + public String getChunkId() { + return chunkId; + } + + public void 
setChunkId(String chunkId) { + this.chunkId = chunkId; + } + + public String getChunkType() { + return chunkType; + } + + public void setChunkType(String chunkType) { + this.chunkType = chunkType; + } + + public String getSourceLabel() { + return sourceLabel; + } + + public void setSourceLabel(String sourceLabel) { + this.sourceLabel = sourceLabel; + } + + public List getHeadingPath() { + return headingPath; + } + + public void setHeadingPath(List headingPath) { + this.headingPath = headingPath; + } + + public String getContent() { + return content; + } + + public void setContent(String content) { + this.content = content; + } + + public String getQuestion() { + return question; + } + + public void setQuestion(String question) { + this.question = question; + } + + public String getAnswer() { + return answer; + } + + public void setAnswer(String answer) { + this.answer = answer; + } + + public Integer getCharCount() { + return charCount; + } + + public void setCharCount(Integer charCount) { + this.charCount = charCount; + } + + public Integer getTokenEstimate() { + return tokenEstimate; + } + + public void setTokenEstimate(Integer tokenEstimate) { + this.tokenEstimate = tokenEstimate; + } + + public Integer getPartNo() { + return partNo; + } + + public void setPartNo(Integer partNo) { + this.partNo = partNo; + } + + public Integer getPartTotal() { + return partTotal; + } + + public void setPartTotal(Integer partTotal) { + this.partTotal = partTotal; + } + + public List getWarnings() { + return warnings; + } + + public void setWarnings(List warnings) { + this.warnings = warnings; + } + + public Map getOptions() { + return options; + } + + public void setOptions(Map options) { + this.options = options; + } +} diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunkTypes.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunkTypes.java new file mode 100644 index 0000000..39a60e6 --- /dev/null 
+++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagChunkTypes.java @@ -0,0 +1,11 @@ +package com.easyagents.rag.core; + +public final class RagChunkTypes { + + private RagChunkTypes() { + } + + public static final String SECTION = "section"; + public static final String QA_PAIR = "qa_pair"; + public static final String PARAGRAPH = "paragraph"; +} diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagDefaults.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagDefaults.java new file mode 100644 index 0000000..a78c377 --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagDefaults.java @@ -0,0 +1,12 @@ +package com.easyagents.rag.core; + +public final class RagDefaults { + + private RagDefaults() { + } + + public static final int CHUNK_SIZE = 512; + public static final int OVERLAP_SIZE = 128; + public static final int MD_SPLITTER_LEVEL = 2; + public static final int ROWS_PER_CHUNK = 1; +} diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java new file mode 100644 index 0000000..db86c63 --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java @@ -0,0 +1,20 @@ +package com.easyagents.rag.core; + +public final class RagMetadataKeys { + + private RagMetadataKeys() { + } + + public static final String CHUNK_TYPE = "chunkType"; + public static final String SOURCE_LABEL = "sourceLabel"; + public static final String HEADING_PATH = "headingPath"; + public static final String PAGE_NO = "pageNo"; + public static final String CHAR_COUNT = "charCount"; + public static final String TOKEN_ESTIMATE = "tokenEstimate"; + public static final String QA_QUESTION = "qaQuestion"; + public static final String QA_ANSWER = "qaAnswer"; + 
public static final String QA_GROUP_ID = "qaGroupId"; + public static final String PART_NO = "partNo"; + public static final String PART_TOTAL = "partTotal"; + public static final String WARNINGS = "warnings"; +} diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStrategyCodes.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStrategyCodes.java new file mode 100644 index 0000000..aac676f --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStrategyCodes.java @@ -0,0 +1,14 @@ +package com.easyagents.rag.core; + +public final class RagStrategyCodes { + + private RagStrategyCodes() { + } + + public static final String AUTO = "AUTO"; + public static final String MARKDOWN_SECTION = "MARKDOWN_SECTION"; + public static final String OUTLINE_SECTION = "OUTLINE_SECTION"; + public static final String QA_PAIR = "QA_PAIR"; + public static final String PARAGRAPH_LENGTH = "PARAGRAPH_LENGTH"; + public static final String CUSTOM_REGEX = "CUSTOM_REGEX"; +} diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStructureTypes.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStructureTypes.java new file mode 100644 index 0000000..eaba9b0 --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagStructureTypes.java @@ -0,0 +1,12 @@ +package com.easyagents.rag.core; + +public final class RagStructureTypes { + + private RagStructureTypes() { + } + + public static final String MARKDOWN_HEADING = "markdown_heading"; + public static final String OUTLINE_SECTION = "outline_section"; + public static final String QA_PAIR = "qa_pair"; + public static final String PLAIN_PARAGRAPH = "plain_paragraph"; +} diff --git a/easy-agents-rag/easy-agents-rag-enhance/pom.xml b/easy-agents-rag/easy-agents-rag-enhance/pom.xml new file mode 100644 index 0000000..3112175 --- 
/dev/null +++ b/easy-agents-rag/easy-agents-rag-enhance/pom.xml @@ -0,0 +1,36 @@ + + + 4.0.0 + + + com.easyagents + easy-agents-rag + ${revision} + + + easy-agents-rag-enhance + easy-agents-rag-enhance + + + 8 + 8 + UTF-8 + + + + + com.easyagents + easy-agents-core + + + com.easyagents + easy-agents-rag-core + + + com.easyagents + easy-agents-rag-ingestion + + + diff --git a/easy-agents-rag/easy-agents-rag-ingestion/pom.xml b/easy-agents-rag/easy-agents-rag-ingestion/pom.xml new file mode 100644 index 0000000..2427b5e --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-ingestion/pom.xml @@ -0,0 +1,37 @@ + + + 4.0.0 + + + com.easyagents + easy-agents-rag + ${revision} + + + easy-agents-rag-ingestion + easy-agents-rag-ingestion + + + 8 + 8 + UTF-8 + + + + + com.easyagents + easy-agents-core + + + com.easyagents + easy-agents-rag-core + + + junit + junit + test + + + diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/DefaultRagIngestionService.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/DefaultRagIngestionService.java new file mode 100644 index 0000000..bf1ab88 --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/DefaultRagIngestionService.java @@ -0,0 +1,41 @@ +package com.easyagents.rag.ingestion; + +import com.easyagents.rag.core.RagChunk; +import com.easyagents.rag.ingestion.analysis.DocumentStructureAnalyzer; +import com.easyagents.rag.ingestion.chunk.RagSplitStrategyRegistry; +import com.easyagents.rag.ingestion.model.AnalysisResult; +import com.easyagents.rag.ingestion.model.StrategyConfig; +import com.easyagents.rag.ingestion.recommend.SplitStrategyRecommender; + +import java.util.List; + +public class DefaultRagIngestionService implements RagIngestionService { + + private final DocumentStructureAnalyzer documentStructureAnalyzer; + private final SplitStrategyRecommender splitStrategyRecommender; + private final 
RagSplitStrategyRegistry ragSplitStrategyRegistry; + + public DefaultRagIngestionService(DocumentStructureAnalyzer documentStructureAnalyzer, + SplitStrategyRecommender splitStrategyRecommender, + RagSplitStrategyRegistry ragSplitStrategyRegistry) { + this.documentStructureAnalyzer = documentStructureAnalyzer; + this.splitStrategyRecommender = splitStrategyRecommender; + this.ragSplitStrategyRegistry = ragSplitStrategyRegistry; + } + + @Override + public AnalysisResult analyze(String rawContent, String sourceFormat) { + AnalysisResult result = documentStructureAnalyzer.analyze(rawContent, sourceFormat); + return splitStrategyRecommender.recommend(result); + } + + @Override + public List split(AnalysisResult analysis, StrategyConfig config) { + return ragSplitStrategyRegistry.split(analysis, config); + } + + @Override + public String toStrategyLabel(String strategyCode) { + return splitStrategyRecommender.toStrategyLabel(strategyCode); + } +} diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/RagIngestionService.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/RagIngestionService.java new file mode 100644 index 0000000..6efe60a --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/RagIngestionService.java @@ -0,0 +1,16 @@ +package com.easyagents.rag.ingestion; + +import com.easyagents.rag.core.RagChunk; +import com.easyagents.rag.ingestion.model.AnalysisResult; +import com.easyagents.rag.ingestion.model.StrategyConfig; + +import java.util.List; + +public interface RagIngestionService { + + AnalysisResult analyze(String rawContent, String sourceFormat); + + List split(AnalysisResult analysis, StrategyConfig config); + + String toStrategyLabel(String strategyCode); +} diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/analysis/DocumentStructureAnalyzer.java 
b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/analysis/DocumentStructureAnalyzer.java new file mode 100644 index 0000000..3baa447 --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/analysis/DocumentStructureAnalyzer.java @@ -0,0 +1,234 @@ +package com.easyagents.rag.ingestion.analysis; + +import com.easyagents.core.util.StringUtil; +import com.easyagents.rag.ingestion.model.AnalysisResult; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class DocumentStructureAnalyzer { + + private static final Pattern MARKDOWN_HEADING = Pattern.compile("^#{1,6}\\s+\\S+.*$"); + private static final Pattern CHINESE_CHAPTER = Pattern.compile("^第[一二三四五六七八九十百零两0-9]+[章节篇部分卷]\\s*.*$"); + private static final Pattern CHINESE_SECTION = Pattern.compile("^[一二三四五六七八九十百零]+[、..]\\s*\\S+.*$"); + private static final Pattern CHINESE_SUBSECTION = Pattern.compile("^[((][一二三四五六七八九十百零0-9]+[))]\\s*\\S+.*$"); + private static final Pattern NUMERIC_SECTION = Pattern.compile("^[0-9]+(\\.[0-9]+){0,4}\\s+\\S+.*$"); + private static final Pattern ENGLISH_SECTION = Pattern.compile("^(Chapter|Section|Part)\\s+[0-9IVXLC]+([.:\\-\\s].*)?$", Pattern.CASE_INSENSITIVE); + private static final Pattern ENGLISH_ROMAN = Pattern.compile("^[IVXLC]+[.、)\\s-]+\\S+.*$"); + private static final Pattern QUESTION_LINE = Pattern.compile("^(Q|QUESTION|问|问题|FAQ\\s*[0-9]+)\\s*[.::-]\\s*.+$", Pattern.CASE_INSENSITIVE); + private static final Pattern ANSWER_LINE = Pattern.compile("^(A|ANSWER|答|答案)\\s*[.::-]\\s*.+$", Pattern.CASE_INSENSITIVE); + private static final Pattern PAGE_NUMBER = Pattern.compile("^(第?\\s*\\d+\\s*页|page\\s+\\d+|\\d+)\\s*$", Pattern.CASE_INSENSITIVE); + private static final Pattern TOC_LINE = Pattern.compile("^.{2,80}[.·•…]{2,}\\s*\\d+\\s*$"); + + public AnalysisResult analyze(String rawContent, String sourceFormat) { + String normalizedContent = 
normalize(rawContent); + List lines = toLines(normalizedContent); + + int markdownHeadingCount = 0; + int outlineHeadingCount = 0; + int qaQuestionCount = 0; + int qaAnswerCount = 0; + int pairedQaCount = 0; + int tocLineCount = 0; + int shortLineCount = 0; + Set markdownLevels = new HashSet(); + + for (int i = 0; i < lines.size(); i++) { + String line = lines.get(i); + if (MARKDOWN_HEADING.matcher(line).matches()) { + markdownHeadingCount++; + markdownLevels.add(Integer.valueOf(countMarkdownLevel(line))); + } + if (isOutlineHeading(line)) { + outlineHeadingCount++; + } + if (QUESTION_LINE.matcher(line).matches()) { + qaQuestionCount++; + if (hasAnswerNearby(lines, i)) { + pairedQaCount++; + } + } + if (ANSWER_LINE.matcher(line).matches()) { + qaAnswerCount++; + } + if (TOC_LINE.matcher(line).matches()) { + tocLineCount++; + } + if (line.length() <= 20) { + shortLineCount++; + } + } + + int paragraphCount = 0; + int longParagraphCount = 0; + List paragraphs = splitParagraphs(normalizedContent); + for (String paragraph : paragraphs) { + if (StringUtil.hasText(paragraph)) { + paragraphCount++; + if (paragraph.length() > 800) { + longParagraphCount++; + } + } + } + + Map features = new LinkedHashMap(); + features.put("lineCount", Integer.valueOf(lines.size())); + features.put("paragraphCount", Integer.valueOf(paragraphCount)); + features.put("markdownHeadingCount", Integer.valueOf(markdownHeadingCount)); + features.put("markdownLevelVariety", Integer.valueOf(markdownLevels.size())); + features.put("outlineHeadingCount", Integer.valueOf(outlineHeadingCount)); + features.put("qaQuestionCount", Integer.valueOf(qaQuestionCount)); + features.put("qaAnswerCount", Integer.valueOf(qaAnswerCount)); + features.put("pairedQaCount", Integer.valueOf(pairedQaCount)); + features.put("tocLineCount", Integer.valueOf(tocLineCount)); + features.put("shortLineRatio", lines.isEmpty() ? 
Double.valueOf(0D) : Double.valueOf((double) shortLineCount / (double) lines.size())); + features.put("longParagraphCount", Integer.valueOf(longParagraphCount)); + + AnalysisResult result = new AnalysisResult(); + result.setSourceFormat(sourceFormat == null ? "" : sourceFormat.toLowerCase(Locale.ROOT)); + result.setNormalizedContent(normalizedContent); + result.setFeatures(features); + return result; + } + + private String normalize(String rawContent) { + if (!StringUtil.hasText(rawContent)) { + return ""; + } + + String content = rawContent + .replace("\uFEFF", "") + .replace("\u200B", "") + .replace("\r\n", "\n") + .replace('\r', '\n') + .replace('\u00A0', ' ') + .replace(':', ':'); + + List originalLines = toLines(content); + Map lineCounts = new HashMap(); + for (String line : originalLines) { + if (line.length() >= 4 && line.length() <= 60) { + Integer count = lineCounts.get(line); + lineCounts.put(line, count == null ? Integer.valueOf(1) : Integer.valueOf(count.intValue() + 1)); + } + } + + List filteredLines = new ArrayList(); + for (String line : originalLines) { + if (!StringUtil.hasText(line)) { + filteredLines.add(""); + continue; + } + if (PAGE_NUMBER.matcher(line).matches()) { + continue; + } + Integer repeated = lineCounts.get(line); + if (repeated != null && repeated.intValue() >= 3 && line.length() <= 40) { + continue; + } + filteredLines.add(line); + } + + List mergedLines = new ArrayList(); + for (String line : filteredLines) { + if (mergedLines.isEmpty()) { + mergedLines.add(line); + continue; + } + String previous = mergedLines.get(mergedLines.size() - 1); + if (!StringUtil.hasText(previous) || !StringUtil.hasText(line)) { + mergedLines.add(line); + continue; + } + if (shouldMerge(previous, line)) { + mergedLines.set(mergedLines.size() - 1, previous + joinToken(previous, line) + line); + } else { + mergedLines.add(line); + } + } + + return String.join("\n", mergedLines) + .replaceAll("[ \\t]{2,}", " ") + .replaceAll("\\n{3,}", "\n\n") + .trim(); 
+ } + + private boolean shouldMerge(String previous, String current) { + if (isHeading(previous) || isHeading(current)) { + return false; + } + if (QUESTION_LINE.matcher(current).matches() || ANSWER_LINE.matcher(current).matches()) { + return false; + } + if (TOC_LINE.matcher(previous).matches() || TOC_LINE.matcher(current).matches()) { + return false; + } + char previousChar = previous.charAt(previous.length() - 1); + if ("。!?.!?:;:;".indexOf(previousChar) >= 0) { + return false; + } + return current.length() < 80; + } + + private String joinToken(String previous, String current) { + char last = previous.charAt(previous.length() - 1); + char first = current.charAt(0); + if (Character.isLetterOrDigit(last) && Character.isLetterOrDigit(first)) { + return " "; + } + return ""; + } + + private boolean hasAnswerNearby(List lines, int index) { + int end = Math.min(lines.size(), index + 4); + for (int i = index + 1; i < end; i++) { + if (ANSWER_LINE.matcher(lines.get(i)).matches()) { + return true; + } + } + return false; + } + + private boolean isHeading(String line) { + return MARKDOWN_HEADING.matcher(line).matches() || isOutlineHeading(line); + } + + private boolean isOutlineHeading(String line) { + return CHINESE_CHAPTER.matcher(line).matches() + || CHINESE_SECTION.matcher(line).matches() + || CHINESE_SUBSECTION.matcher(line).matches() + || NUMERIC_SECTION.matcher(line).matches() + || ENGLISH_SECTION.matcher(line).matches() + || ENGLISH_ROMAN.matcher(line).matches(); + } + + private int countMarkdownLevel(String line) { + Matcher matcher = Pattern.compile("^(#{1,6})\\s+").matcher(line); + if (!matcher.find()) { + return 0; + } + return matcher.group(1).length(); + } + + private List splitParagraphs(String normalizedContent) { + String[] parts = normalizedContent.split("\\n\\s*\\n"); + List paragraphs = new ArrayList(); + for (String part : parts) { + String paragraph = part.trim(); + if (StringUtil.hasText(paragraph)) { + paragraphs.add(paragraph); + } + } + return 
paragraphs; + } + + private List toLines(String content) { + String[] rawLines = content.split("\\n"); + List lines = new ArrayList(rawLines.length); + for (String rawLine : rawLines) { + lines.add(rawLine == null ? "" : rawLine.trim()); + } + return lines; + } +} diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java new file mode 100644 index 0000000..0be40ac --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java @@ -0,0 +1,388 @@ +package com.easyagents.rag.ingestion.chunk; + +import com.easyagents.core.document.Document; +import com.easyagents.core.document.DocumentSplitter; +import com.easyagents.core.document.splitter.RegexDocumentSplitter; +import com.easyagents.core.document.splitter.SimpleDocumentSplitter; +import com.easyagents.core.util.StringUtil; +import com.easyagents.rag.core.*; +import com.easyagents.rag.ingestion.model.AnalysisResult; +import com.easyagents.rag.ingestion.model.StrategyConfig; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class RagSplitStrategyRegistry { + + private static final Pattern MARKDOWN_HEADING = Pattern.compile("^(#{1,6})\\s+(.*)$"); + private static final Pattern QUESTION_PREFIX = Pattern.compile("^(Q|QUESTION|问|问题|FAQ\\s*[0-9]+)\\s*[.::-]\\s*(.+)$", Pattern.CASE_INSENSITIVE); + private static final Pattern ANSWER_PREFIX = Pattern.compile("^(A|ANSWER|答|答案)\\s*[.::-]\\s*(.+)$", Pattern.CASE_INSENSITIVE); + + public List split(AnalysisResult analysisResult, StrategyConfig strategyConfig) { + String strategyCode = strategyConfig.getStrategyCode(); + if (!StringUtil.hasText(strategyCode) || RagStrategyCodes.AUTO.equals(strategyCode)) { + strategyCode = 
analysisResult.getRecommendedStrategyCode(); + } + String normalizedContent = analysisResult.getNormalizedContent(); + if (RagStrategyCodes.MARKDOWN_SECTION.equals(strategyCode)) { + return buildMarkdownChunks(normalizedContent, strategyConfig); + } + if (RagStrategyCodes.OUTLINE_SECTION.equals(strategyCode)) { + return buildOutlineChunks(normalizedContent, strategyConfig); + } + if (RagStrategyCodes.QA_PAIR.equals(strategyCode)) { + return buildQaChunks(normalizedContent, strategyConfig); + } + if (RagStrategyCodes.CUSTOM_REGEX.equals(strategyCode)) { + return buildRegexChunks(normalizedContent, strategyConfig); + } + return buildParagraphChunks(normalizedContent, strategyConfig); + } + + private List buildMarkdownChunks(String content, StrategyConfig strategyConfig) { + List lines = Arrays.asList(content.split("\\n")); + List sections = new ArrayList(); + Deque stack = new ArrayDeque(); + SectionChunk current = null; + for (String rawLine : lines) { + String line = rawLine.trim(); + Matcher matcher = MARKDOWN_HEADING.matcher(line); + if (matcher.matches()) { + if (current != null) { + sections.add(current); + } + int level = matcher.group(1).length(); + while (!stack.isEmpty() && stack.peekLast().level >= level) { + stack.removeLast(); + } + stack.addLast(new HeadingLevel(level, matcher.group(2).trim())); + current = new SectionChunk(copyPath(stack), matcher.group(2).trim()); + current.lines.add(line); + } else { + if (current == null) { + current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落"); + } + current.lines.add(rawLine); + } + } + if (current != null) { + sections.add(current); + } + return finalizeSectionChunks(sections, strategyConfig); + } + + private List buildOutlineChunks(String content, StrategyConfig strategyConfig) { + List lines = Arrays.asList(content.split("\\n")); + List sections = new ArrayList(); + Deque stack = new ArrayDeque(); + SectionChunk current = null; + for (String rawLine : lines) { + String line = rawLine.trim(); 
+ OutlineHeading heading = OutlineHeading.parse(line); + if (heading != null) { + if (current != null) { + sections.add(current); + } + while (!stack.isEmpty() && stack.peekLast().level >= heading.level) { + stack.removeLast(); + } + stack.addLast(new HeadingLevel(heading.level, heading.title)); + current = new SectionChunk(copyPath(stack), heading.title); + current.lines.add(line); + } else { + if (current == null) { + current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落"); + } + current.lines.add(rawLine); + } + } + if (current != null) { + sections.add(current); + } + return finalizeSectionChunks(sections, strategyConfig); + } + + private List finalizeSectionChunks(List sections, StrategyConfig strategyConfig) { + List result = new ArrayList(); + int index = 1; + for (SectionChunk section : sections) { + String content = joinAndTrim(section.lines); + if (!StringUtil.hasText(content) || content.equals(section.sourceLabel)) { + continue; + } + if (content.length() <= safeChunkSize(strategyConfig)) { + result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, content, index++, 1, 1)); + continue; + } + List subContents = splitLongContent(content, strategyConfig.getChunkSize()); + int total = subContents.size(); + for (int i = 0; i < subContents.size(); i++) { + result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, subContents.get(i), index++, i + 1, total)); + } + } + return postProcess(result); + } + + private List buildQaChunks(String content, StrategyConfig strategyConfig) { + List result = new ArrayList(); + String currentQuestion = null; + StringBuilder answerBuilder = new StringBuilder(); + StringBuilder questionBuilder = new StringBuilder(); + int qaIndex = 1; + + for (String rawLine : content.split("\\n")) { + String line = rawLine.trim(); + if (!StringUtil.hasText(line)) { + continue; + } + Matcher questionMatcher = QUESTION_PREFIX.matcher(line); + Matcher answerMatcher = 
ANSWER_PREFIX.matcher(line); + if (questionMatcher.matches()) { + qaIndex = flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig); + currentQuestion = questionMatcher.group(2).trim(); + questionBuilder = new StringBuilder(currentQuestion); + answerBuilder = new StringBuilder(); + continue; + } + if (answerMatcher.matches()) { + if (answerBuilder.length() > 0) { + answerBuilder.append('\n'); + } + answerBuilder.append(answerMatcher.group(2).trim()); + continue; + } + if (answerBuilder.length() > 0) { + answerBuilder.append('\n').append(rawLine.trim()); + } else if (questionBuilder.length() > 0) { + questionBuilder.append('\n').append(rawLine.trim()); + } + } + flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig); + return postProcess(result); + } + + private int flushQaChunk(List result, + String currentQuestion, + StringBuilder questionBuilder, + StringBuilder answerBuilder, + int qaIndex, + StrategyConfig strategyConfig) { + if (!StringUtil.hasText(currentQuestion)) { + return qaIndex; + } + if (!StringUtil.hasText(answerBuilder.toString())) { + return qaIndex; + } + String question = questionBuilder.toString().trim(); + String answer = answerBuilder.toString().trim(); + String baseContent = "问题:" + question + "\n答案:" + answer; + List subContents = baseContent.length() > safeChunkSize(strategyConfig) + ? 
splitLongContent(baseContent, strategyConfig.getChunkSize()) + : Collections.singletonList(baseContent); + int total = subContents.size(); + for (int i = 0; i < subContents.size(); i++) { + RagChunk chunk = createChunk(RagChunkTypes.QA_PAIR, "Q" + qaIndex + " " + question, Collections.emptyList(), subContents.get(i), result.size() + 1, i + 1, total); + chunk.setQuestion(question); + chunk.setAnswer(answer); + chunk.getOptions().put(RagMetadataKeys.QA_GROUP_ID, "qa-" + qaIndex); + result.add(chunk); + } + return qaIndex + 1; + } + + private List buildParagraphChunks(String content, StrategyConfig strategyConfig) { + List result = new ArrayList(); + DocumentSplitter splitter = new SimpleDocumentSplitter(safeChunkSize(strategyConfig), safeOverlap(strategyConfig)); + List docs = splitter.split(new Document(content)); + int index = 1; + for (Document doc : docs) { + result.add(createChunk(RagChunkTypes.PARAGRAPH, "分块 " + index, Collections.emptyList(), doc.getContent(), index, 1, 1)); + index++; + } + return postProcess(result); + } + + private List buildRegexChunks(String content, StrategyConfig strategyConfig) { + String regex = StringUtil.hasText(strategyConfig.getRegex()) ? strategyConfig.getRegex() : "\\n\\s*\\n"; + DocumentSplitter splitter = new RegexDocumentSplitter(regex); + List docs = splitter.split(new Document(content)); + List result = new ArrayList(); + int index = 1; + for (Document doc : docs) { + result.add(createChunk(RagChunkTypes.PARAGRAPH, "正则分块 " + index, Collections.emptyList(), doc.getContent(), index, 1, 1)); + index++; + } + return postProcess(result); + } + + private List splitLongContent(String content, Integer chunkSize) { + int size = chunkSize == null || chunkSize.intValue() <= 0 ? 
RagDefaults.CHUNK_SIZE : chunkSize.intValue(); + String[] paragraphs = content.split("\\n\\s*\\n"); + List parts = new ArrayList(); + StringBuilder current = new StringBuilder(); + for (String paragraph : paragraphs) { + String text = paragraph.trim(); + if (!StringUtil.hasText(text)) { + continue; + } + if (current.length() > 0 && current.length() + text.length() + 2 > size) { + parts.add(current.toString().trim()); + current = new StringBuilder(); + } + if (current.length() > 0) { + current.append("\n\n"); + } + current.append(text); + } + if (current.length() > 0) { + parts.add(current.toString().trim()); + } + if (parts.isEmpty()) { + parts.add(content); + } + return parts; + } + + private List postProcess(List chunks) { + List result = new ArrayList(); + Set dedup = new HashSet(); + int index = 1; + for (RagChunk chunk : chunks) { + String content = chunk.getContent() == null ? "" : chunk.getContent().trim(); + if (!StringUtil.hasText(content)) { + continue; + } + if (content.length() < 10 && !RagChunkTypes.QA_PAIR.equals(chunk.getChunkType())) { + continue; + } + String dedupKey = content.replaceAll("\\s+", " "); + if (!dedup.add(dedupKey)) { + continue; + } + chunk.setChunkId("chunk-" + index); + chunk.setCharCount(Integer.valueOf(content.length())); + chunk.setTokenEstimate(Integer.valueOf(Math.max(1, content.length() / 4))); + result.add(chunk); + index++; + } + return result; + } + + private RagChunk createChunk(String chunkType, + String sourceLabel, + List headingPath, + String content, + int index, + int partNo, + int partTotal) { + RagChunk chunk = new RagChunk(); + chunk.setChunkId("chunk-" + index); + chunk.setChunkType(chunkType); + chunk.setSourceLabel(sourceLabel); + chunk.setHeadingPath(new ArrayList(headingPath)); + chunk.setContent(content.trim()); + chunk.setPartNo(Integer.valueOf(partNo)); + chunk.setPartTotal(Integer.valueOf(partTotal)); + if (!headingPath.isEmpty()) { + chunk.getOptions().put(RagMetadataKeys.HEADING_PATH, new 
ArrayList(headingPath)); + } + if (RagChunkTypes.SECTION.equals(chunkType)) { + chunk.getOptions().put(RagMetadataKeys.SOURCE_LABEL, sourceLabel); + } + return chunk; + } + + private int safeChunkSize(StrategyConfig strategyConfig) { + Integer chunkSize = strategyConfig.getChunkSize(); + return chunkSize == null || chunkSize.intValue() <= 0 ? RagDefaults.CHUNK_SIZE : chunkSize.intValue(); + } + + private int safeOverlap(StrategyConfig strategyConfig) { + Integer overlapSize = strategyConfig.getOverlapSize(); + return overlapSize == null || overlapSize.intValue() < 0 ? RagDefaults.OVERLAP_SIZE : overlapSize.intValue(); + } + + private String joinAndTrim(List lines) { + String value = String.join("\n", lines).trim(); + return value.replaceAll("\\n{3,}", "\n\n"); + } + + private List copyPath(Deque stack) { + List path = new ArrayList(); + for (HeadingLevel item : stack) { + path.add(0, item.title); + } + return path; + } + + private static class SectionChunk { + private final List headingPath; + private final String sourceLabel; + private final List lines = new ArrayList(); + + private SectionChunk(List headingPath, String sourceLabel) { + this.headingPath = headingPath; + this.sourceLabel = sourceLabel; + } + } + + private static class HeadingLevel { + private final int level; + private final String title; + + private HeadingLevel(int level, String title) { + this.level = level; + this.title = title; + } + } + + private static class OutlineHeading { + private static final Pattern CHINESE_CHAPTER = Pattern.compile("^第[一二三四五六七八九十百零两0-9]+[章节篇部分卷]\\s*(.*)$"); + private static final Pattern CHINESE_SECTION = Pattern.compile("^([一二三四五六七八九十百零]+[、..])\\s*(\\S+.*)$"); + private static final Pattern CHINESE_SUBSECTION = Pattern.compile("^[((]([一二三四五六七八九十百零0-9]+)[))]\\s*(\\S+.*)$"); + private static final Pattern NUMERIC_SECTION = Pattern.compile("^([0-9]+(?:\\.[0-9]+){0,4})\\s+(\\S+.*)$"); + private static final Pattern ENGLISH_SECTION = 
Pattern.compile("^(Chapter|Section|Part)\\s+([0-9IVXLC]+)(.*)$", Pattern.CASE_INSENSITIVE); + private static final Pattern ENGLISH_ROMAN = Pattern.compile("^([IVXLC]+)[.、)\\s-]+(\\S+.*)$"); + + private final int level; + private final String title; + + private OutlineHeading(int level, String title) { + this.level = level; + this.title = title; + } + + private static OutlineHeading parse(String line) { + Matcher matcher = CHINESE_CHAPTER.matcher(line); + if (matcher.matches()) { + return new OutlineHeading(1, line.trim()); + } + matcher = CHINESE_SECTION.matcher(line); + if (matcher.matches()) { + return new OutlineHeading(2, line.trim()); + } + matcher = CHINESE_SUBSECTION.matcher(line); + if (matcher.matches()) { + return new OutlineHeading(3, line.trim()); + } + matcher = NUMERIC_SECTION.matcher(line); + if (matcher.matches()) { + String code = matcher.group(1); + int level = code.split("\\.").length; + return new OutlineHeading(level, line.trim()); + } + matcher = ENGLISH_SECTION.matcher(line); + if (matcher.matches()) { + String prefix = matcher.group(1).toLowerCase(); + int level = "chapter".equals(prefix) ? 1 : ("section".equals(prefix) ? 
2 : 1); + return new OutlineHeading(level, line.trim()); + } + matcher = ENGLISH_ROMAN.matcher(line); + if (matcher.matches()) { + return new OutlineHeading(2, line.trim()); + } + return null; + } + } +} diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/AnalysisResult.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/AnalysisResult.java new file mode 100644 index 0000000..c66928d --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/model/AnalysisResult.java @@ -0,0 +1,92 @@ +package com.easyagents.rag.ingestion.model; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +public class AnalysisResult implements Serializable { + + private String sourceFormat; + private String normalizedContent; + private String recommendedStructureType; + private String recommendedStrategyCode; + private String recommendedStrategyLabel; + private Double confidence; + private List reasons = new ArrayList(); + private List candidateStrategies = new ArrayList(); + private Map features = new LinkedHashMap(); + + public String getSourceFormat() { + return sourceFormat; + } + + public void setSourceFormat(String sourceFormat) { + this.sourceFormat = sourceFormat; + } + + public String getNormalizedContent() { + return normalizedContent; + } + + public void setNormalizedContent(String normalizedContent) { + this.normalizedContent = normalizedContent; + } + + public String getRecommendedStructureType() { + return recommendedStructureType; + } + + public void setRecommendedStructureType(String recommendedStructureType) { + this.recommendedStructureType = recommendedStructureType; + } + + public String getRecommendedStrategyCode() { + return recommendedStrategyCode; + } + + public void setRecommendedStrategyCode(String recommendedStrategyCode) { + 
/**
 * One ranked split-strategy candidate: its code, a display label and the score it received.
 *
 * <p>Plain mutable bean; all properties may be {@code null}.
 */
public class CandidateStrategy implements Serializable {

    // Pinned explicitly so the serialized form does not depend on compiler-generated values.
    private static final long serialVersionUID = 1L;

    private String strategyCode;
    private String strategyLabel;
    private Double score;

    /** Creates an empty candidate; all properties are {@code null}. */
    public CandidateStrategy() {
    }

    /**
     * @param strategyCode  strategy identifier
     * @param strategyLabel human-readable label of the strategy
     * @param score         score assigned to the strategy
     */
    public CandidateStrategy(String strategyCode, String strategyLabel, Double score) {
        this.strategyCode = strategyCode;
        this.strategyLabel = strategyLabel;
        this.score = score;
    }

    /** @return the strategy identifier */
    public String getStrategyCode() {
        return strategyCode;
    }

    /** @param strategyCode the strategy identifier */
    public void setStrategyCode(String strategyCode) {
        this.strategyCode = strategyCode;
    }

    /** @return the human-readable label */
    public String getStrategyLabel() {
        return strategyLabel;
    }

    /** @param strategyLabel the human-readable label */
    public void setStrategyLabel(String strategyLabel) {
        this.strategyLabel = strategyLabel;
    }

    /** @return the score assigned to this strategy */
    public Double getScore() {
        return score;
    }

    /** @param score the score assigned to this strategy */
    public void setScore(Double score) {
        this.score = score;
    }
}
overlapSize; + } + + public void setOverlapSize(Integer overlapSize) { + this.overlapSize = overlapSize; + } + + public String getRegex() { + return regex; + } + + public void setRegex(String regex) { + this.regex = regex; + } + + public Integer getRowsPerChunk() { + return rowsPerChunk; + } + + public void setRowsPerChunk(Integer rowsPerChunk) { + this.rowsPerChunk = rowsPerChunk; + } + + public Integer getMdSplitterLevel() { + return mdSplitterLevel; + } + + public void setMdSplitterLevel(Integer mdSplitterLevel) { + this.mdSplitterLevel = mdSplitterLevel; + } +} diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/recommend/SplitStrategyRecommender.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/recommend/SplitStrategyRecommender.java new file mode 100644 index 0000000..8c70c19 --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/recommend/SplitStrategyRecommender.java @@ -0,0 +1,133 @@ +package com.easyagents.rag.ingestion.recommend; + +import com.easyagents.rag.core.RagStrategyCodes; +import com.easyagents.rag.core.RagStructureTypes; +import com.easyagents.rag.ingestion.model.AnalysisResult; +import com.easyagents.rag.ingestion.model.CandidateStrategy; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.*; + +public class SplitStrategyRecommender { + + public AnalysisResult recommend(AnalysisResult analysisResult) { + Map features = analysisResult.getFeatures(); + String sourceFormat = safeLowercase(analysisResult.getSourceFormat()); + + double markdownScore = number(features.get("markdownHeadingCount")) * 12 + + number(features.get("markdownLevelVariety")) * 8 + + ("md".equals(sourceFormat) ? 20 : 0); + double outlineScore = number(features.get("outlineHeadingCount")) * 10 + + (("pdf".equals(sourceFormat) || "docx".equals(sourceFormat)) ? 
5 : 0) + - number(features.get("tocLineCount")) * 4; + double qaScore = number(features.get("qaQuestionCount")) * 10 + + number(features.get("qaAnswerCount")) * 10 + + number(features.get("pairedQaCount")) * 18; + double plainScore = 18 + + number(features.get("paragraphCount")) * 2 + + number(features.get("longParagraphCount")) * 3; + + Map scoreMap = new LinkedHashMap(); + scoreMap.put(RagStrategyCodes.MARKDOWN_SECTION, Double.valueOf(markdownScore)); + scoreMap.put(RagStrategyCodes.OUTLINE_SECTION, Double.valueOf(outlineScore)); + scoreMap.put(RagStrategyCodes.QA_PAIR, Double.valueOf(qaScore)); + scoreMap.put(RagStrategyCodes.PARAGRAPH_LENGTH, Double.valueOf(plainScore)); + + List> ranking = new ArrayList>(scoreMap.entrySet()); + ranking.sort((left, right) -> Double.compare(right.getValue().doubleValue(), left.getValue().doubleValue())); + + Map.Entry best = ranking.get(0); + Map.Entry second = ranking.size() > 1 ? ranking.get(1) : best; + double confidence = computeConfidence(best.getValue().doubleValue(), second.getValue().doubleValue()); + + String recommendedStrategy = confidence < 0.45D ? 
RagStrategyCodes.PARAGRAPH_LENGTH : best.getKey(); + analysisResult.setRecommendedStrategyCode(recommendedStrategy); + analysisResult.setRecommendedStrategyLabel(toStrategyLabel(recommendedStrategy)); + analysisResult.setRecommendedStructureType(toStructureType(recommendedStrategy)); + analysisResult.setConfidence(Double.valueOf(scale(confidence))); + analysisResult.setReasons(buildReasons(features, recommendedStrategy, confidence)); + + List candidates = new ArrayList(); + for (Map.Entry entry : ranking) { + candidates.add(new CandidateStrategy(entry.getKey(), toStrategyLabel(entry.getKey()), Double.valueOf(scale(entry.getValue().doubleValue())))); + } + analysisResult.setCandidateStrategies(candidates); + return analysisResult; + } + + public String toStructureType(String strategyCode) { + if (RagStrategyCodes.MARKDOWN_SECTION.equals(strategyCode)) { + return RagStructureTypes.MARKDOWN_HEADING; + } + if (RagStrategyCodes.OUTLINE_SECTION.equals(strategyCode)) { + return RagStructureTypes.OUTLINE_SECTION; + } + if (RagStrategyCodes.QA_PAIR.equals(strategyCode)) { + return RagStructureTypes.QA_PAIR; + } + return RagStructureTypes.PLAIN_PARAGRAPH; + } + + public String toStrategyLabel(String strategyCode) { + if (RagStrategyCodes.MARKDOWN_SECTION.equals(strategyCode)) { + return "Markdown 标题拆分"; + } + if (RagStrategyCodes.OUTLINE_SECTION.equals(strategyCode)) { + return "章节标题拆分"; + } + if (RagStrategyCodes.QA_PAIR.equals(strategyCode)) { + return "问答对拆分"; + } + if (RagStrategyCodes.CUSTOM_REGEX.equals(strategyCode)) { + return "自定义正则拆分"; + } + if (RagStrategyCodes.AUTO.equals(strategyCode)) { + return "自动推荐"; + } + return "自然段长度拆分"; + } + + private List buildReasons(Map features, String strategyCode, double confidence) { + List reasons = new ArrayList(); + if (RagStrategyCodes.MARKDOWN_SECTION.equals(strategyCode)) { + reasons.add("检测到 Markdown 标题结构,适合按标题层级拆分"); + reasons.add("标题层级数:" + number(features.get("markdownLevelVariety")) + ",标题数量:" + 
number(features.get("markdownHeadingCount"))); + return reasons; + } + if (RagStrategyCodes.OUTLINE_SECTION.equals(strategyCode)) { + reasons.add("检测到中英文标题/章节编号,适合按章节拆分"); + reasons.add("章节标题数量:" + number(features.get("outlineHeadingCount"))); + return reasons; + } + if (RagStrategyCodes.QA_PAIR.equals(strategyCode)) { + reasons.add("检测到问答结构,适合按一问一答拆分"); + reasons.add("问题数量:" + number(features.get("qaQuestionCount")) + ",成对问答数量:" + number(features.get("pairedQaCount"))); + return reasons; + } + reasons.add("结构特征不够集中,回退为自然段长度拆分"); + reasons.add("推荐置信度:" + scale(confidence)); + return reasons; + } + + private double computeConfidence(double bestScore, double secondScore) { + double delta = Math.max(0D, bestScore - secondScore); + double base = Math.min(1D, bestScore / 100D); + return Math.min(1D, Math.max(0.25D, base * 0.6D + Math.min(0.4D, delta / 50D))); + } + + private double number(Object value) { + if (value instanceof Number) { + return ((Number) value).doubleValue(); + } + return 0D; + } + + private String safeLowercase(String value) { + return value == null ? 
"" : value.toLowerCase(Locale.ROOT); + } + + private double scale(double value) { + return BigDecimal.valueOf(value).setScale(2, RoundingMode.HALF_UP).doubleValue(); + } +} diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java b/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java new file mode 100644 index 0000000..b5f0e5a --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java @@ -0,0 +1,80 @@ +package com.easyagents.rag.ingestion; + +import com.easyagents.rag.core.RagChunk; +import com.easyagents.rag.core.RagChunkTypes; +import com.easyagents.rag.core.RagStrategyCodes; +import com.easyagents.rag.ingestion.analysis.DocumentStructureAnalyzer; +import com.easyagents.rag.ingestion.chunk.RagSplitStrategyRegistry; +import com.easyagents.rag.ingestion.model.AnalysisResult; +import com.easyagents.rag.ingestion.model.StrategyConfig; +import com.easyagents.rag.ingestion.recommend.SplitStrategyRecommender; +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +public class RagIngestionPipelineTest { + + private final DocumentStructureAnalyzer analyzer = new DocumentStructureAnalyzer(); + private final SplitStrategyRecommender recommender = new SplitStrategyRecommender(); + private final RagSplitStrategyRegistry registry = new RagSplitStrategyRegistry(); + + @Test + public void shouldRecommendMarkdownStrategy() { + String markdown = "# Quick Start\n" + + "Welcome\n\n" + + "## Install\n" + + "Run npm install\n\n" + + "## Usage\n" + + "Run pnpm dev"; + + AnalysisResult analysis = recommender.recommend(analyzer.analyze(markdown, "md")); + + Assert.assertEquals(RagStrategyCodes.MARKDOWN_SECTION, analysis.getRecommendedStrategyCode()); + Assert.assertTrue(analysis.getConfidence().doubleValue() > 0.4D); + } + + @Test + public void 
shouldRecommendQaStrategyForEnglishAndChinese() { + String qa = "Q: How to reset password?\n" + + "A: Open admin page and click reset.\n\n" + + "问:默认密码是什么?\n" + + "答:由系统配置统一决定。"; + + AnalysisResult analysis = recommender.recommend(analyzer.analyze(qa, "txt")); + + Assert.assertEquals(RagStrategyCodes.QA_PAIR, analysis.getRecommendedStrategyCode()); + } + + @Test + public void shouldSplitOutlineDocumentByHeadingPath() { + String outline = "第1章 总则\n适用范围说明。\n\n1.1 目标\n定义系统目标。\n\n1.2 范围\n定义系统范围。"; + AnalysisResult analysis = recommender.recommend(analyzer.analyze(outline, "docx")); + StrategyConfig config = StrategyConfig.defaults(); + config.setStrategyCode(RagStrategyCodes.OUTLINE_SECTION); + + List chunks = registry.split(analysis, config); + + Assert.assertEquals(3, chunks.size()); + Assert.assertEquals("第1章 总则", chunks.get(0).getSourceLabel()); + Assert.assertEquals(2, chunks.get(1).getHeadingPath().size()); + } + + @Test + public void shouldSplitQaDocumentByPair() { + String qa = "Q: How to reset password?\n" + + "A: Open admin page and click reset.\n\n" + + "问:默认密码是什么?\n" + + "答:由系统配置统一决定。"; + AnalysisResult analysis = recommender.recommend(analyzer.analyze(qa, "txt")); + StrategyConfig config = StrategyConfig.defaults(); + config.setStrategyCode(RagStrategyCodes.QA_PAIR); + + List chunks = registry.split(analysis, config); + + Assert.assertEquals(2, chunks.size()); + Assert.assertEquals(RagChunkTypes.QA_PAIR, chunks.get(0).getChunkType()); + Assert.assertTrue(chunks.get(0).getContent().contains("问题")); + Assert.assertTrue(chunks.get(1).getAnswer().contains("系统配置")); + } +} diff --git a/easy-agents-rag/easy-agents-rag-ocr/pom.xml b/easy-agents-rag/easy-agents-rag-ocr/pom.xml new file mode 100644 index 0000000..c4b861b --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-ocr/pom.xml @@ -0,0 +1,32 @@ + + + 4.0.0 + + + com.easyagents + easy-agents-rag + ${revision} + + + easy-agents-rag-ocr + easy-agents-rag-ocr + + + 8 + 8 + UTF-8 + + + + + com.easyagents + 
easy-agents-core + + + com.easyagents + easy-agents-rag-core + + + diff --git a/easy-agents-rag/easy-agents-rag-retrieval/pom.xml b/easy-agents-rag/easy-agents-rag-retrieval/pom.xml new file mode 100644 index 0000000..d7ebb38 --- /dev/null +++ b/easy-agents-rag/easy-agents-rag-retrieval/pom.xml @@ -0,0 +1,36 @@ + + + 4.0.0 + + + com.easyagents + easy-agents-rag + ${revision} + + + easy-agents-rag-retrieval + easy-agents-rag-retrieval + + + 8 + 8 + UTF-8 + + + + + com.easyagents + easy-agents-core + + + com.easyagents + easy-agents-rag-core + + + com.easyagents + easy-agents-rag-enhance + + + diff --git a/easy-agents-rag/pom.xml b/easy-agents-rag/pom.xml new file mode 100644 index 0000000..fbd265d --- /dev/null +++ b/easy-agents-rag/pom.xml @@ -0,0 +1,24 @@ + + + 4.0.0 + + + com.easyagents + easy-agents + ${revision} + + + easy-agents-rag + pom + easy-agents-rag + + + easy-agents-rag-core + easy-agents-rag-ingestion + easy-agents-rag-ocr + easy-agents-rag-enhance + easy-agents-rag-retrieval + + diff --git a/easy-agents-spring-boot-starter/pom.xml b/easy-agents-spring-boot-starter/pom.xml index 12f30e9..4a06e63 100644 --- a/easy-agents-spring-boot-starter/pom.xml +++ b/easy-agents-spring-boot-starter/pom.xml @@ -51,6 +51,15 @@ easy-agents-bom + + com.easyagents + easy-agents-rag-core + + + + com.easyagents + easy-agents-rag-ingestion + org.springframework.boot diff --git a/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/rag/ingestion/RagIngestionAutoConfiguration.java b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/rag/ingestion/RagIngestionAutoConfiguration.java new file mode 100644 index 0000000..80bf7ea --- /dev/null +++ b/easy-agents-spring-boot-starter/src/main/java/com/easyagents/spring/boot/rag/ingestion/RagIngestionAutoConfiguration.java @@ -0,0 +1,42 @@ +package com.easyagents.spring.boot.rag.ingestion; + +import com.easyagents.rag.ingestion.DefaultRagIngestionService; +import 
/**
 * Spring Boot auto-configuration that wires the RAG ingestion pipeline.
 *
 * <p>Active only when {@link RagIngestionService} is on the classpath. Every bean is
 * annotated with {@code @ConditionalOnMissingBean}, so an application can replace any of
 * them by declaring its own bean of the same type.
 */
@ConditionalOnClass(RagIngestionService.class)
@Configuration(proxyBeanMethods = false)
public class RagIngestionAutoConfiguration {

    /** Default {@link DocumentStructureAnalyzer}. */
    @Bean
    @ConditionalOnMissingBean
    public DocumentStructureAnalyzer documentStructureAnalyzer() {
        return new DocumentStructureAnalyzer();
    }

    /** Default {@link SplitStrategyRecommender}. */
    @Bean
    @ConditionalOnMissingBean
    public SplitStrategyRecommender splitStrategyRecommender() {
        return new SplitStrategyRecommender();
    }

    /** Default {@link RagSplitStrategyRegistry}. */
    @Bean
    @ConditionalOnMissingBean
    public RagSplitStrategyRegistry ragSplitStrategyRegistry() {
        return new RagSplitStrategyRegistry();
    }

    /**
     * Default {@link RagIngestionService}, a {@link DefaultRagIngestionService} built from
     * the analyzer, recommender and registry beans available in the context.
     */
    @Bean
    @ConditionalOnMissingBean
    public RagIngestionService ragIngestionService(DocumentStructureAnalyzer documentStructureAnalyzer,
                                                   SplitStrategyRecommender splitStrategyRecommender,
                                                   RagSplitStrategyRegistry ragSplitStrategyRegistry) {
        return new DefaultRagIngestionService(documentStructureAnalyzer, splitStrategyRecommender, ragSplitStrategyRegistry);
    }
}
com.easyagents.spring.boot.chatModel.chatglm.ChatglmAutoConfiguration,\ - com.easyagents.spring.boot.chatModel.openai.OpenAIAutoConfiguration,\ - com.easyagents.spring.boot.chatModel.qwen.QwenAutoConfiguration,\ - com.easyagents.spring.boot.chatModel.spark.SparkAutoConfiguration,\ + com.easyagents.spring.boot.llm.openai.OpenAIAutoConfiguration,\ + com.easyagents.spring.boot.llm.qwen.QwenAutoConfiguration,\ com.easyagents.spring.boot.store.aliyun.AliyunAutoConfiguration,\ com.easyagents.spring.boot.store.qcloud.QCloudStoreAutoConfiguration,\ - com.easyagents.spring.boot.chatModel.ollama.OllamaAutoConfiguration,\ - com.easyagents.spring.boot.chatModel.deepseek.DeepSeekAutoConfiguration,\ - com.easyagents.spring.boot.store.chroma.ChromaAutoConfiguration + com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration,\ + com.easyagents.spring.boot.llm.deepseek.DeepSeekAutoConfiguration,\ + com.easyagents.spring.boot.store.chroma.ChromaAutoConfiguration,\ + com.easyagents.spring.boot.store.elasticsearch.ElasticSearchAutoConfiguration,\ + com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration,\ + com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration diff --git a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports index 102f3d6..bea3e22 100644 --- a/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports +++ b/easy-agents-spring-boot-starter/src/main/resources/META-INF/spring/org.springframework.boot.autoconfigure.AutoConfiguration.imports @@ -1,8 +1,10 @@ -com.easyagents.spring.boot.chatModel.chatglm.ChatglmAutoConfiguration -com.easyagents.spring.boot.chatModel.openai.OpenAIAutoConfiguration 
-com.easyagents.spring.boot.chatModel.qwen.QwenAutoConfiguration -com.easyagents.spring.boot.chatModel.spark.SparkAutoConfiguration +com.easyagents.spring.boot.llm.openai.OpenAIAutoConfiguration +com.easyagents.spring.boot.llm.qwen.QwenAutoConfiguration com.easyagents.spring.boot.store.aliyun.AliyunAutoConfiguration com.easyagents.spring.boot.store.qcloud.QCloudStoreAutoConfiguration -com.easyagents.spring.boot.chatModel.ollama.OllamaAutoConfiguration +com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration +com.easyagents.spring.boot.llm.deepseek.DeepSeekAutoConfiguration com.easyagents.spring.boot.store.chroma.ChromaAutoConfiguration +com.easyagents.spring.boot.store.elasticsearch.ElasticSearchAutoConfiguration +com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration +com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration diff --git a/pom.xml b/pom.xml index 948387e..a957978 100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,7 @@ easy-agents-bom easy-agents-core + easy-agents-rag easy-agents-chat easy-agents-store easy-agents-spring-boot-starter @@ -118,6 +119,36 @@ ${revision} + + com.easyagents + easy-agents-rag-core + ${revision} + + + + com.easyagents + easy-agents-rag-ingestion + ${revision} + + + + com.easyagents + easy-agents-rag-ocr + ${revision} + + + + com.easyagents + easy-agents-rag-enhance + ${revision} + + + + com.easyagents + easy-agents-rag-retrieval + ${revision} + + com.easyagents easy-agents-bom