feat: 完善统一文档解析与分块原文映射

- 兼容 MinerU docx 嵌套压缩工件与数组模型输出

- 补充异步解析日志与 sourceRanges 原文区间映射
This commit is contained in:
2026-04-15 19:27:22 +08:00
parent 0c7b362173
commit 547d4f6ee0
6 changed files with 427 additions and 77 deletions

View File

@@ -17,4 +17,5 @@ public final class RagMetadataKeys {
public static final String PART_NO = "partNo";
public static final String PART_TOTAL = "partTotal";
public static final String WARNINGS = "warnings";
public static final String SOURCE_RANGES = "sourceRanges";
}

View File

@@ -1,9 +1,5 @@
package com.easyagents.rag.ingestion.chunk;
import com.easyagents.core.document.Document;
import com.easyagents.core.document.DocumentSplitter;
import com.easyagents.core.document.splitter.RegexDocumentSplitter;
import com.easyagents.core.document.splitter.SimpleDocumentSplitter;
import com.easyagents.core.util.StringUtil;
import com.easyagents.rag.core.*;
import com.easyagents.rag.ingestion.model.AnalysisResult;
@@ -41,12 +37,12 @@ public class RagSplitStrategyRegistry {
}
private List<RagChunk> buildMarkdownChunks(String content, StrategyConfig strategyConfig) {
List<String> lines = Arrays.asList(content.split("\\n"));
List<LineSlice> lines = sliceLines(content);
List<SectionChunk> sections = new ArrayList<SectionChunk>();
Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
SectionChunk current = null;
for (String rawLine : lines) {
String line = rawLine.trim();
for (LineSlice lineSlice : lines) {
String line = lineSlice.trimmedLine;
Matcher matcher = MARKDOWN_HEADING.matcher(line);
if (matcher.matches()) {
if (current != null) {
@@ -58,27 +54,27 @@ public class RagSplitStrategyRegistry {
}
stack.addLast(new HeadingLevel(level, matcher.group(2).trim()));
current = new SectionChunk(copyPath(stack), matcher.group(2).trim());
current.lines.add(line);
current.addLine(lineSlice);
} else {
if (current == null) {
current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
}
current.lines.add(rawLine);
current.addLine(lineSlice);
}
}
if (current != null) {
sections.add(current);
}
return finalizeSectionChunks(sections, strategyConfig);
return finalizeSectionChunks(content, sections, strategyConfig);
}
private List<RagChunk> buildOutlineChunks(String content, StrategyConfig strategyConfig) {
List<String> lines = Arrays.asList(content.split("\\n"));
List<LineSlice> lines = sliceLines(content);
List<SectionChunk> sections = new ArrayList<SectionChunk>();
Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
SectionChunk current = null;
for (String rawLine : lines) {
String line = rawLine.trim();
for (LineSlice lineSlice : lines) {
String line = lineSlice.trimmedLine;
OutlineHeading heading = OutlineHeading.parse(line);
if (heading != null) {
if (current != null) {
@@ -89,36 +85,62 @@ public class RagSplitStrategyRegistry {
}
stack.addLast(new HeadingLevel(heading.level, heading.title));
current = new SectionChunk(copyPath(stack), heading.title);
current.lines.add(line);
current.addLine(lineSlice);
} else {
if (current == null) {
current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
}
current.lines.add(rawLine);
current.addLine(lineSlice);
}
}
if (current != null) {
sections.add(current);
}
return finalizeSectionChunks(sections, strategyConfig);
return finalizeSectionChunks(content, sections, strategyConfig);
}
private List<RagChunk> finalizeSectionChunks(List<SectionChunk> sections, StrategyConfig strategyConfig) {
private List<RagChunk> finalizeSectionChunks(String content, List<SectionChunk> sections, StrategyConfig strategyConfig) {
List<RagChunk> result = new ArrayList<RagChunk>();
int index = 1;
for (SectionChunk section : sections) {
String content = joinAndTrim(section.lines);
if (!StringUtil.hasText(content) || content.equals(section.sourceLabel)) {
TextRange baseRange = trimRange(content, section.start, section.end);
if (baseRange == null) {
continue;
}
if (content.length() <= safeChunkSize(strategyConfig)) {
result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, content, index++, 1, 1));
String sectionContent = content.substring(baseRange.start, baseRange.end);
if (!StringUtil.hasText(sectionContent) || sectionContent.equals(section.sourceLabel)) {
continue;
}
List<String> subContents = splitLongContent(content, strategyConfig.getChunkSize());
if (sectionContent.length() <= safeChunkSize(strategyConfig)) {
result.add(createChunk(
RagChunkTypes.SECTION,
section.sourceLabel,
section.headingPath,
sectionContent,
index++,
1,
1,
Collections.singletonList(baseRange)
));
continue;
}
List<String> subContents = splitLongContent(sectionContent, strategyConfig.getChunkSize());
int total = subContents.size();
int relativeCursor = 0;
for (int i = 0; i < subContents.size(); i++) {
result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, subContents.get(i), index++, i + 1, total));
String subContent = subContents.get(i);
TextRange relativeRange = findOrderedRange(sectionContent, subContent, relativeCursor, "章节分块");
relativeCursor = relativeRange.end;
result.add(createChunk(
RagChunkTypes.SECTION,
section.sourceLabel,
section.headingPath,
subContent,
index++,
i + 1,
total,
Collections.singletonList(relativeRange.offset(baseRange.start))
));
}
}
return postProcess(result);
@@ -127,62 +149,79 @@ public class RagSplitStrategyRegistry {
private List<RagChunk> buildQaChunks(String content, StrategyConfig strategyConfig) {
List<RagChunk> result = new ArrayList<RagChunk>();
String currentQuestion = null;
StringBuilder answerBuilder = new StringBuilder();
StringBuilder questionBuilder = new StringBuilder();
List<LineSlice> answerSlices = new ArrayList<LineSlice>();
List<LineSlice> questionSlices = new ArrayList<LineSlice>();
boolean answerStarted = false;
int qaIndex = 1;
for (String rawLine : content.split("\\n")) {
String line = rawLine.trim();
for (LineSlice lineSlice : sliceLines(content)) {
String line = lineSlice.trimmedLine;
if (!StringUtil.hasText(line)) {
continue;
}
Matcher questionMatcher = QUESTION_PREFIX.matcher(line);
Matcher answerMatcher = ANSWER_PREFIX.matcher(line);
if (questionMatcher.matches()) {
qaIndex = flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
qaIndex = flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig);
currentQuestion = questionMatcher.group(2).trim();
questionBuilder = new StringBuilder(currentQuestion);
answerBuilder = new StringBuilder();
questionSlices = new ArrayList<LineSlice>();
answerSlices = new ArrayList<LineSlice>();
questionSlices.add(lineSlice);
answerStarted = false;
continue;
}
if (answerMatcher.matches()) {
if (answerBuilder.length() > 0) {
answerBuilder.append('\n');
if (!StringUtil.hasText(currentQuestion)) {
continue;
}
answerBuilder.append(answerMatcher.group(2).trim());
answerSlices.add(lineSlice);
answerStarted = true;
continue;
}
if (answerBuilder.length() > 0) {
answerBuilder.append('\n').append(rawLine.trim());
} else if (questionBuilder.length() > 0) {
questionBuilder.append('\n').append(rawLine.trim());
if (!StringUtil.hasText(currentQuestion)) {
continue;
}
if (answerStarted) {
answerSlices.add(lineSlice);
} else if (!questionSlices.isEmpty()) {
questionSlices.add(lineSlice);
}
}
flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig);
return postProcess(result);
}
private int flushQaChunk(List<RagChunk> result,
String currentQuestion,
StringBuilder questionBuilder,
StringBuilder answerBuilder,
List<LineSlice> questionSlices,
List<LineSlice> answerSlices,
int qaIndex,
StrategyConfig strategyConfig) {
if (!StringUtil.hasText(currentQuestion)) {
return qaIndex;
}
if (!StringUtil.hasText(answerBuilder.toString())) {
if (answerSlices == null || answerSlices.isEmpty()) {
return qaIndex;
}
String question = questionBuilder.toString().trim();
String answer = answerBuilder.toString().trim();
String question = joinLineSlices(questionSlices);
String answer = joinLineSlices(answerSlices);
String baseContent = "问题:" + question + "\n答案" + answer;
List<String> subContents = baseContent.length() > safeChunkSize(strategyConfig)
? splitLongContent(baseContent, strategyConfig.getChunkSize())
: Collections.singletonList(baseContent);
int total = subContents.size();
List<TextRange> sourceRanges = buildQaSourceRanges(questionSlices, answerSlices);
for (int i = 0; i < subContents.size(); i++) {
RagChunk chunk = createChunk(RagChunkTypes.QA_PAIR, "Q" + qaIndex + " " + question, Collections.<String>emptyList(), subContents.get(i), result.size() + 1, i + 1, total);
RagChunk chunk = createChunk(
RagChunkTypes.QA_PAIR,
"Q" + qaIndex + " " + question,
Collections.<String>emptyList(),
subContents.get(i),
result.size() + 1,
i + 1,
total,
sourceRanges
);
chunk.setQuestion(question);
chunk.setAnswer(answer);
chunk.getOptions().put(RagMetadataKeys.QA_GROUP_ID, "qa-" + qaIndex);
@@ -193,11 +232,26 @@ public class RagSplitStrategyRegistry {
private List<RagChunk> buildParagraphChunks(String content, StrategyConfig strategyConfig) {
List<RagChunk> result = new ArrayList<RagChunk>();
DocumentSplitter splitter = new SimpleDocumentSplitter(safeChunkSize(strategyConfig), safeOverlap(strategyConfig));
List<Document> docs = splitter.split(new Document(content));
int index = 1;
for (Document doc : docs) {
result.add(createChunk(RagChunkTypes.PARAGRAPH, "分块 " + index, Collections.<String>emptyList(), doc.getContent(), index, 1, 1));
int currentIndex = 0;
int maxIndex = content.length();
while (currentIndex < maxIndex) {
int endIndex = Math.min(currentIndex + safeChunkSize(strategyConfig), maxIndex);
TextRange range = trimRange(content, currentIndex, endIndex);
currentIndex = currentIndex + safeChunkSize(strategyConfig) - safeOverlap(strategyConfig);
if (range == null) {
continue;
}
result.add(createChunk(
RagChunkTypes.PARAGRAPH,
"分块 " + index,
Collections.<String>emptyList(),
content.substring(range.start, range.end),
index,
1,
1,
Collections.singletonList(range)
));
index++;
}
return postProcess(result);
@@ -205,14 +259,16 @@ public class RagSplitStrategyRegistry {
private List<RagChunk> buildRegexChunks(String content, StrategyConfig strategyConfig) {
String regex = StringUtil.hasText(strategyConfig.getRegex()) ? strategyConfig.getRegex() : "\\n\\s*\\n";
DocumentSplitter splitter = new RegexDocumentSplitter(regex);
List<Document> docs = splitter.split(new Document(content));
List<RagChunk> result = new ArrayList<RagChunk>();
int index = 1;
for (Document doc : docs) {
result.add(createChunk(RagChunkTypes.PARAGRAPH, "正则分块 " + index, Collections.<String>emptyList(), doc.getContent(), index, 1, 1));
index++;
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(content);
int segmentStart = 0;
while (matcher.find()) {
index = addRegexChunk(result, content, segmentStart, matcher.start(), index);
segmentStart = matcher.end();
}
addRegexChunk(result, content, segmentStart, content.length(), index);
return postProcess(result);
}
@@ -275,7 +331,8 @@ public class RagSplitStrategyRegistry {
String content,
int index,
int partNo,
int partTotal) {
int partTotal,
List<TextRange> sourceRanges) {
RagChunk chunk = new RagChunk();
chunk.setChunkId("chunk-" + index);
chunk.setChunkType(chunkType);
@@ -290,9 +347,112 @@ public class RagSplitStrategyRegistry {
if (RagChunkTypes.SECTION.equals(chunkType)) {
chunk.getOptions().put(RagMetadataKeys.SOURCE_LABEL, sourceLabel);
}
if (sourceRanges != null && !sourceRanges.isEmpty()) {
chunk.getOptions().put(RagMetadataKeys.SOURCE_RANGES, toSourceRangeMaps(sourceRanges));
}
return chunk;
}
/**
 * Appends the text between two regex delimiters to {@code result} as a paragraph chunk.
 *
 * <p>The raw span is first narrowed with {@code trimRange}; when nothing but whitespace
 * remains, no chunk is produced and the running index is left untouched.
 *
 * @param result   list receiving the new chunk
 * @param content  full text the offsets refer to
 * @param rawStart start offset of the segment (inclusive), before trimming
 * @param rawEnd   end offset of the segment (exclusive), before trimming
 * @param index    sequence number to use for this chunk
 * @return {@code index + 1} when a chunk was added, otherwise {@code index}
 */
private int addRegexChunk(List<RagChunk> result, String content, int rawStart, int rawEnd, int index) {
    TextRange trimmed = trimRange(content, rawStart, rawEnd);
    if (trimmed != null) {
        String label = "正则分块 " + index;
        String segmentText = content.substring(trimmed.start, trimmed.end);
        RagChunk chunk = createChunk(
                RagChunkTypes.PARAGRAPH,
                label,
                Collections.<String>emptyList(),
                segmentText,
                index,
                1,
                1,
                Collections.singletonList(trimmed));
        result.add(chunk);
        return index + 1;
    }
    return index;
}
/**
 * Splits {@code content} on {@code '\n'} and records, for every line, its offsets in the
 * original text.
 *
 * <p>Each slice covers {@code [start, end)} where {@code end} is the position of the
 * terminating newline (or the end of the text for the last line), so the newline itself
 * is never part of a slice. Text ending with a newline yields a final empty slice.
 *
 * @param content text to slice; {@code null} or empty yields an empty list
 * @return the line slices in document order
 */
private List<LineSlice> sliceLines(String content) {
    List<LineSlice> slices = new ArrayList<LineSlice>();
    if (content == null || content.isEmpty()) {
        return slices;
    }
    int lineStart = 0;
    int newlineAt = content.indexOf('\n', lineStart);
    while (newlineAt >= 0) {
        slices.add(new LineSlice(lineStart, newlineAt, content.substring(lineStart, newlineAt)));
        lineStart = newlineAt + 1;
        newlineAt = content.indexOf('\n', lineStart);
    }
    // Whatever follows the last newline (possibly nothing) is the final line.
    slices.add(new LineSlice(lineStart, content.length(), content.substring(lineStart)));
    return slices;
}
/**
 * Collects the trimmed text of every slice and hands the list to {@code joinAndTrim}.
 *
 * @param slices line slices whose trimmed text should be combined
 * @return the result of {@code joinAndTrim} over the trimmed lines
 */
private String joinLineSlices(List<LineSlice> slices) {
    List<String> trimmedLines = new ArrayList<String>(slices.size());
    for (int i = 0; i < slices.size(); i++) {
        trimmedLines.add(slices.get(i).trimmedLine);
    }
    return joinAndTrim(trimmedLines);
}
/**
 * Builds the source ranges of one question/answer pair.
 *
 * <p>The question range, when present, comes first and the answer range second; a side
 * with no slices contributes nothing.
 *
 * @param questionSlices lines that make up the question
 * @param answerSlices   lines that make up the answer
 * @return zero, one or two ranges in question-then-answer order
 */
private List<TextRange> buildQaSourceRanges(List<LineSlice> questionSlices, List<LineSlice> answerSlices) {
    TextRange questionSpan = mergeLineSlices(questionSlices);
    TextRange answerSpan = mergeLineSlices(answerSlices);
    List<TextRange> ranges = new ArrayList<TextRange>();
    if (questionSpan != null) {
        ranges.add(questionSpan);
    }
    if (answerSpan != null) {
        ranges.add(answerSpan);
    }
    return ranges;
}
/**
 * Collapses a run of line slices into a single range spanning from the start of the
 * first slice to the end of the last one.
 *
 * @param slices slices to merge, expected in document order
 * @return the covering range, or {@code null} when {@code slices} is null or empty
 */
private TextRange mergeLineSlices(List<LineSlice> slices) {
    boolean nothingToMerge = slices == null || slices.isEmpty();
    if (nothingToMerge) {
        return null;
    }
    LineSlice first = slices.get(0);
    LineSlice last = slices.get(slices.size() - 1);
    return new TextRange(first.start, last.end);
}
/**
 * Clamps {@code [rawStart, rawEnd)} to the bounds of {@code content} and shrinks it from
 * both sides until neither edge is whitespace (per {@link Character#isWhitespace(char)}).
 *
 * @param content  text the offsets refer to
 * @param rawStart requested start offset (inclusive); negative values are treated as 0
 * @param rawEnd   requested end offset (exclusive); values past the end are clamped
 * @return the trimmed range, or {@code null} when no non-whitespace character remains
 */
private TextRange trimRange(String content, int rawStart, int rawEnd) {
    int from = rawStart < 0 ? 0 : rawStart;
    int to = rawEnd > content.length() ? content.length() : rawEnd;
    // Advance past leading whitespace.
    for (; from < to; from++) {
        if (!Character.isWhitespace(content.charAt(from))) {
            break;
        }
    }
    // Retreat past trailing whitespace.
    for (; to > from; to--) {
        if (!Character.isWhitespace(content.charAt(to - 1))) {
            break;
        }
    }
    return from < to ? new TextRange(from, to) : null;
}
/**
 * Locates {@code chunkContent} inside {@code baseContent}, preferring the first match at
 * or after {@code searchStart}.
 *
 * <p>If no match exists from {@code searchStart} onward and {@code searchStart} is
 * positive, the search is retried from the beginning of {@code baseContent}.
 * NOTE(review): that retry returns the first occurrence in the whole text, which may be
 * an earlier duplicate rather than the intended position - confirm this is acceptable.
 *
 * @param baseContent  text to search in
 * @param chunkContent text to look for
 * @param searchStart  preferred lower bound for the match; negative values are treated as 0
 * @param label        prefix for the error message
 * @return the range of the match relative to {@code baseContent}
 * @throws IllegalStateException when {@code chunkContent} does not occur in {@code baseContent}
 */
private TextRange findOrderedRange(String baseContent, String chunkContent, int searchStart, String label) {
    int from = searchStart < 0 ? 0 : searchStart;
    int position = baseContent.indexOf(chunkContent, from);
    if (position < 0 && searchStart > 0) {
        position = baseContent.indexOf(chunkContent);
    }
    if (position >= 0) {
        return new TextRange(position, position + chunkContent.length());
    }
    throw new IllegalStateException(label + "无法定位原文区间");
}
/**
 * Converts ranges into plain maps with the keys {@code "start"} and {@code "end"}
 * (boxed integers, in that insertion order).
 *
 * @param sourceRanges ranges to convert
 * @return one map per range, in the same order
 */
private List<Map<String, Object>> toSourceRangeMaps(List<TextRange> sourceRanges) {
    List<Map<String, Object>> serialized = new ArrayList<Map<String, Object>>(sourceRanges.size());
    for (int i = 0; i < sourceRanges.size(); i++) {
        TextRange range = sourceRanges.get(i);
        Map<String, Object> entry = new LinkedHashMap<String, Object>();
        entry.put("start", range.start);
        entry.put("end", range.end);
        serialized.add(entry);
    }
    return serialized;
}
private int safeChunkSize(StrategyConfig strategyConfig) {
Integer chunkSize = strategyConfig.getChunkSize();
return chunkSize == null || chunkSize.intValue() <= 0 ? RagDefaults.CHUNK_SIZE : chunkSize.intValue();
@@ -320,11 +480,21 @@ public class RagSplitStrategyRegistry {
private final List<String> headingPath;
private final String sourceLabel;
private final List<String> lines = new ArrayList<String>();
private int start = -1;
private int end = -1;
/**
 * Creates an empty section holder.
 *
 * @param headingPath heading path stored for this section
 * @param sourceLabel label stored for this section
 */
private SectionChunk(List<String> headingPath, String sourceLabel) {
    this.sourceLabel = sourceLabel;
    this.headingPath = headingPath;
}
/**
 * Appends a line to this section and extends the section's source span to cover it.
 *
 * <p>{@code start} is fixed by the first line added (it is only set while still
 * negative); {@code end} always follows the most recently added line.
 *
 * @param lineSlice line to append, with its offsets in the original text
 */
private void addLine(LineSlice lineSlice) {
    lines.add(lineSlice.rawLine);
    end = lineSlice.end;
    if (start < 0) {
        start = lineSlice.start;
    }
}
}
private static class HeadingLevel {
@@ -385,4 +555,32 @@ public class RagSplitStrategyRegistry {
return null;
}
}
/**
 * One line of the source text together with its {@code [start, end)} offsets.
 * Keeps both the line as given and its {@link String#trim() trimmed} form.
 */
private static class LineSlice {
    private final int start;
    private final int end;
    private final String rawLine;
    private final String trimmedLine;

    /**
     * @param start   offset of the first character of the line
     * @param end     offset just past the last character of the line
     * @param rawLine line text; {@code null} is stored as the empty string
     */
    private LineSlice(int start, int end, String rawLine) {
        String safeLine = (rawLine != null) ? rawLine : "";
        this.start = start;
        this.end = end;
        this.rawLine = safeLine;
        this.trimmedLine = safeLine.trim();
    }
}
/** Immutable pair of {@code start}/{@code end} offsets into a text. */
private static class TextRange {
    private final int start;
    private final int end;

    /**
     * @param start start offset
     * @param end   end offset
     */
    private TextRange(int start, int end) {
        this.start = start;
        this.end = end;
    }

    /**
     * Returns a new range with both offsets shifted by {@code offset}; this instance is
     * left unchanged.
     */
    private TextRange offset(int offset) {
        int shiftedStart = this.start + offset;
        int shiftedEnd = this.end + offset;
        return new TextRange(shiftedStart, shiftedEnd);
    }
}
}

View File

@@ -12,6 +12,7 @@ import org.junit.Assert;
import org.junit.Test;
import java.util.List;
import java.util.Map;
public class RagIngestionPipelineTest {
@@ -58,6 +59,7 @@ public class RagIngestionPipelineTest {
Assert.assertEquals(3, chunks.size());
Assert.assertEquals("第1章 总则", chunks.get(0).getSourceLabel());
Assert.assertEquals(2, chunks.get(1).getHeadingPath().size());
assertHasValidSourceRanges(analysis, chunks.get(0));
}
@Test
@@ -76,5 +78,38 @@ public class RagIngestionPipelineTest {
Assert.assertEquals(RagChunkTypes.QA_PAIR, chunks.get(0).getChunkType());
Assert.assertTrue(chunks.get(0).getContent().contains("问题"));
Assert.assertTrue(chunks.get(1).getAnswer().contains("系统配置"));
assertHasValidSourceRanges(analysis, chunks.get(0));
Assert.assertEquals(2, ((List<?>) chunks.get(0).getOptions().get("sourceRanges")).size());
}
/**
 * Splits a three-line text with the paragraph-length strategy and checks that more than
 * one chunk is produced and that the first chunk carries valid source ranges.
 */
@Test
public void shouldSplitParagraphDocumentWithSourceRanges() {
String content = "第一段内容用于测试原文映射。\n第二段内容继续补充便于生成多个分块。\n第三段内容再长一点确保范围映射稳定。";
AnalysisResult analysis = recommender.recommend(analyzer.analyze(content, "txt"));
StrategyConfig config = StrategyConfig.defaults();
config.setStrategyCode(RagStrategyCodes.PARAGRAPH_LENGTH);
// Chunk size is set well below the text length; the test expects more than one chunk.
config.setChunkSize(18);
config.setOverlapSize(4);
List<RagChunk> chunks = registry.split(analysis, config);
Assert.assertTrue(chunks.size() > 1);
// Only the first chunk's ranges are validated here.
assertHasValidSourceRanges(analysis, chunks.get(0));
}
/**
 * Asserts that {@code chunk} exposes a non-empty {@code "sourceRanges"} option and that
 * every entry satisfies {@code 0 <= start < end <= normalizedContent.length()}.
 *
 * @param analysis analysis result supplying the normalized content length
 * @param chunk    chunk whose options are inspected
 */
@SuppressWarnings("unchecked")
private void assertHasValidSourceRanges(AnalysisResult analysis, RagChunk chunk) {
    Object rawRanges = chunk.getOptions().get("sourceRanges");
    Assert.assertTrue(rawRanges instanceof List);
    List<Map<String, Object>> ranges = (List<Map<String, Object>>) rawRanges;
    Assert.assertFalse(ranges.isEmpty());
    final int upperBound = analysis.getNormalizedContent().length();
    for (int i = 0; i < ranges.size(); i++) {
        Map<String, Object> entry = ranges.get(i);
        int start = ((Number) entry.get("start")).intValue();
        int end = ((Number) entry.get("end")).intValue();
        Assert.assertTrue(start >= 0);
        Assert.assertTrue(end > start);
        Assert.assertTrue(end <= upperBound);
    }
}
}