feat: 完善统一文档解析与分块原文映射

- 兼容 MinerU docx 嵌套压缩工件与数组模型输出
- 补充异步解析日志与 sourceRanges 原文区间映射
2026-04-15 19:27:22 +08:00
parent 0c7b362173
commit 547d4f6ee0
6 changed files with 427 additions and 77 deletions
--- a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java
+++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java
@@ -17,4 +17,5 @@ public final class RagMetadataKeys {
    public static final String PART_NO = "partNo";
    public static final String PART_TOTAL = "partTotal";
    public static final String WARNINGS = "warnings";
+    public static final String SOURCE_RANGES = "sourceRanges";
 }
--- a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java
@@ -1,9 +1,5 @@
 package com.easyagents.rag.ingestion.chunk;

-import com.easyagents.core.document.Document;
-import com.easyagents.core.document.DocumentSplitter;
-import com.easyagents.core.document.splitter.RegexDocumentSplitter;
-import com.easyagents.core.document.splitter.SimpleDocumentSplitter;
 import com.easyagents.core.util.StringUtil;
 import com.easyagents.rag.core.*;
 import com.easyagents.rag.ingestion.model.AnalysisResult;
@@ -41,12 +37,12 @@ public class RagSplitStrategyRegistry {
    }

    private List<RagChunk> buildMarkdownChunks(String content, StrategyConfig strategyConfig) {
-        List<String> lines = Arrays.asList(content.split("\\n"));
+        List<LineSlice> lines = sliceLines(content);
        List<SectionChunk> sections = new ArrayList<SectionChunk>();
        Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
        SectionChunk current = null;
-        for (String rawLine : lines) {
-            String line = rawLine.trim();
+        for (LineSlice lineSlice : lines) {
+            String line = lineSlice.trimmedLine;
            Matcher matcher = MARKDOWN_HEADING.matcher(line);
            if (matcher.matches()) {
                if (current != null) {
@@ -58,27 +54,27 @@ public class RagSplitStrategyRegistry {
                }
                stack.addLast(new HeadingLevel(level, matcher.group(2).trim()));
                current = new SectionChunk(copyPath(stack), matcher.group(2).trim());
-                current.lines.add(line);
+                current.addLine(lineSlice);
            } else {
                if (current == null) {
                    current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
                }
-                current.lines.add(rawLine);
+                current.addLine(lineSlice);
            }
        }
        if (current != null) {
            sections.add(current);
        }
-        return finalizeSectionChunks(sections, strategyConfig);
+        return finalizeSectionChunks(content, sections, strategyConfig);
    }

    private List<RagChunk> buildOutlineChunks(String content, StrategyConfig strategyConfig) {
-        List<String> lines = Arrays.asList(content.split("\\n"));
+        List<LineSlice> lines = sliceLines(content);
        List<SectionChunk> sections = new ArrayList<SectionChunk>();
        Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
        SectionChunk current = null;
-        for (String rawLine : lines) {
-            String line = rawLine.trim();
+        for (LineSlice lineSlice : lines) {
+            String line = lineSlice.trimmedLine;
            OutlineHeading heading = OutlineHeading.parse(line);
            if (heading != null) {
                if (current != null) {
@@ -89,36 +85,62 @@ public class RagSplitStrategyRegistry {
                }
                stack.addLast(new HeadingLevel(heading.level, heading.title));
                current = new SectionChunk(copyPath(stack), heading.title);
-                current.lines.add(line);
+                current.addLine(lineSlice);
            } else {
                if (current == null) {
                    current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
                }
-                current.lines.add(rawLine);
+                current.addLine(lineSlice);
            }
        }
        if (current != null) {
            sections.add(current);
        }
-        return finalizeSectionChunks(sections, strategyConfig);
+        return finalizeSectionChunks(content, sections, strategyConfig);
    }

-    private List<RagChunk> finalizeSectionChunks(List<SectionChunk> sections, StrategyConfig strategyConfig) {
+    private List<RagChunk> finalizeSectionChunks(String content, List<SectionChunk> sections, StrategyConfig strategyConfig) {
        List<RagChunk> result = new ArrayList<RagChunk>();
        int index = 1;
        for (SectionChunk section : sections) {
-            String content = joinAndTrim(section.lines);
-            if (!StringUtil.hasText(content) || content.equals(section.sourceLabel)) {
+            TextRange baseRange = trimRange(content, section.start, section.end);
+            if (baseRange == null) {
                continue;
            }
-            if (content.length() <= safeChunkSize(strategyConfig)) {
-                result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, content, index++, 1, 1));
+            String sectionContent = content.substring(baseRange.start, baseRange.end);
+            if (!StringUtil.hasText(sectionContent) || sectionContent.equals(section.sourceLabel)) {
                continue;
            }
-            List<String> subContents = splitLongContent(content, strategyConfig.getChunkSize());
+            if (sectionContent.length() <= safeChunkSize(strategyConfig)) {
+                result.add(createChunk(
+                    RagChunkTypes.SECTION,
+                    section.sourceLabel,
+                    section.headingPath,
+                    sectionContent,
+                    index++,
+                    1,
+                    1,
+                    Collections.singletonList(baseRange)
+                ));
+                continue;
+            }
+            List<String> subContents = splitLongContent(sectionContent, strategyConfig.getChunkSize());
            int total = subContents.size();
+            int relativeCursor = 0;
            for (int i = 0; i < subContents.size(); i++) {
-                result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, subContents.get(i), index++, i + 1, total));
+                String subContent = subContents.get(i);
+                TextRange relativeRange = findOrderedRange(sectionContent, subContent, relativeCursor, "章节分块");
+                relativeCursor = relativeRange.end;
+                result.add(createChunk(
+                    RagChunkTypes.SECTION,
+                    section.sourceLabel,
+                    section.headingPath,
+                    subContent,
+                    index++,
+                    i + 1,
+                    total,
+                    Collections.singletonList(relativeRange.offset(baseRange.start))
+                ));
            }
        }
        return postProcess(result);
@@ -127,62 +149,79 @@ public class RagSplitStrategyRegistry {
    private List<RagChunk> buildQaChunks(String content, StrategyConfig strategyConfig) {
        List<RagChunk> result = new ArrayList<RagChunk>();
        String currentQuestion = null;
-        StringBuilder answerBuilder = new StringBuilder();
-        StringBuilder questionBuilder = new StringBuilder();
+        List<LineSlice> answerSlices = new ArrayList<LineSlice>();
+        List<LineSlice> questionSlices = new ArrayList<LineSlice>();
+        boolean answerStarted = false;
        int qaIndex = 1;

-        for (String rawLine : content.split("\\n")) {
-            String line = rawLine.trim();
+        for (LineSlice lineSlice : sliceLines(content)) {
+            String line = lineSlice.trimmedLine;
            if (!StringUtil.hasText(line)) {
                continue;
            }
            Matcher questionMatcher = QUESTION_PREFIX.matcher(line);
            Matcher answerMatcher = ANSWER_PREFIX.matcher(line);
            if (questionMatcher.matches()) {
-                qaIndex = flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
+                qaIndex = flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig);
                currentQuestion = questionMatcher.group(2).trim();
-                questionBuilder = new StringBuilder(currentQuestion);
-                answerBuilder = new StringBuilder();
+                questionSlices = new ArrayList<LineSlice>();
+                answerSlices = new ArrayList<LineSlice>();
+                questionSlices.add(lineSlice);
+                answerStarted = false;
                continue;
            }
            if (answerMatcher.matches()) {
-                if (answerBuilder.length() > 0) {
-                    answerBuilder.append('\n');
+                if (!StringUtil.hasText(currentQuestion)) {
+                    continue;
                }
-                answerBuilder.append(answerMatcher.group(2).trim());
+                answerSlices.add(lineSlice);
+                answerStarted = true;
                continue;
            }
-            if (answerBuilder.length() > 0) {
-                answerBuilder.append('\n').append(rawLine.trim());
-            } else if (questionBuilder.length() > 0) {
-                questionBuilder.append('\n').append(rawLine.trim());
+            if (!StringUtil.hasText(currentQuestion)) {
+                continue;
+            }
+            if (answerStarted) {
+                answerSlices.add(lineSlice);
+            } else if (!questionSlices.isEmpty()) {
+                questionSlices.add(lineSlice);
            }
        }
-        flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
+        flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig);
        return postProcess(result);
    }

    private int flushQaChunk(List<RagChunk> result,
                             String currentQuestion,
-                             StringBuilder questionBuilder,
-                             StringBuilder answerBuilder,
+                             List<LineSlice> questionSlices,
+                             List<LineSlice> answerSlices,
                             int qaIndex,
                             StrategyConfig strategyConfig) {
        if (!StringUtil.hasText(currentQuestion)) {
            return qaIndex;
        }
-        if (!StringUtil.hasText(answerBuilder.toString())) {
+        if (answerSlices == null || answerSlices.isEmpty()) {
            return qaIndex;
        }
-        String question = questionBuilder.toString().trim();
-        String answer = answerBuilder.toString().trim();
+        String question = joinLineSlices(questionSlices);
+        String answer = joinLineSlices(answerSlices);
        String baseContent = "问题：" + question + "\n答案：" + answer;
        List<String> subContents = baseContent.length() > safeChunkSize(strategyConfig)
            ? splitLongContent(baseContent, strategyConfig.getChunkSize())
            : Collections.singletonList(baseContent);
        int total = subContents.size();
+        List<TextRange> sourceRanges = buildQaSourceRanges(questionSlices, answerSlices);
        for (int i = 0; i < subContents.size(); i++) {
-            RagChunk chunk = createChunk(RagChunkTypes.QA_PAIR, "Q" + qaIndex + " " + question, Collections.<String>emptyList(), subContents.get(i), result.size() + 1, i + 1, total);
+            RagChunk chunk = createChunk(
+                RagChunkTypes.QA_PAIR,
+                "Q" + qaIndex + " " + question,
+                Collections.<String>emptyList(),
+                subContents.get(i),
+                result.size() + 1,
+                i + 1,
+                total,
+                sourceRanges
+            );
            chunk.setQuestion(question);
            chunk.setAnswer(answer);
            chunk.getOptions().put(RagMetadataKeys.QA_GROUP_ID, "qa-" + qaIndex);
@@ -193,11 +232,26 @@ public class RagSplitStrategyRegistry {

    private List<RagChunk> buildParagraphChunks(String content, StrategyConfig strategyConfig) {
        List<RagChunk> result = new ArrayList<RagChunk>();
-        DocumentSplitter splitter = new SimpleDocumentSplitter(safeChunkSize(strategyConfig), safeOverlap(strategyConfig));
-        List<Document> docs = splitter.split(new Document(content));
        int index = 1;
-        for (Document doc : docs) {
-            result.add(createChunk(RagChunkTypes.PARAGRAPH, "分块 " + index, Collections.<String>emptyList(), doc.getContent(), index, 1, 1));
+        int currentIndex = 0;
+        int maxIndex = content.length();
+        while (currentIndex < maxIndex) {
+            int endIndex = Math.min(currentIndex + safeChunkSize(strategyConfig), maxIndex);
+            TextRange range = trimRange(content, currentIndex, endIndex);
+            currentIndex = currentIndex + safeChunkSize(strategyConfig) - safeOverlap(strategyConfig);
+            if (range == null) {
+                continue;
+            }
+            result.add(createChunk(
+                RagChunkTypes.PARAGRAPH,
+                "分块 " + index,
+                Collections.<String>emptyList(),
+                content.substring(range.start, range.end),
+                index,
+                1,
+                1,
+                Collections.singletonList(range)
+            ));
            index++;
        }
        return postProcess(result);
@@ -205,14 +259,16 @@ public class RagSplitStrategyRegistry {

    private List<RagChunk> buildRegexChunks(String content, StrategyConfig strategyConfig) {
        String regex = StringUtil.hasText(strategyConfig.getRegex()) ? strategyConfig.getRegex() : "\\n\\s*\\n";
-        DocumentSplitter splitter = new RegexDocumentSplitter(regex);
-        List<Document> docs = splitter.split(new Document(content));
        List<RagChunk> result = new ArrayList<RagChunk>();
        int index = 1;
-        for (Document doc : docs) {
-            result.add(createChunk(RagChunkTypes.PARAGRAPH, "正则分块 " + index, Collections.<String>emptyList(), doc.getContent(), index, 1, 1));
-            index++;
+        Pattern pattern = Pattern.compile(regex);
+        Matcher matcher = pattern.matcher(content);
+        int segmentStart = 0;
+        while (matcher.find()) {
+            index = addRegexChunk(result, content, segmentStart, matcher.start(), index);
+            segmentStart = matcher.end();
        }
+        addRegexChunk(result, content, segmentStart, content.length(), index);
        return postProcess(result);
    }

@@ -275,7 +331,8 @@ public class RagSplitStrategyRegistry {
                                 String content,
                                 int index,
                                 int partNo,
-                                 int partTotal) {
+                                 int partTotal,
+                                 List<TextRange> sourceRanges) {
        RagChunk chunk = new RagChunk();
        chunk.setChunkId("chunk-" + index);
        chunk.setChunkType(chunkType);
@@ -290,9 +347,112 @@ public class RagSplitStrategyRegistry {
        if (RagChunkTypes.SECTION.equals(chunkType)) {
            chunk.getOptions().put(RagMetadataKeys.SOURCE_LABEL, sourceLabel);
        }
+        if (sourceRanges != null && !sourceRanges.isEmpty()) {
+            chunk.getOptions().put(RagMetadataKeys.SOURCE_RANGES, toSourceRangeMaps(sourceRanges));
+        }
        return chunk;
    }

+    private int addRegexChunk(List<RagChunk> result, String content, int rawStart, int rawEnd, int index) {
+        TextRange range = trimRange(content, rawStart, rawEnd);
+        if (range == null) {
+            return index;
+        }
+        result.add(createChunk(
+            RagChunkTypes.PARAGRAPH,
+            "正则分块 " + index,
+            Collections.<String>emptyList(),
+            content.substring(range.start, range.end),
+            index,
+            1,
+            1,
+            Collections.singletonList(range)
+        ));
+        return index + 1;
+    }
+
+    private List<LineSlice> sliceLines(String content) {
+        List<LineSlice> result = new ArrayList<LineSlice>();
+        if (content == null || content.isEmpty()) {
+            return result;
+        }
+        int start = 0;
+        for (int i = 0; i <= content.length(); i++) {
+            if (i < content.length() && content.charAt(i) != '\n') {
+                continue;
+            }
+            String rawLine = content.substring(start, i);
+            result.add(new LineSlice(start, i, rawLine));
+            start = i + 1;
+        }
+        return result;
+    }
+
+    private String joinLineSlices(List<LineSlice> slices) {
+        List<String> values = new ArrayList<String>();
+        for (LineSlice slice : slices) {
+            values.add(slice.trimmedLine);
+        }
+        return joinAndTrim(values);
+    }
+
+    private List<TextRange> buildQaSourceRanges(List<LineSlice> questionSlices, List<LineSlice> answerSlices) {
+        List<TextRange> result = new ArrayList<TextRange>();
+        TextRange questionRange = mergeLineSlices(questionSlices);
+        if (questionRange != null) {
+            result.add(questionRange);
+        }
+        TextRange answerRange = mergeLineSlices(answerSlices);
+        if (answerRange != null) {
+            result.add(answerRange);
+        }
+        return result;
+    }
+
+    private TextRange mergeLineSlices(List<LineSlice> slices) {
+        if (slices == null || slices.isEmpty()) {
+            return null;
+        }
+        return new TextRange(slices.get(0).start, slices.get(slices.size() - 1).end);
+    }
+
+    private TextRange trimRange(String content, int rawStart, int rawEnd) {
+        int start = Math.max(0, rawStart);
+        int end = Math.min(content.length(), rawEnd);
+        while (start < end && Character.isWhitespace(content.charAt(start))) {
+            start++;
+        }
+        while (end > start && Character.isWhitespace(content.charAt(end - 1))) {
+            end--;
+        }
+        if (start >= end) {
+            return null;
+        }
+        return new TextRange(start, end);
+    }
+
+    private TextRange findOrderedRange(String baseContent, String chunkContent, int searchStart, String label) {
+        int index = baseContent.indexOf(chunkContent, Math.max(0, searchStart));
+        if (index < 0 && searchStart > 0) {
+            index = baseContent.indexOf(chunkContent);
+        }
+        if (index < 0) {
+            throw new IllegalStateException(label + "无法定位原文区间");
+        }
+        return new TextRange(index, index + chunkContent.length());
+    }
+
+    private List<Map<String, Object>> toSourceRangeMaps(List<TextRange> sourceRanges) {
+        List<Map<String, Object>> result = new ArrayList<Map<String, Object>>();
+        for (TextRange sourceRange : sourceRanges) {
+            Map<String, Object> item = new LinkedHashMap<String, Object>();
+            item.put("start", Integer.valueOf(sourceRange.start));
+            item.put("end", Integer.valueOf(sourceRange.end));
+            result.add(item);
+        }
+        return result;
+    }
+
    private int safeChunkSize(StrategyConfig strategyConfig) {
        Integer chunkSize = strategyConfig.getChunkSize();
        return chunkSize == null || chunkSize.intValue() <= 0 ? RagDefaults.CHUNK_SIZE : chunkSize.intValue();
@@ -320,11 +480,21 @@ public class RagSplitStrategyRegistry {
        private final List<String> headingPath;
        private final String sourceLabel;
        private final List<String> lines = new ArrayList<String>();
+        private int start = -1;
+        private int end = -1;

        private SectionChunk(List<String> headingPath, String sourceLabel) {
            this.headingPath = headingPath;
            this.sourceLabel = sourceLabel;
        }
+
+        private void addLine(LineSlice lineSlice) {
+            if (start < 0) {
+                start = lineSlice.start;
+            }
+            end = lineSlice.end;
+            lines.add(lineSlice.rawLine);
+        }
    }

    private static class HeadingLevel {
@@ -385,4 +555,32 @@ public class RagSplitStrategyRegistry {
            return null;
        }
    }
+
+    private static class LineSlice {
+        private final int start;
+        private final int end;
+        private final String rawLine;
+        private final String trimmedLine;
+
+        private LineSlice(int start, int end, String rawLine) {
+            this.start = start;
+            this.end = end;
+            this.rawLine = rawLine == null ? "" : rawLine;
+            this.trimmedLine = this.rawLine.trim();
+        }
+    }
+
+    private static class TextRange {
+        private final int start;
+        private final int end;
+
+        private TextRange(int start, int end) {
+            this.start = start;
+            this.end = end;
+        }
+
+        private TextRange offset(int offset) {
+            return new TextRange(start + offset, end + offset);
+        }
+    }
 }
--- a/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java
@@ -12,6 +12,7 @@ import org.junit.Assert;
 import org.junit.Test;

 import java.util.List;
+import java.util.Map;

 public class RagIngestionPipelineTest {

@@ -58,6 +59,7 @@ public class RagIngestionPipelineTest {
        Assert.assertEquals(3, chunks.size());
        Assert.assertEquals("第1章 总则", chunks.get(0).getSourceLabel());
        Assert.assertEquals(2, chunks.get(1).getHeadingPath().size());
+        assertHasValidSourceRanges(analysis, chunks.get(0));
    }

    @Test
@@ -76,5 +78,38 @@ public class RagIngestionPipelineTest {
        Assert.assertEquals(RagChunkTypes.QA_PAIR, chunks.get(0).getChunkType());
        Assert.assertTrue(chunks.get(0).getContent().contains("问题"));
        Assert.assertTrue(chunks.get(1).getAnswer().contains("系统配置"));
+        assertHasValidSourceRanges(analysis, chunks.get(0));
+        Assert.assertEquals(2, ((List<?>) chunks.get(0).getOptions().get("sourceRanges")).size());
+    }
+
+    @Test
+    public void shouldSplitParagraphDocumentWithSourceRanges() {
+        String content = "第一段内容用于测试原文映射。\n第二段内容继续补充，便于生成多个分块。\n第三段内容再长一点，确保范围映射稳定。";
+        AnalysisResult analysis = recommender.recommend(analyzer.analyze(content, "txt"));
+        StrategyConfig config = StrategyConfig.defaults();
+        config.setStrategyCode(RagStrategyCodes.PARAGRAPH_LENGTH);
+        config.setChunkSize(18);
+        config.setOverlapSize(4);
+
+        List<RagChunk> chunks = registry.split(analysis, config);
+
+        Assert.assertTrue(chunks.size() > 1);
+        assertHasValidSourceRanges(analysis, chunks.get(0));
+    }
+
+    @SuppressWarnings("unchecked")
+    private void assertHasValidSourceRanges(AnalysisResult analysis, RagChunk chunk) {
+        Object rawRanges = chunk.getOptions().get("sourceRanges");
+        Assert.assertTrue(rawRanges instanceof List);
+        List<Map<String, Object>> ranges = (List<Map<String, Object>>) rawRanges;
+        Assert.assertFalse(ranges.isEmpty());
+        int normalizedLength = analysis.getNormalizedContent().length();
+        for (Map<String, Object> range : ranges) {
+            int start = ((Number) range.get("start")).intValue();
+            int end = ((Number) range.get("end")).intValue();
+            Assert.assertTrue(start >= 0);
+            Assert.assertTrue(end > start);
+            Assert.assertTrue(end <= normalizedLength);
+        }
    }
 }