feat: 完善统一文档解析与分块原文映射

- 兼容 MinerU docx 嵌套压缩工件与数组模型输出
- 补充异步解析日志与 sourceRanges 原文区间映射
2026-04-15 19:27:22 +08:00
parent 0c7b362173
commit 547d4f6ee0
6 changed files with 427 additions and 77 deletions
--- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java
+++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java
@@ -256,14 +256,29 @@ public class MineruMapper {
        result.setPlainText(markdown);

        ParseArtifacts artifacts = new ParseArtifacts();
-        JSONObject middleJson = firstJsonObject(bundle.entriesBySuffix, "_middle.json");
-        JSONArray contentList = firstJsonArray(bundle.entriesBySuffix, "_content_list.json");
-        JSONObject modelOutput = firstJsonObject(bundle.entriesBySuffix, "_model.json");
-        artifacts.setMiddleJson(middleJson);
-        artifacts.setContentList(contentList);
+        Object middleArtifact = firstJsonValue(bundle.entriesBySuffix, "_middle.json");
+        Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
+        Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
+
+        JSONObject middleJson = asObject(middleArtifact);
+        JSONArray contentList = asArray(contentListArtifact);
+        Object modelOutput = modelOutputArtifact;
+
+        // MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里，并且直接返回数组。
+        if (contentList == null && middleArtifact instanceof JSONArray) {
+            contentList = (JSONArray) middleArtifact;
+            middleJson = null;
+            middleArtifact = null;
+        }
+        if (contentList == null && modelOutputArtifact instanceof JSONArray) {
+            contentList = (JSONArray) modelOutputArtifact;
+        }
+
+        artifacts.setMiddleJson(middleArtifact);
+        artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
        artifacts.setModelOutput(modelOutput);

-        JSONArray contentListV2 = firstJsonArray(bundle.entriesBySuffix, "_content_list_v2.json");
+        JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
        if (contentListV2 != null) {
            artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
        }
@@ -344,7 +359,11 @@ public class MineruMapper {
            block.setType(item.getString("type"));
            block.setPageIndex(item.getInteger("page_idx"));
            block.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
-            block.setLevel(item.getInteger("text_level"));
+            Integer blockLevel = item.getInteger("text_level");
+            if (blockLevel == null) {
+                blockLevel = item.getInteger("level");
+            }
+            block.setLevel(blockLevel);
            block.setText(extractBlockText(item));
            block.setHtml(item.getString("table_body"));
            block.setImagePath(item.getString("img_path"));
@@ -531,20 +550,16 @@ public class MineruMapper {
        return null;
    }

-    private JSONObject firstJsonObject(Map<String, byte[]> entries, String suffix) {
+    private Object firstJsonValue(Map<String, byte[]> entries, String suffix) {
        String text = firstText(entries, suffix);
        if (!StringUtil.hasText(text)) {
            return null;
        }
-        return JSON.parseObject(text);
-    }
-
-    private JSONArray firstJsonArray(Map<String, byte[]> entries, String suffix) {
-        String text = firstText(entries, suffix);
-        if (!StringUtil.hasText(text)) {
-            return null;
+        try {
+            return JSON.parse(text);
+        } catch (Exception exception) {
+            throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
        }
-        return JSON.parseArray(text);
    }

    private JSONObject asObject(Object value) {
@@ -554,6 +569,9 @@ public class MineruMapper {
        if (value == null) {
            return null;
        }
+        if (value instanceof JSONArray) {
+            return null;
+        }
        return JSON.parseObject(JSON.toJSONString(value));
    }

@@ -622,8 +640,9 @@ public class MineruMapper {
        String type = item.getString("type");
        if ("text".equals(type) || "header".equals(type) || "footer".equals(type)
            || "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type)
-            || "equation".equals(type)) {
-            return item.getString("text");
+            || "equation".equals(type) || "title".equals(type)) {
+            String text = item.getString("text");
+            return StringUtil.hasText(text) ? text : item.getString("content");
        }
        if ("list".equals(type)) {
            return joinList(toStringList(item.getJSONArray("list_items")));
@@ -635,9 +654,11 @@ public class MineruMapper {
            return joinList(toStringList(item.getJSONArray("image_caption")));
        }
        if ("table".equals(type)) {
-            return joinList(toStringList(item.getJSONArray("table_caption")));
+            String tableCaption = joinList(toStringList(item.getJSONArray("table_caption")));
+            return StringUtil.hasText(tableCaption) ? tableCaption : item.getString("content");
        }
-        return item.getString("text");
+        String text = item.getString("text");
+        return StringUtil.hasText(text) ? text : item.getString("content");
    }

    private String extractTextFromMiddleBlock(JSONObject blockJson) {
@@ -768,6 +789,9 @@ public class MineruMapper {
            return imageDataUrls.get(imagePath);
        }
        String baseName = baseName(imagePath);
+        if (!StringUtil.hasText(baseName)) {
+            return null;
+        }
        for (Map.Entry<String, String> entry : imageDataUrls.entrySet()) {
            if (baseName.equals(baseName(entry.getKey()))) {
                return entry.getValue();
--- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java
+++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java
@@ -7,6 +7,8 @@ import com.easyagents.document.core.model.ParseResponse;
 import com.easyagents.document.core.model.ParseTaskInfo;
 import com.easyagents.document.core.model.ParseTaskStatus;
 import com.easyagents.document.pdf.PdfDocumentProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.util.ArrayList;

@@ -19,6 +21,7 @@ import java.util.ArrayList;
 public class MineruPdfDocumentParseService implements PdfDocumentProvider {

    public static final String PROVIDER_NAME = "mineru";
+    private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class);

    private final MineruProperties properties;
    private final MineruPdfClient client;
@@ -64,7 +67,17 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
    @Override
    public ParseResponse parse(ParseRequest request) {
        ParseRequest normalizedRequest = normalizeRequest(request);
-        return mapper.toParseResponse(client.parse(normalizedRequest));
+        LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}",
+            PROVIDER_NAME,
+            normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
+            normalizedRequest.getBackend(),
+            normalizedRequest.getParseMethod());
+        ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
+        LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
+            PROVIDER_NAME,
+            normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
+            response == null || response.getResults() == null ? 0 : response.getResults().size());
+        return response;
    }

    @Override
@@ -76,21 +89,41 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
        normalizedRequest.setReturnContentList(true);
        normalizedRequest.setReturnModelOutput(true);
        normalizedRequest.setReturnImages(true);
-        return mapper.toParseTaskStatus(client.submit(normalizedRequest));
+        LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}",
+            PROVIDER_NAME,
+            normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
+            normalizedRequest.getBackend(),
+            normalizedRequest.getParseMethod());
+        ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
+        LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
+            PROVIDER_NAME,
+            taskStatus == null ? null : taskStatus.getTaskId(),
+            taskStatus == null ? null : taskStatus.getStatus());
+        return taskStatus;
    }

    @Override
    public ParseTaskStatus queryTask(String taskId) {
        validateTaskId(taskId);
-        return mapper.toParseTaskStatus(client.queryTask(taskId));
+        ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
+        LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
+            PROVIDER_NAME,
+            taskId,
+            taskStatus == null ? null : taskStatus.getStatus());
+        return taskStatus;
    }

    @Override
    public ParseResponse queryResult(String taskId) {
        validateTaskId(taskId);
+        LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);
        MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
        ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
        mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
+        LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
+            PROVIDER_NAME,
+            taskId,
+            response == null || response.getResults() == null ? 0 : response.getResults().size());
        return response;
    }

@@ -104,6 +137,11 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
            mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
            taskInfo.setResult(response);
        }
+        LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
+            PROVIDER_NAME,
+            taskId,
+            taskInfo == null ? null : taskInfo.getStatus(),
+            taskInfo != null && taskInfo.getResult() != null);
        return taskInfo;
    }

--- a/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java
+++ b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java
@@ -121,6 +121,21 @@ public class MineruMapperTest {
        Assert.assertEquals("3.0.9", response.getVersion());
    }

+    @Test
+    public void shouldMapNestedZipWhenModelArtifactIsArray() throws IOException {
+        MineruMapper mapper = new MineruMapper(defaultProperties());
+
+        ParseResponse response = mapper.fromZip(buildNestedDocxZipWithArrayModel());
+
+        Assert.assertEquals(1, response.getResults().size());
+        ParseResult result = response.getResults().get(0);
+        Assert.assertEquals("demo", result.getFileName());
+        Assert.assertEquals("# nested", result.getMarkdown());
+        Assert.assertFalse(result.getBlocks().isEmpty());
+        Assert.assertTrue(result.getArtifacts().getModelOutput() instanceof JSONArray);
+        Assert.assertTrue(result.getArtifacts().getContentList() instanceof JSONArray);
+    }
+
    private MineruProperties defaultProperties() {
        MineruProperties properties = new MineruProperties();
        properties.setBaseUrl("http://127.0.0.1:8000");
@@ -354,6 +369,45 @@ public class MineruMapperTest {
        return outputStream.toByteArray();
    }

+    private byte[] buildNestedDocxZipWithArrayModel() throws IOException {
+        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+        ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream);
+        addEntry(zipOutputStream, "demo/vlm/markdown/demo.md", "# nested");
+        addEntry(zipOutputStream, "demo/vlm/layout/demo_middle.json", middleJson().toJSONString());
+        addEntry(zipOutputStream, "demo/vlm/model/demo_model.json", nestedDocxContentList().toJSONString());
+        addBinaryEntry(zipOutputStream, "demo/vlm/images/figure.png", "image".getBytes(StandardCharsets.UTF_8));
+        zipOutputStream.close();
+        return outputStream.toByteArray();
+    }
+
+    private JSONArray nestedDocxContentList() {
+        JSONArray contentList = new JSONArray();
+
+        JSONObject title = new JSONObject();
+        title.put("type", "title");
+        title.put("content", "二、技术要求");
+        title.put("page_idx", 0);
+        title.put("bbox", bbox());
+        contentList.add(title);
+
+        JSONObject text = new JSONObject();
+        text.put("type", "text");
+        text.put("content", "响应方式");
+        text.put("page_idx", 0);
+        text.put("bbox", bbox());
+        contentList.add(text);
+
+        JSONObject table = new JSONObject();
+        table.put("type", "table");
+        table.put("content", "<table></table>");
+        table.put("table_body", "<table></table>");
+        table.put("page_idx", 0);
+        table.put("bbox", bbox());
+        contentList.add(table);
+
+        return contentList;
+    }
+
    private void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException {
        addBinaryEntry(zipOutputStream, name, content.getBytes(StandardCharsets.UTF_8));
    }
--- a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java
+++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java
@@ -17,4 +17,5 @@ public final class RagMetadataKeys {
    public static final String PART_NO = "partNo";
    public static final String PART_TOTAL = "partTotal";
    public static final String WARNINGS = "warnings";
+    public static final String SOURCE_RANGES = "sourceRanges";
 }
--- a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java
@@ -1,9 +1,5 @@
 package com.easyagents.rag.ingestion.chunk;

-import com.easyagents.core.document.Document;
-import com.easyagents.core.document.DocumentSplitter;
-import com.easyagents.core.document.splitter.RegexDocumentSplitter;
-import com.easyagents.core.document.splitter.SimpleDocumentSplitter;
 import com.easyagents.core.util.StringUtil;
 import com.easyagents.rag.core.*;
 import com.easyagents.rag.ingestion.model.AnalysisResult;
@@ -41,12 +37,12 @@ public class RagSplitStrategyRegistry {
    }

    private List<RagChunk> buildMarkdownChunks(String content, StrategyConfig strategyConfig) {
-        List<String> lines = Arrays.asList(content.split("\\n"));
+        List<LineSlice> lines = sliceLines(content);
        List<SectionChunk> sections = new ArrayList<SectionChunk>();
        Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
        SectionChunk current = null;
-        for (String rawLine : lines) {
-            String line = rawLine.trim();
+        for (LineSlice lineSlice : lines) {
+            String line = lineSlice.trimmedLine;
            Matcher matcher = MARKDOWN_HEADING.matcher(line);
            if (matcher.matches()) {
                if (current != null) {
@@ -58,27 +54,27 @@ public class RagSplitStrategyRegistry {
                }
                stack.addLast(new HeadingLevel(level, matcher.group(2).trim()));
                current = new SectionChunk(copyPath(stack), matcher.group(2).trim());
-                current.lines.add(line);
+                current.addLine(lineSlice);
            } else {
                if (current == null) {
                    current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
                }
-                current.lines.add(rawLine);
+                current.addLine(lineSlice);
            }
        }
        if (current != null) {
            sections.add(current);
        }
-        return finalizeSectionChunks(sections, strategyConfig);
+        return finalizeSectionChunks(content, sections, strategyConfig);
    }

    private List<RagChunk> buildOutlineChunks(String content, StrategyConfig strategyConfig) {
-        List<String> lines = Arrays.asList(content.split("\\n"));
+        List<LineSlice> lines = sliceLines(content);
        List<SectionChunk> sections = new ArrayList<SectionChunk>();
        Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
        SectionChunk current = null;
-        for (String rawLine : lines) {
-            String line = rawLine.trim();
+        for (LineSlice lineSlice : lines) {
+            String line = lineSlice.trimmedLine;
            OutlineHeading heading = OutlineHeading.parse(line);
            if (heading != null) {
                if (current != null) {
@@ -89,36 +85,62 @@ public class RagSplitStrategyRegistry {
                }
                stack.addLast(new HeadingLevel(heading.level, heading.title));
                current = new SectionChunk(copyPath(stack), heading.title);
-                current.lines.add(line);
+                current.addLine(lineSlice);
            } else {
                if (current == null) {
                    current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
                }
-                current.lines.add(rawLine);
+                current.addLine(lineSlice);
            }
        }
        if (current != null) {
            sections.add(current);
        }
-        return finalizeSectionChunks(sections, strategyConfig);
+        return finalizeSectionChunks(content, sections, strategyConfig);
    }

-    private List<RagChunk> finalizeSectionChunks(List<SectionChunk> sections, StrategyConfig strategyConfig) {
+    private List<RagChunk> finalizeSectionChunks(String content, List<SectionChunk> sections, StrategyConfig strategyConfig) {
        List<RagChunk> result = new ArrayList<RagChunk>();
        int index = 1;
        for (SectionChunk section : sections) {
-            String content = joinAndTrim(section.lines);
-            if (!StringUtil.hasText(content) || content.equals(section.sourceLabel)) {
+            TextRange baseRange = trimRange(content, section.start, section.end);
+            if (baseRange == null) {
                continue;
            }
-            if (content.length() <= safeChunkSize(strategyConfig)) {
-                result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, content, index++, 1, 1));
+            String sectionContent = content.substring(baseRange.start, baseRange.end);
+            if (!StringUtil.hasText(sectionContent) || sectionContent.equals(section.sourceLabel)) {
                continue;
            }
-            List<String> subContents = splitLongContent(content, strategyConfig.getChunkSize());
+            if (sectionContent.length() <= safeChunkSize(strategyConfig)) {
+                result.add(createChunk(
+                    RagChunkTypes.SECTION,
+                    section.sourceLabel,
+                    section.headingPath,
+                    sectionContent,
+                    index++,
+                    1,
+                    1,
+                    Collections.singletonList(baseRange)
+                ));
+                continue;
+            }
+            List<String> subContents = splitLongContent(sectionContent, strategyConfig.getChunkSize());
            int total = subContents.size();
+            int relativeCursor = 0;
            for (int i = 0; i < subContents.size(); i++) {
-                result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, subContents.get(i), index++, i + 1, total));
+                String subContent = subContents.get(i);
+                TextRange relativeRange = findOrderedRange(sectionContent, subContent, relativeCursor, "章节分块");
+                relativeCursor = relativeRange.end;
+                result.add(createChunk(
+                    RagChunkTypes.SECTION,
+                    section.sourceLabel,
+                    section.headingPath,
+                    subContent,
+                    index++,
+                    i + 1,
+                    total,
+                    Collections.singletonList(relativeRange.offset(baseRange.start))
+                ));
            }
        }
        return postProcess(result);
@@ -127,62 +149,79 @@ public class RagSplitStrategyRegistry {
    private List<RagChunk> buildQaChunks(String content, StrategyConfig strategyConfig) {
        List<RagChunk> result = new ArrayList<RagChunk>();
        String currentQuestion = null;
-        StringBuilder answerBuilder = new StringBuilder();
-        StringBuilder questionBuilder = new StringBuilder();
+        List<LineSlice> answerSlices = new ArrayList<LineSlice>();
+        List<LineSlice> questionSlices = new ArrayList<LineSlice>();
+        boolean answerStarted = false;
        int qaIndex = 1;

-        for (String rawLine : content.split("\\n")) {
-            String line = rawLine.trim();
+        for (LineSlice lineSlice : sliceLines(content)) {
+            String line = lineSlice.trimmedLine;
            if (!StringUtil.hasText(line)) {
                continue;
            }
            Matcher questionMatcher = QUESTION_PREFIX.matcher(line);
            Matcher answerMatcher = ANSWER_PREFIX.matcher(line);
            if (questionMatcher.matches()) {
-                qaIndex = flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
+                qaIndex = flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig);
                currentQuestion = questionMatcher.group(2).trim();
-                questionBuilder = new StringBuilder(currentQuestion);
-                answerBuilder = new StringBuilder();
+                questionSlices = new ArrayList<LineSlice>();
+                answerSlices = new ArrayList<LineSlice>();
+                questionSlices.add(lineSlice);
+                answerStarted = false;
                continue;
            }
            if (answerMatcher.matches()) {
-                if (answerBuilder.length() > 0) {
-                    answerBuilder.append('\n');
+                if (!StringUtil.hasText(currentQuestion)) {
+                    continue;
                }
-                answerBuilder.append(answerMatcher.group(2).trim());
+                answerSlices.add(lineSlice);
+                answerStarted = true;
                continue;
            }
-            if (answerBuilder.length() > 0) {
-                answerBuilder.append('\n').append(rawLine.trim());
-            } else if (questionBuilder.length() > 0) {
-                questionBuilder.append('\n').append(rawLine.trim());
+            if (!StringUtil.hasText(currentQuestion)) {
+                continue;
+            }
+            if (answerStarted) {
+                answerSlices.add(lineSlice);
+            } else if (!questionSlices.isEmpty()) {
+                questionSlices.add(lineSlice);
            }
        }
-        flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
+        flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig);
        return postProcess(result);
    }

    private int flushQaChunk(List<RagChunk> result,
                             String currentQuestion,
-                             StringBuilder questionBuilder,
-                             StringBuilder answerBuilder,
+                             List<LineSlice> questionSlices,
+                             List<LineSlice> answerSlices,
                             int qaIndex,
                             StrategyConfig strategyConfig) {
        if (!StringUtil.hasText(currentQuestion)) {
            return qaIndex;
        }
-        if (!StringUtil.hasText(answerBuilder.toString())) {
+        if (answerSlices == null || answerSlices.isEmpty()) {
            return qaIndex;
        }
-        String question = questionBuilder.toString().trim();
-        String answer = answerBuilder.toString().trim();
+        String question = joinLineSlices(questionSlices);
+        String answer = joinLineSlices(answerSlices);
        String baseContent = "问题：" + question + "\n答案：" + answer;
        List<String> subContents = baseContent.length() > safeChunkSize(strategyConfig)
            ? splitLongContent(baseContent, strategyConfig.getChunkSize())
            : Collections.singletonList(baseContent);
        int total = subContents.size();
+        List<TextRange> sourceRanges = buildQaSourceRanges(questionSlices, answerSlices);
        for (int i = 0; i < subContents.size(); i++) {
-            RagChunk chunk = createChunk(RagChunkTypes.QA_PAIR, "Q" + qaIndex + " " + question, Collections.<String>emptyList(), subContents.get(i), result.size() + 1, i + 1, total);
+            RagChunk chunk = createChunk(
+                RagChunkTypes.QA_PAIR,
+                "Q" + qaIndex + " " + question,
+                Collections.<String>emptyList(),
+                subContents.get(i),
+                result.size() + 1,
+                i + 1,
+                total,
+                sourceRanges
+            );
            chunk.setQuestion(question);
            chunk.setAnswer(answer);
            chunk.getOptions().put(RagMetadataKeys.QA_GROUP_ID, "qa-" + qaIndex);
@@ -193,11 +232,26 @@ public class RagSplitStrategyRegistry {

    private List<RagChunk> buildParagraphChunks(String content, StrategyConfig strategyConfig) {
        List<RagChunk> result = new ArrayList<RagChunk>();
-        DocumentSplitter splitter = new SimpleDocumentSplitter(safeChunkSize(strategyConfig), safeOverlap(strategyConfig));
-        List<Document> docs = splitter.split(new Document(content));
        int index = 1;
-        for (Document doc : docs) {
-            result.add(createChunk(RagChunkTypes.PARAGRAPH, "分块 " + index, Collections.<String>emptyList(), doc.getContent(), index, 1, 1));
+        int currentIndex = 0;
+        int maxIndex = content.length();
+        while (currentIndex < maxIndex) {
+            int endIndex = Math.min(currentIndex + safeChunkSize(strategyConfig), maxIndex);
+            TextRange range = trimRange(content, currentIndex, endIndex);
+            currentIndex = currentIndex + safeChunkSize(strategyConfig) - safeOverlap(strategyConfig);
+            if (range == null) {
+                continue;
+            }
+            result.add(createChunk(
+                RagChunkTypes.PARAGRAPH,
+                "分块 " + index,
+                Collections.<String>emptyList(),
+                content.substring(range.start, range.end),
+                index,
+                1,
+                1,
+                Collections.singletonList(range)
+            ));
            index++;
        }
        return postProcess(result);
@@ -205,14 +259,16 @@ public class RagSplitStrategyRegistry {

    private List<RagChunk> buildRegexChunks(String content, StrategyConfig strategyConfig) {
        String regex = StringUtil.hasText(strategyConfig.getRegex()) ? strategyConfig.getRegex() : "\\n\\s*\\n";
-        DocumentSplitter splitter = new RegexDocumentSplitter(regex);
-        List<Document> docs = splitter.split(new Document(content));
        List<RagChunk> result = new ArrayList<RagChunk>();
        int index = 1;
-        for (Document doc : docs) {
-            result.add(createChunk(RagChunkTypes.PARAGRAPH, "正则分块 " + index, Collections.<String>emptyList(), doc.getContent(), index, 1, 1));
-            index++;
+        Pattern pattern = Pattern.compile(regex);
+        Matcher matcher = pattern.matcher(content);
+        int segmentStart = 0;
+        while (matcher.find()) {
+            index = addRegexChunk(result, content, segmentStart, matcher.start(), index);
+            segmentStart = matcher.end();
        }
+        addRegexChunk(result, content, segmentStart, content.length(), index);
        return postProcess(result);
    }

@@ -275,7 +331,8 @@ public class RagSplitStrategyRegistry {
                                 String content,
                                 int index,
                                 int partNo,
-                                 int partTotal) {
+                                 int partTotal,
+                                 List<TextRange> sourceRanges) {
        RagChunk chunk = new RagChunk();
        chunk.setChunkId("chunk-" + index);
        chunk.setChunkType(chunkType);
@@ -290,9 +347,112 @@ public class RagSplitStrategyRegistry {
        if (RagChunkTypes.SECTION.equals(chunkType)) {
            chunk.getOptions().put(RagMetadataKeys.SOURCE_LABEL, sourceLabel);
        }
+        if (sourceRanges != null && !sourceRanges.isEmpty()) {
+            chunk.getOptions().put(RagMetadataKeys.SOURCE_RANGES, toSourceRangeMaps(sourceRanges));
+        }
        return chunk;
    }

+    private int addRegexChunk(List<RagChunk> result, String content, int rawStart, int rawEnd, int index) {
+        TextRange range = trimRange(content, rawStart, rawEnd);
+        if (range == null) {
+            return index;
+        }
+        result.add(createChunk(
+            RagChunkTypes.PARAGRAPH,
+            "正则分块 " + index,
+            Collections.<String>emptyList(),
+            content.substring(range.start, range.end),
+            index,
+            1,
+            1,
+            Collections.singletonList(range)
+        ));
+        return index + 1;
+    }
+
+    private List<LineSlice> sliceLines(String content) {
+        List<LineSlice> result = new ArrayList<LineSlice>();
+        if (content == null || content.isEmpty()) {
+            return result;
+        }
+        int start = 0;
+        for (int i = 0; i <= content.length(); i++) {
+            if (i < content.length() && content.charAt(i) != '\n') {
+                continue;
+            }
+            String rawLine = content.substring(start, i);
+            result.add(new LineSlice(start, i, rawLine));
+            start = i + 1;
+        }
+        return result;
+    }
+
+    private String joinLineSlices(List<LineSlice> slices) {
+        List<String> values = new ArrayList<String>();
+        for (LineSlice slice : slices) {
+            values.add(slice.trimmedLine);
+        }
+        return joinAndTrim(values);
+    }
+
+    private List<TextRange> buildQaSourceRanges(List<LineSlice> questionSlices, List<LineSlice> answerSlices) {
+        List<TextRange> result = new ArrayList<TextRange>();
+        TextRange questionRange = mergeLineSlices(questionSlices);
+        if (questionRange != null) {
+            result.add(questionRange);
+        }
+        TextRange answerRange = mergeLineSlices(answerSlices);
+        if (answerRange != null) {
+            result.add(answerRange);
+        }
+        return result;
+    }
+
+    private TextRange mergeLineSlices(List<LineSlice> slices) {
+        if (slices == null || slices.isEmpty()) {
+            return null;
+        }
+        return new TextRange(slices.get(0).start, slices.get(slices.size() - 1).end);
+    }
+
+    private TextRange trimRange(String content, int rawStart, int rawEnd) {
+        int start = Math.max(0, rawStart);
+        int end = Math.min(content.length(), rawEnd);
+        while (start < end && Character.isWhitespace(content.charAt(start))) {
+            start++;
+        }
+        while (end > start && Character.isWhitespace(content.charAt(end - 1))) {
+            end--;
+        }
+        if (start >= end) {
+            return null;
+        }
+        return new TextRange(start, end);
+    }
+
+    private TextRange findOrderedRange(String baseContent, String chunkContent, int searchStart, String label) {
+        int index = baseContent.indexOf(chunkContent, Math.max(0, searchStart));
+        if (index < 0 && searchStart > 0) {
+            index = baseContent.indexOf(chunkContent);
+        }
+        if (index < 0) {
+            throw new IllegalStateException(label + "无法定位原文区间");
+        }
+        return new TextRange(index, index + chunkContent.length());
+    }
+
+    private List<Map<String, Object>> toSourceRangeMaps(List<TextRange> sourceRanges) {
+        List<Map<String, Object>> result = new ArrayList<Map<String, Object>>();
+        for (TextRange sourceRange : sourceRanges) {
+            Map<String, Object> item = new LinkedHashMap<String, Object>();
+            item.put("start", Integer.valueOf(sourceRange.start));
+            item.put("end", Integer.valueOf(sourceRange.end));
+            result.add(item);
+        }
+        return result;
+    }
+
    private int safeChunkSize(StrategyConfig strategyConfig) {
        Integer chunkSize = strategyConfig.getChunkSize();
        return chunkSize == null || chunkSize.intValue() <= 0 ? RagDefaults.CHUNK_SIZE : chunkSize.intValue();
@@ -320,11 +480,21 @@ public class RagSplitStrategyRegistry {
        private final List<String> headingPath;
        private final String sourceLabel;
        private final List<String> lines = new ArrayList<String>();
+        private int start = -1;
+        private int end = -1;

        /**
         * Creates a section chunk with no lines yet, remembering the given heading
         * path and source label as-is (the list reference is stored, not copied).
         *
         * @param headingPath heading path stored for this section
         * @param sourceLabel source label stored for this section
         */
        private SectionChunk(List<String> headingPath, String sourceLabel) {
            this.headingPath = headingPath;
            this.sourceLabel = sourceLabel;
        }
+
+        /**
+         * Appends the raw text of {@code lineSlice} to this section. The section start
+         * is taken from the first slice added (while {@code start} is still negative);
+         * the section end is overwritten with the end of every slice added.
+         *
+         * @param lineSlice the line to append
+         */
+        private void addLine(LineSlice lineSlice) {
+            boolean firstLine = start < 0;
+            if (firstLine) {
+                start = lineSlice.start;
+            }
+            lines.add(lineSlice.rawLine);
+            end = lineSlice.end;
+        }
    }

    private static class HeadingLevel {
@@ -385,4 +555,32 @@ public class RagSplitStrategyRegistry {
            return null;
        }
    }
+
+    private static class LineSlice {
+        private final int start;
+        private final int end;
+        private final String rawLine;
+        private final String trimmedLine;
+
+        private LineSlice(int start, int end, String rawLine) {
+            this.start = start;
+            this.end = end;
+            this.rawLine = rawLine == null ? "" : rawLine;
+            this.trimmedLine = this.rawLine.trim();
+        }
+    }
+
+    private static class TextRange {
+        private final int start;
+        private final int end;
+
+        private TextRange(int start, int end) {
+            this.start = start;
+            this.end = end;
+        }
+
+        private TextRange offset(int offset) {
+            return new TextRange(start + offset, end + offset);
+        }
+    }
 }
--- a/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java
+++ b/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java
@@ -12,6 +12,7 @@ import org.junit.Assert;
 import org.junit.Test;

 import java.util.List;
+import java.util.Map;

 public class RagIngestionPipelineTest {

@@ -58,6 +59,7 @@ public class RagIngestionPipelineTest {
        Assert.assertEquals(3, chunks.size());
        Assert.assertEquals("第1章 总则", chunks.get(0).getSourceLabel());
        Assert.assertEquals(2, chunks.get(1).getHeadingPath().size());
+        assertHasValidSourceRanges(analysis, chunks.get(0));
    }

    @Test
@@ -76,5 +78,38 @@ public class RagIngestionPipelineTest {
        Assert.assertEquals(RagChunkTypes.QA_PAIR, chunks.get(0).getChunkType());
        Assert.assertTrue(chunks.get(0).getContent().contains("问题"));
        Assert.assertTrue(chunks.get(1).getAnswer().contains("系统配置"));
+        assertHasValidSourceRanges(analysis, chunks.get(0));
+        Assert.assertEquals(2, ((List<?>) chunks.get(0).getOptions().get("sourceRanges")).size());
+    }
+
+    /**
+     * Splits a three-line plain-text document with the paragraph-length strategy
+     * (chunk size 18, overlap 4) and checks that more than one chunk is produced
+     * and that the first chunk carries valid source ranges.
+     */
+    @Test
+    public void shouldSplitParagraphDocumentWithSourceRanges() {
+        StrategyConfig strategy = StrategyConfig.defaults();
+        strategy.setStrategyCode(RagStrategyCodes.PARAGRAPH_LENGTH);
+        strategy.setChunkSize(18);
+        strategy.setOverlapSize(4);
+
+        String document = "第一段内容用于测试原文映射。\n第二段内容继续补充，便于生成多个分块。\n第三段内容再长一点，确保范围映射稳定。";
+        AnalysisResult analyzed = recommender.recommend(analyzer.analyze(document, "txt"));
+
+        List<RagChunk> produced = registry.split(analyzed, strategy);
+
+        Assert.assertTrue(produced.size() > 1);
+        RagChunk firstChunk = produced.get(0);
+        assertHasValidSourceRanges(analyzed, firstChunk);
+    }
+
+    /**
+     * Asserts that {@code chunk} exposes a non-empty {@code sourceRanges} option whose
+     * entries are maps with numeric {@code start}/{@code end} values satisfying
+     * {@code 0 <= start < end <= normalizedContent.length()}.
+     *
+     * <p>Every structural expectation is checked with an explicit assertion so that
+     * malformed data fails with a descriptive message instead of a
+     * {@code NullPointerException} or {@code ClassCastException}.
+     *
+     * @param analysis analysis result whose normalized content bounds the offsets
+     * @param chunk    chunk whose options are inspected
+     */
+    private void assertHasValidSourceRanges(AnalysisResult analysis, RagChunk chunk) {
+        Assert.assertNotNull("chunk options must not be null", chunk.getOptions());
+        Object rawRanges = chunk.getOptions().get("sourceRanges");
+        Assert.assertTrue("sourceRanges must be a List but was: " + rawRanges, rawRanges instanceof List);
+        // Wildcard types keep the casts checked; no @SuppressWarnings is needed.
+        List<?> ranges = (List<?>) rawRanges;
+        Assert.assertFalse("sourceRanges must not be empty", ranges.isEmpty());
+        int normalizedLength = analysis.getNormalizedContent().length();
+        for (Object rawRange : ranges) {
+            Assert.assertTrue("each source range must be a Map but was: " + rawRange, rawRange instanceof Map);
+            Map<?, ?> range = (Map<?, ?>) rawRange;
+            Object rawStart = range.get("start");
+            Object rawEnd = range.get("end");
+            Assert.assertTrue("range start must be a Number but was: " + rawStart, rawStart instanceof Number);
+            Assert.assertTrue("range end must be a Number but was: " + rawEnd, rawEnd instanceof Number);
+            int start = ((Number) rawStart).intValue();
+            int end = ((Number) rawEnd).intValue();
+            Assert.assertTrue("range start must be >= 0 but was " + start, start >= 0);
+            Assert.assertTrue("range end (" + end + ") must be greater than start (" + start + ")", end > start);
+            Assert.assertTrue("range end (" + end + ") must not exceed normalized length " + normalizedLength,
+                    end <= normalizedLength);
+        }
    }
 }