diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java index 7f97a1c..c48d3d1 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruMapper.java @@ -256,14 +256,29 @@ public class MineruMapper { result.setPlainText(markdown); ParseArtifacts artifacts = new ParseArtifacts(); - JSONObject middleJson = firstJsonObject(bundle.entriesBySuffix, "_middle.json"); - JSONArray contentList = firstJsonArray(bundle.entriesBySuffix, "_content_list.json"); - JSONObject modelOutput = firstJsonObject(bundle.entriesBySuffix, "_model.json"); - artifacts.setMiddleJson(middleJson); - artifacts.setContentList(contentList); + Object middleArtifact = firstJsonValue(bundle.entriesBySuffix, "_middle.json"); + Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json"); + Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json"); + + JSONObject middleJson = asObject(middleArtifact); + JSONArray contentList = asArray(contentListArtifact); + Object modelOutput = modelOutputArtifact; + + // MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里,并且直接返回数组。 + if (contentList == null && middleArtifact instanceof JSONArray) { + contentList = (JSONArray) middleArtifact; + middleJson = null; + middleArtifact = null; + } + if (contentList == null && modelOutputArtifact instanceof JSONArray) { + contentList = (JSONArray) modelOutputArtifact; + } + + artifacts.setMiddleJson(middleArtifact); + artifacts.setContentList(contentList == null ? contentListArtifact : contentList); artifacts.setModelOutput(modelOutput); - JSONArray contentListV2 = firstJsonArray(bundle.entriesBySuffix, "_content_list_v2.json"); + JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json")); if (contentListV2 != null) { artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2); } @@ -344,7 +359,11 @@ public class MineruMapper { block.setType(item.getString("type")); block.setPageIndex(item.getInteger("page_idx")); block.setBoundingBox(toDoubleList(item.getJSONArray("bbox"))); - block.setLevel(item.getInteger("text_level")); + Integer blockLevel = item.getInteger("text_level"); + if (blockLevel == null) { + blockLevel = item.getInteger("level"); + } + block.setLevel(blockLevel); block.setText(extractBlockText(item)); block.setHtml(item.getString("table_body")); block.setImagePath(item.getString("img_path")); @@ -531,20 +550,16 @@ public class MineruMapper { return null; } - private JSONObject firstJsonObject(Map entries, String suffix) { + private Object firstJsonValue(Map entries, String suffix) { String text = firstText(entries, suffix); if (!StringUtil.hasText(text)) { return null; } - return JSON.parseObject(text); - } - - private JSONArray firstJsonArray(Map entries, String suffix) { - String text = firstText(entries, suffix); - if (!StringUtil.hasText(text)) { - return null; + try { + return JSON.parse(text); + } catch (Exception exception) { + throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception); } - return JSON.parseArray(text); } private JSONObject asObject(Object value) { @@ -554,6 +569,9 @@ public class MineruMapper { if (value == null) { return null; } + if (value instanceof JSONArray) { + return null; + } return JSON.parseObject(JSON.toJSONString(value)); } @@ -622,8 +640,9 @@ public class MineruMapper { String type = item.getString("type"); if ("text".equals(type) || "header".equals(type) || "footer".equals(type) || "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type) - || "equation".equals(type)) { - return item.getString("text"); + || "equation".equals(type) || "title".equals(type)) { + String text = item.getString("text"); + return StringUtil.hasText(text) ? text : item.getString("content"); } if ("list".equals(type)) { return joinList(toStringList(item.getJSONArray("list_items"))); @@ -635,9 +654,11 @@ public class MineruMapper { return joinList(toStringList(item.getJSONArray("image_caption"))); } if ("table".equals(type)) { - return joinList(toStringList(item.getJSONArray("table_caption"))); + String tableCaption = joinList(toStringList(item.getJSONArray("table_caption"))); + return StringUtil.hasText(tableCaption) ? tableCaption : item.getString("content"); } - return item.getString("text"); + String text = item.getString("text"); + return StringUtil.hasText(text) ? text : item.getString("content"); } private String extractTextFromMiddleBlock(JSONObject blockJson) { @@ -768,6 +789,9 @@ public class MineruMapper { return imageDataUrls.get(imagePath); } String baseName = baseName(imagePath); + if (!StringUtil.hasText(baseName)) { + return null; + } for (Map.Entry entry : imageDataUrls.entrySet()) { if (baseName.equals(baseName(entry.getKey()))) { return entry.getValue(); diff --git a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java index 9aa7e2e..8132893 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java +++ b/easy-agents-document/easy-agents-document-pdf/src/main/java/com/easyagents/document/pdf/mineru/MineruPdfDocumentParseService.java @@ -7,6 +7,8 @@ import com.easyagents.document.core.model.ParseResponse; import com.easyagents.document.core.model.ParseTaskInfo; import com.easyagents.document.core.model.ParseTaskStatus; import com.easyagents.document.pdf.PdfDocumentProvider; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; @@ -19,6 +21,7 @@ import java.util.ArrayList; public class MineruPdfDocumentParseService implements PdfDocumentProvider { public static final String PROVIDER_NAME = "mineru"; + private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class); private final MineruProperties properties; private final MineruPdfClient client; @@ -64,7 +67,17 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider { @Override public ParseResponse parse(ParseRequest request) { ParseRequest normalizedRequest = normalizeRequest(request); - return mapper.toParseResponse(client.parse(normalizedRequest)); + LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}", + PROVIDER_NAME, + normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(), + normalizedRequest.getBackend(), + normalizedRequest.getParseMethod()); + ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest)); + LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}", + PROVIDER_NAME, + normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(), + response == null || response.getResults() == null ? 0 : response.getResults().size()); + return response; } @Override @@ -76,21 +89,41 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider { normalizedRequest.setReturnContentList(true); normalizedRequest.setReturnModelOutput(true); normalizedRequest.setReturnImages(true); - return mapper.toParseTaskStatus(client.submit(normalizedRequest)); + LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}", + PROVIDER_NAME, + normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(), + normalizedRequest.getBackend(), + normalizedRequest.getParseMethod()); + ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest)); + LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}", + PROVIDER_NAME, + taskStatus == null ? null : taskStatus.getTaskId(), + taskStatus == null ? null : taskStatus.getStatus()); + return taskStatus; } @Override public ParseTaskStatus queryTask(String taskId) { validateTaskId(taskId); - return mapper.toParseTaskStatus(client.queryTask(taskId)); + ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId)); + LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}", + PROVIDER_NAME, + taskId, + taskStatus == null ? null : taskStatus.getStatus()); + return taskStatus; } @Override public ParseResponse queryResult(String taskId) { validateTaskId(taskId); + LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId); MineruTaskStatus taskStatus = waitForTaskCompleted(taskId); ParseResponse response = mapper.fromZip(client.queryResultZip(taskId)); mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion()); + LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}", + PROVIDER_NAME, + taskId, + response == null || response.getResults() == null ? 0 : response.getResults().size()); return response; } @@ -104,6 +137,11 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider { mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion()); taskInfo.setResult(response); } + LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}", + PROVIDER_NAME, + taskId, + taskInfo == null ? null : taskInfo.getStatus(), + taskInfo != null && taskInfo.getResult() != null); return taskInfo; } diff --git a/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java index b59c7a0..7fbc349 100644 --- a/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java +++ b/easy-agents-document/easy-agents-document-pdf/src/test/java/com/easyagents/document/pdf/mineru/MineruMapperTest.java @@ -121,6 +121,21 @@ public class MineruMapperTest { Assert.assertEquals("3.0.9", response.getVersion()); } + @Test + public void shouldMapNestedZipWhenModelArtifactIsArray() throws IOException { + MineruMapper mapper = new MineruMapper(defaultProperties()); + + ParseResponse response = mapper.fromZip(buildNestedDocxZipWithArrayModel()); + + Assert.assertEquals(1, response.getResults().size()); + ParseResult result = response.getResults().get(0); + Assert.assertEquals("demo", result.getFileName()); + Assert.assertEquals("# nested", result.getMarkdown()); + Assert.assertFalse(result.getBlocks().isEmpty()); + Assert.assertTrue(result.getArtifacts().getModelOutput() instanceof JSONArray); + Assert.assertTrue(result.getArtifacts().getContentList() instanceof JSONArray); + } + private MineruProperties defaultProperties() { MineruProperties properties = new MineruProperties(); properties.setBaseUrl("http://127.0.0.1:8000"); @@ -354,6 +369,45 @@ public class MineruMapperTest { return outputStream.toByteArray(); } + private byte[] buildNestedDocxZipWithArrayModel() throws IOException { + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream); + addEntry(zipOutputStream, "demo/vlm/markdown/demo.md", "# nested"); + addEntry(zipOutputStream, "demo/vlm/layout/demo_middle.json", middleJson().toJSONString()); + addEntry(zipOutputStream, "demo/vlm/model/demo_model.json", nestedDocxContentList().toJSONString()); + addBinaryEntry(zipOutputStream, "demo/vlm/images/figure.png", "image".getBytes(StandardCharsets.UTF_8)); + zipOutputStream.close(); + return outputStream.toByteArray(); + } + + private JSONArray nestedDocxContentList() { + JSONArray contentList = new JSONArray(); + + JSONObject title = new JSONObject(); + title.put("type", "title"); + title.put("content", "二、技术要求"); + title.put("page_idx", 0); + title.put("bbox", bbox()); + contentList.add(title); + + JSONObject text = new JSONObject(); + text.put("type", "text"); + text.put("content", "响应方式"); + text.put("page_idx", 0); + text.put("bbox", bbox()); + contentList.add(text); + + JSONObject table = new JSONObject(); + table.put("type", "table"); + table.put("content", "
"); + table.put("table_body", "
"); + table.put("page_idx", 0); + table.put("bbox", bbox()); + contentList.add(table); + + return contentList; + } + private void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException { addBinaryEntry(zipOutputStream, name, content.getBytes(StandardCharsets.UTF_8)); } diff --git a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java index db86c63..57af671 100644 --- a/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java +++ b/easy-agents-rag/easy-agents-rag-core/src/main/java/com/easyagents/rag/core/RagMetadataKeys.java @@ -17,4 +17,5 @@ public final class RagMetadataKeys { public static final String PART_NO = "partNo"; public static final String PART_TOTAL = "partTotal"; public static final String WARNINGS = "warnings"; + public static final String SOURCE_RANGES = "sourceRanges"; } diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java index 0be40ac..d915857 100644 --- a/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java +++ b/easy-agents-rag/easy-agents-rag-ingestion/src/main/java/com/easyagents/rag/ingestion/chunk/RagSplitStrategyRegistry.java @@ -1,9 +1,5 @@ package com.easyagents.rag.ingestion.chunk; -import com.easyagents.core.document.Document; -import com.easyagents.core.document.DocumentSplitter; -import com.easyagents.core.document.splitter.RegexDocumentSplitter; -import com.easyagents.core.document.splitter.SimpleDocumentSplitter; import com.easyagents.core.util.StringUtil; import com.easyagents.rag.core.*; import com.easyagents.rag.ingestion.model.AnalysisResult; @@ -41,12 +37,12 @@ public class RagSplitStrategyRegistry { } private List buildMarkdownChunks(String content, StrategyConfig strategyConfig) { - List lines = Arrays.asList(content.split("\\n")); + List lines = sliceLines(content); List sections = new ArrayList(); Deque stack = new ArrayDeque(); SectionChunk current = null; - for (String rawLine : lines) { - String line = rawLine.trim(); + for (LineSlice lineSlice : lines) { + String line = lineSlice.trimmedLine; Matcher matcher = MARKDOWN_HEADING.matcher(line); if (matcher.matches()) { if (current != null) { @@ -58,27 +54,27 @@ public class RagSplitStrategyRegistry { } stack.addLast(new HeadingLevel(level, matcher.group(2).trim())); current = new SectionChunk(copyPath(stack), matcher.group(2).trim()); - current.lines.add(line); + current.addLine(lineSlice); } else { if (current == null) { current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落"); } - current.lines.add(rawLine); + current.addLine(lineSlice); } } if (current != null) { sections.add(current); } - return finalizeSectionChunks(sections, strategyConfig); + return finalizeSectionChunks(content, sections, strategyConfig); } private List buildOutlineChunks(String content, StrategyConfig strategyConfig) { - List lines = Arrays.asList(content.split("\\n")); + List lines = sliceLines(content); List sections = new ArrayList(); Deque stack = new ArrayDeque(); SectionChunk current = null; - for (String rawLine : lines) { - String line = rawLine.trim(); + for (LineSlice lineSlice : lines) { + String line = lineSlice.trimmedLine; OutlineHeading heading = OutlineHeading.parse(line); if (heading != null) { if (current != null) { @@ -89,36 +85,62 @@ public class RagSplitStrategyRegistry { } stack.addLast(new HeadingLevel(heading.level, heading.title)); current = new SectionChunk(copyPath(stack), heading.title); - current.lines.add(line); + current.addLine(lineSlice); } else { if (current == null) { current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落"); } - current.lines.add(rawLine); + current.addLine(lineSlice); } } if (current != null) { sections.add(current); } - return finalizeSectionChunks(sections, strategyConfig); + return finalizeSectionChunks(content, sections, strategyConfig); } - private List finalizeSectionChunks(List sections, StrategyConfig strategyConfig) { + private List finalizeSectionChunks(String content, List sections, StrategyConfig strategyConfig) { List result = new ArrayList(); int index = 1; for (SectionChunk section : sections) { - String content = joinAndTrim(section.lines); - if (!StringUtil.hasText(content) || content.equals(section.sourceLabel)) { + TextRange baseRange = trimRange(content, section.start, section.end); + if (baseRange == null) { continue; } - if (content.length() <= safeChunkSize(strategyConfig)) { - result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, content, index++, 1, 1)); + String sectionContent = content.substring(baseRange.start, baseRange.end); + if (!StringUtil.hasText(sectionContent) || sectionContent.equals(section.sourceLabel)) { continue; } - List subContents = splitLongContent(content, strategyConfig.getChunkSize()); + if (sectionContent.length() <= safeChunkSize(strategyConfig)) { + result.add(createChunk( + RagChunkTypes.SECTION, + section.sourceLabel, + section.headingPath, + sectionContent, + index++, + 1, + 1, + Collections.singletonList(baseRange) + )); + continue; + } + List subContents = splitLongContent(sectionContent, strategyConfig.getChunkSize()); int total = subContents.size(); + int relativeCursor = 0; for (int i = 0; i < subContents.size(); i++) { - result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, subContents.get(i), index++, i + 1, total)); + String subContent = subContents.get(i); + TextRange relativeRange = findOrderedRange(sectionContent, subContent, relativeCursor, "章节分块"); + relativeCursor = relativeRange.end; + result.add(createChunk( + RagChunkTypes.SECTION, + section.sourceLabel, + section.headingPath, + subContent, + index++, + i + 1, + total, + Collections.singletonList(relativeRange.offset(baseRange.start)) + )); } } return postProcess(result); @@ -127,62 +149,79 @@ public class RagSplitStrategyRegistry { private List buildQaChunks(String content, StrategyConfig strategyConfig) { List result = new ArrayList(); String currentQuestion = null; - StringBuilder answerBuilder = new StringBuilder(); - StringBuilder questionBuilder = new StringBuilder(); + List answerSlices = new ArrayList(); + List questionSlices = new ArrayList(); + boolean answerStarted = false; int qaIndex = 1; - for (String rawLine : content.split("\\n")) { - String line = rawLine.trim(); + for (LineSlice lineSlice : sliceLines(content)) { + String line = lineSlice.trimmedLine; if (!StringUtil.hasText(line)) { continue; } Matcher questionMatcher = QUESTION_PREFIX.matcher(line); Matcher answerMatcher = ANSWER_PREFIX.matcher(line); if (questionMatcher.matches()) { - qaIndex = flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig); + qaIndex = flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig); currentQuestion = questionMatcher.group(2).trim(); - questionBuilder = new StringBuilder(currentQuestion); - answerBuilder = new StringBuilder(); + questionSlices = new ArrayList(); + answerSlices = new ArrayList(); + questionSlices.add(lineSlice); + answerStarted = false; continue; } if (answerMatcher.matches()) { - if (answerBuilder.length() > 0) { - answerBuilder.append('\n'); + if (!StringUtil.hasText(currentQuestion)) { + continue; } - answerBuilder.append(answerMatcher.group(2).trim()); + answerSlices.add(lineSlice); + answerStarted = true; continue; } - if (answerBuilder.length() > 0) { - answerBuilder.append('\n').append(rawLine.trim()); - } else if (questionBuilder.length() > 0) { - questionBuilder.append('\n').append(rawLine.trim()); + if (!StringUtil.hasText(currentQuestion)) { + continue; + } + if (answerStarted) { + answerSlices.add(lineSlice); + } else if (!questionSlices.isEmpty()) { + questionSlices.add(lineSlice); } } - flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig); + flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig); return postProcess(result); } private int flushQaChunk(List result, String currentQuestion, - StringBuilder questionBuilder, - StringBuilder answerBuilder, + List questionSlices, + List answerSlices, int qaIndex, StrategyConfig strategyConfig) { if (!StringUtil.hasText(currentQuestion)) { return qaIndex; } - if (!StringUtil.hasText(answerBuilder.toString())) { + if (answerSlices == null || answerSlices.isEmpty()) { return qaIndex; } - String question = questionBuilder.toString().trim(); - String answer = answerBuilder.toString().trim(); + String question = joinLineSlices(questionSlices); + String answer = joinLineSlices(answerSlices); String baseContent = "问题:" + question + "\n答案:" + answer; List subContents = baseContent.length() > safeChunkSize(strategyConfig) ? splitLongContent(baseContent, strategyConfig.getChunkSize()) : Collections.singletonList(baseContent); int total = subContents.size(); + List sourceRanges = buildQaSourceRanges(questionSlices, answerSlices); for (int i = 0; i < subContents.size(); i++) { - RagChunk chunk = createChunk(RagChunkTypes.QA_PAIR, "Q" + qaIndex + " " + question, Collections.emptyList(), subContents.get(i), result.size() + 1, i + 1, total); + RagChunk chunk = createChunk( + RagChunkTypes.QA_PAIR, + "Q" + qaIndex + " " + question, + Collections.emptyList(), + subContents.get(i), + result.size() + 1, + i + 1, + total, + sourceRanges + ); chunk.setQuestion(question); chunk.setAnswer(answer); chunk.getOptions().put(RagMetadataKeys.QA_GROUP_ID, "qa-" + qaIndex); @@ -193,11 +232,26 @@ public class RagSplitStrategyRegistry { private List buildParagraphChunks(String content, StrategyConfig strategyConfig) { List result = new ArrayList(); - DocumentSplitter splitter = new SimpleDocumentSplitter(safeChunkSize(strategyConfig), safeOverlap(strategyConfig)); - List docs = splitter.split(new Document(content)); int index = 1; - for (Document doc : docs) { - result.add(createChunk(RagChunkTypes.PARAGRAPH, "分块 " + index, Collections.emptyList(), doc.getContent(), index, 1, 1)); + int currentIndex = 0; + int maxIndex = content.length(); + while (currentIndex < maxIndex) { + int endIndex = Math.min(currentIndex + safeChunkSize(strategyConfig), maxIndex); + TextRange range = trimRange(content, currentIndex, endIndex); + currentIndex = currentIndex + safeChunkSize(strategyConfig) - safeOverlap(strategyConfig); + if (range == null) { + continue; + } + result.add(createChunk( + RagChunkTypes.PARAGRAPH, + "分块 " + index, + Collections.emptyList(), + content.substring(range.start, range.end), + index, + 1, + 1, + Collections.singletonList(range) + )); index++; } return postProcess(result); @@ -205,14 +259,16 @@ public class RagSplitStrategyRegistry { private List buildRegexChunks(String content, StrategyConfig strategyConfig) { String regex = StringUtil.hasText(strategyConfig.getRegex()) ? strategyConfig.getRegex() : "\\n\\s*\\n"; - DocumentSplitter splitter = new RegexDocumentSplitter(regex); - List docs = splitter.split(new Document(content)); List result = new ArrayList(); int index = 1; - for (Document doc : docs) { - result.add(createChunk(RagChunkTypes.PARAGRAPH, "正则分块 " + index, Collections.emptyList(), doc.getContent(), index, 1, 1)); - index++; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(content); + int segmentStart = 0; + while (matcher.find()) { + index = addRegexChunk(result, content, segmentStart, matcher.start(), index); + segmentStart = matcher.end(); } + addRegexChunk(result, content, segmentStart, content.length(), index); return postProcess(result); } @@ -275,7 +331,8 @@ public class RagSplitStrategyRegistry { String content, int index, int partNo, - int partTotal) { + int partTotal, + List sourceRanges) { RagChunk chunk = new RagChunk(); chunk.setChunkId("chunk-" + index); chunk.setChunkType(chunkType); @@ -290,9 +347,112 @@ public class RagSplitStrategyRegistry { if (RagChunkTypes.SECTION.equals(chunkType)) { chunk.getOptions().put(RagMetadataKeys.SOURCE_LABEL, sourceLabel); } + if (sourceRanges != null && !sourceRanges.isEmpty()) { + chunk.getOptions().put(RagMetadataKeys.SOURCE_RANGES, toSourceRangeMaps(sourceRanges)); + } return chunk; } + private int addRegexChunk(List result, String content, int rawStart, int rawEnd, int index) { + TextRange range = trimRange(content, rawStart, rawEnd); + if (range == null) { + return index; + } + result.add(createChunk( + RagChunkTypes.PARAGRAPH, + "正则分块 " + index, + Collections.emptyList(), + content.substring(range.start, range.end), + index, + 1, + 1, + Collections.singletonList(range) + )); + return index + 1; + } + + private List sliceLines(String content) { + List result = new ArrayList(); + if (content == null || content.isEmpty()) { + return result; + } + int start = 0; + for (int i = 0; i <= content.length(); i++) { + if (i < content.length() && content.charAt(i) != '\n') { + continue; + } + String rawLine = content.substring(start, i); + result.add(new LineSlice(start, i, rawLine)); + start = i + 1; + } + return result; + } + + private String joinLineSlices(List slices) { + List values = new ArrayList(); + for (LineSlice slice : slices) { + values.add(slice.trimmedLine); + } + return joinAndTrim(values); + } + + private List buildQaSourceRanges(List questionSlices, List answerSlices) { + List result = new ArrayList(); + TextRange questionRange = mergeLineSlices(questionSlices); + if (questionRange != null) { + result.add(questionRange); + } + TextRange answerRange = mergeLineSlices(answerSlices); + if (answerRange != null) { + result.add(answerRange); + } + return result; + } + + private TextRange mergeLineSlices(List slices) { + if (slices == null || slices.isEmpty()) { + return null; + } + return new TextRange(slices.get(0).start, slices.get(slices.size() - 1).end); + } + + private TextRange trimRange(String content, int rawStart, int rawEnd) { + int start = Math.max(0, rawStart); + int end = Math.min(content.length(), rawEnd); + while (start < end && Character.isWhitespace(content.charAt(start))) { + start++; + } + while (end > start && Character.isWhitespace(content.charAt(end - 1))) { + end--; + } + if (start >= end) { + return null; + } + return new TextRange(start, end); + } + + private TextRange findOrderedRange(String baseContent, String chunkContent, int searchStart, String label) { + int index = baseContent.indexOf(chunkContent, Math.max(0, searchStart)); + if (index < 0 && searchStart > 0) { + index = baseContent.indexOf(chunkContent); + } + if (index < 0) { + throw new IllegalStateException(label + "无法定位原文区间"); + } + return new TextRange(index, index + chunkContent.length()); + } + + private List> toSourceRangeMaps(List sourceRanges) { + List> result = new ArrayList>(); + for (TextRange sourceRange : sourceRanges) { + Map item = new LinkedHashMap(); + item.put("start", Integer.valueOf(sourceRange.start)); + item.put("end", Integer.valueOf(sourceRange.end)); + result.add(item); + } + return result; + } + private int safeChunkSize(StrategyConfig strategyConfig) { Integer chunkSize = strategyConfig.getChunkSize(); return chunkSize == null || chunkSize.intValue() <= 0 ? RagDefaults.CHUNK_SIZE : chunkSize.intValue(); @@ -320,11 +480,21 @@ public class RagSplitStrategyRegistry { private final List headingPath; private final String sourceLabel; private final List lines = new ArrayList(); + private int start = -1; + private int end = -1; private SectionChunk(List headingPath, String sourceLabel) { this.headingPath = headingPath; this.sourceLabel = sourceLabel; } + + private void addLine(LineSlice lineSlice) { + if (start < 0) { + start = lineSlice.start; + } + end = lineSlice.end; + lines.add(lineSlice.rawLine); + } } private static class HeadingLevel { @@ -385,4 +555,32 @@ public class RagSplitStrategyRegistry { return null; } } + + private static class LineSlice { + private final int start; + private final int end; + private final String rawLine; + private final String trimmedLine; + + private LineSlice(int start, int end, String rawLine) { + this.start = start; + this.end = end; + this.rawLine = rawLine == null ? "" : rawLine; + this.trimmedLine = this.rawLine.trim(); + } + } + + private static class TextRange { + private final int start; + private final int end; + + private TextRange(int start, int end) { + this.start = start; + this.end = end; + } + + private TextRange offset(int offset) { + return new TextRange(start + offset, end + offset); + } + } } diff --git a/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java b/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java index b5f0e5a..d1d94f3 100644 --- a/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java +++ b/easy-agents-rag/easy-agents-rag-ingestion/src/test/java/com/easyagents/rag/ingestion/RagIngestionPipelineTest.java @@ -12,6 +12,7 @@ import org.junit.Assert; import org.junit.Test; import java.util.List; +import java.util.Map; public class RagIngestionPipelineTest { @@ -58,6 +59,7 @@ public class RagIngestionPipelineTest { Assert.assertEquals(3, chunks.size()); Assert.assertEquals("第1章 总则", chunks.get(0).getSourceLabel()); Assert.assertEquals(2, chunks.get(1).getHeadingPath().size()); + assertHasValidSourceRanges(analysis, chunks.get(0)); } @Test @@ -76,5 +78,38 @@ public class RagIngestionPipelineTest { Assert.assertEquals(RagChunkTypes.QA_PAIR, chunks.get(0).getChunkType()); Assert.assertTrue(chunks.get(0).getContent().contains("问题")); Assert.assertTrue(chunks.get(1).getAnswer().contains("系统配置")); + assertHasValidSourceRanges(analysis, chunks.get(0)); + Assert.assertEquals(2, ((List) chunks.get(0).getOptions().get("sourceRanges")).size()); + } + + @Test + public void shouldSplitParagraphDocumentWithSourceRanges() { + String content = "第一段内容用于测试原文映射。\n第二段内容继续补充,便于生成多个分块。\n第三段内容再长一点,确保范围映射稳定。"; + AnalysisResult analysis = recommender.recommend(analyzer.analyze(content, "txt")); + StrategyConfig config = StrategyConfig.defaults(); + config.setStrategyCode(RagStrategyCodes.PARAGRAPH_LENGTH); + config.setChunkSize(18); + config.setOverlapSize(4); + + List chunks = registry.split(analysis, config); + + Assert.assertTrue(chunks.size() > 1); + assertHasValidSourceRanges(analysis, chunks.get(0)); + } + + @SuppressWarnings("unchecked") + private void assertHasValidSourceRanges(AnalysisResult analysis, RagChunk chunk) { + Object rawRanges = chunk.getOptions().get("sourceRanges"); + Assert.assertTrue(rawRanges instanceof List); + List> ranges = (List>) rawRanges; + Assert.assertFalse(ranges.isEmpty()); + int normalizedLength = analysis.getNormalizedContent().length(); + for (Map range : ranges) { + int start = ((Number) range.get("start")).intValue(); + int end = ((Number) range.get("end")).intValue(); + Assert.assertTrue(start >= 0); + Assert.assertTrue(end > start); + Assert.assertTrue(end <= normalizedLength); + } } }