feat: 完善统一文档解析与分块原文映射
- 兼容 MinerU docx 嵌套压缩工件与数组模型输出
- 补充异步解析日志与 sourceRanges 原文区间映射
This commit is contained in:
@@ -256,14 +256,29 @@ public class MineruMapper {
|
|||||||
result.setPlainText(markdown);
|
result.setPlainText(markdown);
|
||||||
|
|
||||||
ParseArtifacts artifacts = new ParseArtifacts();
|
ParseArtifacts artifacts = new ParseArtifacts();
|
||||||
JSONObject middleJson = firstJsonObject(bundle.entriesBySuffix, "_middle.json");
|
Object middleArtifact = firstJsonValue(bundle.entriesBySuffix, "_middle.json");
|
||||||
JSONArray contentList = firstJsonArray(bundle.entriesBySuffix, "_content_list.json");
|
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
|
||||||
JSONObject modelOutput = firstJsonObject(bundle.entriesBySuffix, "_model.json");
|
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
|
||||||
artifacts.setMiddleJson(middleJson);
|
|
||||||
artifacts.setContentList(contentList);
|
JSONObject middleJson = asObject(middleArtifact);
|
||||||
|
JSONArray contentList = asArray(contentListArtifact);
|
||||||
|
Object modelOutput = modelOutputArtifact;
|
||||||
|
|
||||||
|
// MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里,并且直接返回数组。
|
||||||
|
if (contentList == null && middleArtifact instanceof JSONArray) {
|
||||||
|
contentList = (JSONArray) middleArtifact;
|
||||||
|
middleJson = null;
|
||||||
|
middleArtifact = null;
|
||||||
|
}
|
||||||
|
if (contentList == null && modelOutputArtifact instanceof JSONArray) {
|
||||||
|
contentList = (JSONArray) modelOutputArtifact;
|
||||||
|
}
|
||||||
|
|
||||||
|
artifacts.setMiddleJson(middleArtifact);
|
||||||
|
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
|
||||||
artifacts.setModelOutput(modelOutput);
|
artifacts.setModelOutput(modelOutput);
|
||||||
|
|
||||||
JSONArray contentListV2 = firstJsonArray(bundle.entriesBySuffix, "_content_list_v2.json");
|
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
|
||||||
if (contentListV2 != null) {
|
if (contentListV2 != null) {
|
||||||
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
|
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
|
||||||
}
|
}
|
||||||
@@ -344,7 +359,11 @@ public class MineruMapper {
|
|||||||
block.setType(item.getString("type"));
|
block.setType(item.getString("type"));
|
||||||
block.setPageIndex(item.getInteger("page_idx"));
|
block.setPageIndex(item.getInteger("page_idx"));
|
||||||
block.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
|
block.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
|
||||||
block.setLevel(item.getInteger("text_level"));
|
Integer blockLevel = item.getInteger("text_level");
|
||||||
|
if (blockLevel == null) {
|
||||||
|
blockLevel = item.getInteger("level");
|
||||||
|
}
|
||||||
|
block.setLevel(blockLevel);
|
||||||
block.setText(extractBlockText(item));
|
block.setText(extractBlockText(item));
|
||||||
block.setHtml(item.getString("table_body"));
|
block.setHtml(item.getString("table_body"));
|
||||||
block.setImagePath(item.getString("img_path"));
|
block.setImagePath(item.getString("img_path"));
|
||||||
@@ -531,20 +550,16 @@ public class MineruMapper {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private JSONObject firstJsonObject(Map<String, byte[]> entries, String suffix) {
|
private Object firstJsonValue(Map<String, byte[]> entries, String suffix) {
|
||||||
String text = firstText(entries, suffix);
|
String text = firstText(entries, suffix);
|
||||||
if (!StringUtil.hasText(text)) {
|
if (!StringUtil.hasText(text)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return JSON.parseObject(text);
|
try {
|
||||||
}
|
return JSON.parse(text);
|
||||||
|
} catch (Exception exception) {
|
||||||
private JSONArray firstJsonArray(Map<String, byte[]> entries, String suffix) {
|
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
|
||||||
String text = firstText(entries, suffix);
|
|
||||||
if (!StringUtil.hasText(text)) {
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
return JSON.parseArray(text);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private JSONObject asObject(Object value) {
|
private JSONObject asObject(Object value) {
|
||||||
@@ -554,6 +569,9 @@ public class MineruMapper {
|
|||||||
if (value == null) {
|
if (value == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
if (value instanceof JSONArray) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
return JSON.parseObject(JSON.toJSONString(value));
|
return JSON.parseObject(JSON.toJSONString(value));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -622,8 +640,9 @@ public class MineruMapper {
|
|||||||
String type = item.getString("type");
|
String type = item.getString("type");
|
||||||
if ("text".equals(type) || "header".equals(type) || "footer".equals(type)
|
if ("text".equals(type) || "header".equals(type) || "footer".equals(type)
|
||||||
|| "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type)
|
|| "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type)
|
||||||
|| "equation".equals(type)) {
|
|| "equation".equals(type) || "title".equals(type)) {
|
||||||
return item.getString("text");
|
String text = item.getString("text");
|
||||||
|
return StringUtil.hasText(text) ? text : item.getString("content");
|
||||||
}
|
}
|
||||||
if ("list".equals(type)) {
|
if ("list".equals(type)) {
|
||||||
return joinList(toStringList(item.getJSONArray("list_items")));
|
return joinList(toStringList(item.getJSONArray("list_items")));
|
||||||
@@ -635,9 +654,11 @@ public class MineruMapper {
|
|||||||
return joinList(toStringList(item.getJSONArray("image_caption")));
|
return joinList(toStringList(item.getJSONArray("image_caption")));
|
||||||
}
|
}
|
||||||
if ("table".equals(type)) {
|
if ("table".equals(type)) {
|
||||||
return joinList(toStringList(item.getJSONArray("table_caption")));
|
String tableCaption = joinList(toStringList(item.getJSONArray("table_caption")));
|
||||||
|
return StringUtil.hasText(tableCaption) ? tableCaption : item.getString("content");
|
||||||
}
|
}
|
||||||
return item.getString("text");
|
String text = item.getString("text");
|
||||||
|
return StringUtil.hasText(text) ? text : item.getString("content");
|
||||||
}
|
}
|
||||||
|
|
||||||
private String extractTextFromMiddleBlock(JSONObject blockJson) {
|
private String extractTextFromMiddleBlock(JSONObject blockJson) {
|
||||||
@@ -768,6 +789,9 @@ public class MineruMapper {
|
|||||||
return imageDataUrls.get(imagePath);
|
return imageDataUrls.get(imagePath);
|
||||||
}
|
}
|
||||||
String baseName = baseName(imagePath);
|
String baseName = baseName(imagePath);
|
||||||
|
if (!StringUtil.hasText(baseName)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
for (Map.Entry<String, String> entry : imageDataUrls.entrySet()) {
|
for (Map.Entry<String, String> entry : imageDataUrls.entrySet()) {
|
||||||
if (baseName.equals(baseName(entry.getKey()))) {
|
if (baseName.equals(baseName(entry.getKey()))) {
|
||||||
return entry.getValue();
|
return entry.getValue();
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ import com.easyagents.document.core.model.ParseResponse;
|
|||||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
import com.easyagents.document.core.model.ParseTaskInfo;
|
||||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
import com.easyagents.document.core.model.ParseTaskStatus;
|
||||||
import com.easyagents.document.pdf.PdfDocumentProvider;
|
import com.easyagents.document.pdf.PdfDocumentProvider;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
||||||
@@ -19,6 +21,7 @@ import java.util.ArrayList;
|
|||||||
public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||||
|
|
||||||
public static final String PROVIDER_NAME = "mineru";
|
public static final String PROVIDER_NAME = "mineru";
|
||||||
|
private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class);
|
||||||
|
|
||||||
private final MineruProperties properties;
|
private final MineruProperties properties;
|
||||||
private final MineruPdfClient client;
|
private final MineruPdfClient client;
|
||||||
@@ -64,7 +67,17 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
|||||||
@Override
|
@Override
|
||||||
public ParseResponse parse(ParseRequest request) {
|
public ParseResponse parse(ParseRequest request) {
|
||||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||||
return mapper.toParseResponse(client.parse(normalizedRequest));
|
LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||||
|
normalizedRequest.getBackend(),
|
||||||
|
normalizedRequest.getParseMethod());
|
||||||
|
ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
|
||||||
|
LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||||
|
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||||
|
return response;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -76,21 +89,41 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
|||||||
normalizedRequest.setReturnContentList(true);
|
normalizedRequest.setReturnContentList(true);
|
||||||
normalizedRequest.setReturnModelOutput(true);
|
normalizedRequest.setReturnModelOutput(true);
|
||||||
normalizedRequest.setReturnImages(true);
|
normalizedRequest.setReturnImages(true);
|
||||||
return mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||||
|
normalizedRequest.getBackend(),
|
||||||
|
normalizedRequest.getParseMethod());
|
||||||
|
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
||||||
|
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
taskStatus == null ? null : taskStatus.getTaskId(),
|
||||||
|
taskStatus == null ? null : taskStatus.getStatus());
|
||||||
|
return taskStatus;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ParseTaskStatus queryTask(String taskId) {
|
public ParseTaskStatus queryTask(String taskId) {
|
||||||
validateTaskId(taskId);
|
validateTaskId(taskId);
|
||||||
return mapper.toParseTaskStatus(client.queryTask(taskId));
|
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
|
||||||
|
LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
taskId,
|
||||||
|
taskStatus == null ? null : taskStatus.getStatus());
|
||||||
|
return taskStatus;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ParseResponse queryResult(String taskId) {
|
public ParseResponse queryResult(String taskId) {
|
||||||
validateTaskId(taskId);
|
validateTaskId(taskId);
|
||||||
|
LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);
|
||||||
MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
|
MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
|
||||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||||
|
LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
taskId,
|
||||||
|
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||||
return response;
|
return response;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -104,6 +137,11 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
|||||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||||
taskInfo.setResult(response);
|
taskInfo.setResult(response);
|
||||||
}
|
}
|
||||||
|
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
|
||||||
|
PROVIDER_NAME,
|
||||||
|
taskId,
|
||||||
|
taskInfo == null ? null : taskInfo.getStatus(),
|
||||||
|
taskInfo != null && taskInfo.getResult() != null);
|
||||||
return taskInfo;
|
return taskInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -121,6 +121,21 @@ public class MineruMapperTest {
|
|||||||
Assert.assertEquals("3.0.9", response.getVersion());
|
Assert.assertEquals("3.0.9", response.getVersion());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldMapNestedZipWhenModelArtifactIsArray() throws IOException {
|
||||||
|
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||||
|
|
||||||
|
ParseResponse response = mapper.fromZip(buildNestedDocxZipWithArrayModel());
|
||||||
|
|
||||||
|
Assert.assertEquals(1, response.getResults().size());
|
||||||
|
ParseResult result = response.getResults().get(0);
|
||||||
|
Assert.assertEquals("demo", result.getFileName());
|
||||||
|
Assert.assertEquals("# nested", result.getMarkdown());
|
||||||
|
Assert.assertFalse(result.getBlocks().isEmpty());
|
||||||
|
Assert.assertTrue(result.getArtifacts().getModelOutput() instanceof JSONArray);
|
||||||
|
Assert.assertTrue(result.getArtifacts().getContentList() instanceof JSONArray);
|
||||||
|
}
|
||||||
|
|
||||||
private MineruProperties defaultProperties() {
|
private MineruProperties defaultProperties() {
|
||||||
MineruProperties properties = new MineruProperties();
|
MineruProperties properties = new MineruProperties();
|
||||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||||
@@ -354,6 +369,45 @@ public class MineruMapperTest {
|
|||||||
return outputStream.toByteArray();
|
return outputStream.toByteArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private byte[] buildNestedDocxZipWithArrayModel() throws IOException {
|
||||||
|
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream);
|
||||||
|
addEntry(zipOutputStream, "demo/vlm/markdown/demo.md", "# nested");
|
||||||
|
addEntry(zipOutputStream, "demo/vlm/layout/demo_middle.json", middleJson().toJSONString());
|
||||||
|
addEntry(zipOutputStream, "demo/vlm/model/demo_model.json", nestedDocxContentList().toJSONString());
|
||||||
|
addBinaryEntry(zipOutputStream, "demo/vlm/images/figure.png", "image".getBytes(StandardCharsets.UTF_8));
|
||||||
|
zipOutputStream.close();
|
||||||
|
return outputStream.toByteArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
private JSONArray nestedDocxContentList() {
|
||||||
|
JSONArray contentList = new JSONArray();
|
||||||
|
|
||||||
|
JSONObject title = new JSONObject();
|
||||||
|
title.put("type", "title");
|
||||||
|
title.put("content", "二、技术要求");
|
||||||
|
title.put("page_idx", 0);
|
||||||
|
title.put("bbox", bbox());
|
||||||
|
contentList.add(title);
|
||||||
|
|
||||||
|
JSONObject text = new JSONObject();
|
||||||
|
text.put("type", "text");
|
||||||
|
text.put("content", "响应方式");
|
||||||
|
text.put("page_idx", 0);
|
||||||
|
text.put("bbox", bbox());
|
||||||
|
contentList.add(text);
|
||||||
|
|
||||||
|
JSONObject table = new JSONObject();
|
||||||
|
table.put("type", "table");
|
||||||
|
table.put("content", "<table></table>");
|
||||||
|
table.put("table_body", "<table></table>");
|
||||||
|
table.put("page_idx", 0);
|
||||||
|
table.put("bbox", bbox());
|
||||||
|
contentList.add(table);
|
||||||
|
|
||||||
|
return contentList;
|
||||||
|
}
|
||||||
|
|
||||||
private void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException {
|
private void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException {
|
||||||
addBinaryEntry(zipOutputStream, name, content.getBytes(StandardCharsets.UTF_8));
|
addBinaryEntry(zipOutputStream, name, content.getBytes(StandardCharsets.UTF_8));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,4 +17,5 @@ public final class RagMetadataKeys {
|
|||||||
public static final String PART_NO = "partNo";
|
public static final String PART_NO = "partNo";
|
||||||
public static final String PART_TOTAL = "partTotal";
|
public static final String PART_TOTAL = "partTotal";
|
||||||
public static final String WARNINGS = "warnings";
|
public static final String WARNINGS = "warnings";
|
||||||
|
public static final String SOURCE_RANGES = "sourceRanges";
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,5 @@
|
|||||||
package com.easyagents.rag.ingestion.chunk;
|
package com.easyagents.rag.ingestion.chunk;
|
||||||
|
|
||||||
import com.easyagents.core.document.Document;
|
|
||||||
import com.easyagents.core.document.DocumentSplitter;
|
|
||||||
import com.easyagents.core.document.splitter.RegexDocumentSplitter;
|
|
||||||
import com.easyagents.core.document.splitter.SimpleDocumentSplitter;
|
|
||||||
import com.easyagents.core.util.StringUtil;
|
import com.easyagents.core.util.StringUtil;
|
||||||
import com.easyagents.rag.core.*;
|
import com.easyagents.rag.core.*;
|
||||||
import com.easyagents.rag.ingestion.model.AnalysisResult;
|
import com.easyagents.rag.ingestion.model.AnalysisResult;
|
||||||
@@ -41,12 +37,12 @@ public class RagSplitStrategyRegistry {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private List<RagChunk> buildMarkdownChunks(String content, StrategyConfig strategyConfig) {
|
private List<RagChunk> buildMarkdownChunks(String content, StrategyConfig strategyConfig) {
|
||||||
List<String> lines = Arrays.asList(content.split("\\n"));
|
List<LineSlice> lines = sliceLines(content);
|
||||||
List<SectionChunk> sections = new ArrayList<SectionChunk>();
|
List<SectionChunk> sections = new ArrayList<SectionChunk>();
|
||||||
Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
|
Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
|
||||||
SectionChunk current = null;
|
SectionChunk current = null;
|
||||||
for (String rawLine : lines) {
|
for (LineSlice lineSlice : lines) {
|
||||||
String line = rawLine.trim();
|
String line = lineSlice.trimmedLine;
|
||||||
Matcher matcher = MARKDOWN_HEADING.matcher(line);
|
Matcher matcher = MARKDOWN_HEADING.matcher(line);
|
||||||
if (matcher.matches()) {
|
if (matcher.matches()) {
|
||||||
if (current != null) {
|
if (current != null) {
|
||||||
@@ -58,27 +54,27 @@ public class RagSplitStrategyRegistry {
|
|||||||
}
|
}
|
||||||
stack.addLast(new HeadingLevel(level, matcher.group(2).trim()));
|
stack.addLast(new HeadingLevel(level, matcher.group(2).trim()));
|
||||||
current = new SectionChunk(copyPath(stack), matcher.group(2).trim());
|
current = new SectionChunk(copyPath(stack), matcher.group(2).trim());
|
||||||
current.lines.add(line);
|
current.addLine(lineSlice);
|
||||||
} else {
|
} else {
|
||||||
if (current == null) {
|
if (current == null) {
|
||||||
current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
|
current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
|
||||||
}
|
}
|
||||||
current.lines.add(rawLine);
|
current.addLine(lineSlice);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (current != null) {
|
if (current != null) {
|
||||||
sections.add(current);
|
sections.add(current);
|
||||||
}
|
}
|
||||||
return finalizeSectionChunks(sections, strategyConfig);
|
return finalizeSectionChunks(content, sections, strategyConfig);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<RagChunk> buildOutlineChunks(String content, StrategyConfig strategyConfig) {
|
private List<RagChunk> buildOutlineChunks(String content, StrategyConfig strategyConfig) {
|
||||||
List<String> lines = Arrays.asList(content.split("\\n"));
|
List<LineSlice> lines = sliceLines(content);
|
||||||
List<SectionChunk> sections = new ArrayList<SectionChunk>();
|
List<SectionChunk> sections = new ArrayList<SectionChunk>();
|
||||||
Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
|
Deque<HeadingLevel> stack = new ArrayDeque<HeadingLevel>();
|
||||||
SectionChunk current = null;
|
SectionChunk current = null;
|
||||||
for (String rawLine : lines) {
|
for (LineSlice lineSlice : lines) {
|
||||||
String line = rawLine.trim();
|
String line = lineSlice.trimmedLine;
|
||||||
OutlineHeading heading = OutlineHeading.parse(line);
|
OutlineHeading heading = OutlineHeading.parse(line);
|
||||||
if (heading != null) {
|
if (heading != null) {
|
||||||
if (current != null) {
|
if (current != null) {
|
||||||
@@ -89,36 +85,62 @@ public class RagSplitStrategyRegistry {
|
|||||||
}
|
}
|
||||||
stack.addLast(new HeadingLevel(heading.level, heading.title));
|
stack.addLast(new HeadingLevel(heading.level, heading.title));
|
||||||
current = new SectionChunk(copyPath(stack), heading.title);
|
current = new SectionChunk(copyPath(stack), heading.title);
|
||||||
current.lines.add(line);
|
current.addLine(lineSlice);
|
||||||
} else {
|
} else {
|
||||||
if (current == null) {
|
if (current == null) {
|
||||||
current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
|
current = new SectionChunk(Collections.singletonList("未命名段落"), "未命名段落");
|
||||||
}
|
}
|
||||||
current.lines.add(rawLine);
|
current.addLine(lineSlice);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (current != null) {
|
if (current != null) {
|
||||||
sections.add(current);
|
sections.add(current);
|
||||||
}
|
}
|
||||||
return finalizeSectionChunks(sections, strategyConfig);
|
return finalizeSectionChunks(content, sections, strategyConfig);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<RagChunk> finalizeSectionChunks(List<SectionChunk> sections, StrategyConfig strategyConfig) {
|
private List<RagChunk> finalizeSectionChunks(String content, List<SectionChunk> sections, StrategyConfig strategyConfig) {
|
||||||
List<RagChunk> result = new ArrayList<RagChunk>();
|
List<RagChunk> result = new ArrayList<RagChunk>();
|
||||||
int index = 1;
|
int index = 1;
|
||||||
for (SectionChunk section : sections) {
|
for (SectionChunk section : sections) {
|
||||||
String content = joinAndTrim(section.lines);
|
TextRange baseRange = trimRange(content, section.start, section.end);
|
||||||
if (!StringUtil.hasText(content) || content.equals(section.sourceLabel)) {
|
if (baseRange == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (content.length() <= safeChunkSize(strategyConfig)) {
|
String sectionContent = content.substring(baseRange.start, baseRange.end);
|
||||||
result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, content, index++, 1, 1));
|
if (!StringUtil.hasText(sectionContent) || sectionContent.equals(section.sourceLabel)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
List<String> subContents = splitLongContent(content, strategyConfig.getChunkSize());
|
if (sectionContent.length() <= safeChunkSize(strategyConfig)) {
|
||||||
|
result.add(createChunk(
|
||||||
|
RagChunkTypes.SECTION,
|
||||||
|
section.sourceLabel,
|
||||||
|
section.headingPath,
|
||||||
|
sectionContent,
|
||||||
|
index++,
|
||||||
|
1,
|
||||||
|
1,
|
||||||
|
Collections.singletonList(baseRange)
|
||||||
|
));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
List<String> subContents = splitLongContent(sectionContent, strategyConfig.getChunkSize());
|
||||||
int total = subContents.size();
|
int total = subContents.size();
|
||||||
|
int relativeCursor = 0;
|
||||||
for (int i = 0; i < subContents.size(); i++) {
|
for (int i = 0; i < subContents.size(); i++) {
|
||||||
result.add(createChunk(RagChunkTypes.SECTION, section.sourceLabel, section.headingPath, subContents.get(i), index++, i + 1, total));
|
String subContent = subContents.get(i);
|
||||||
|
TextRange relativeRange = findOrderedRange(sectionContent, subContent, relativeCursor, "章节分块");
|
||||||
|
relativeCursor = relativeRange.end;
|
||||||
|
result.add(createChunk(
|
||||||
|
RagChunkTypes.SECTION,
|
||||||
|
section.sourceLabel,
|
||||||
|
section.headingPath,
|
||||||
|
subContent,
|
||||||
|
index++,
|
||||||
|
i + 1,
|
||||||
|
total,
|
||||||
|
Collections.singletonList(relativeRange.offset(baseRange.start))
|
||||||
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return postProcess(result);
|
return postProcess(result);
|
||||||
@@ -127,62 +149,79 @@ public class RagSplitStrategyRegistry {
|
|||||||
private List<RagChunk> buildQaChunks(String content, StrategyConfig strategyConfig) {
|
private List<RagChunk> buildQaChunks(String content, StrategyConfig strategyConfig) {
|
||||||
List<RagChunk> result = new ArrayList<RagChunk>();
|
List<RagChunk> result = new ArrayList<RagChunk>();
|
||||||
String currentQuestion = null;
|
String currentQuestion = null;
|
||||||
StringBuilder answerBuilder = new StringBuilder();
|
List<LineSlice> answerSlices = new ArrayList<LineSlice>();
|
||||||
StringBuilder questionBuilder = new StringBuilder();
|
List<LineSlice> questionSlices = new ArrayList<LineSlice>();
|
||||||
|
boolean answerStarted = false;
|
||||||
int qaIndex = 1;
|
int qaIndex = 1;
|
||||||
|
|
||||||
for (String rawLine : content.split("\\n")) {
|
for (LineSlice lineSlice : sliceLines(content)) {
|
||||||
String line = rawLine.trim();
|
String line = lineSlice.trimmedLine;
|
||||||
if (!StringUtil.hasText(line)) {
|
if (!StringUtil.hasText(line)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
Matcher questionMatcher = QUESTION_PREFIX.matcher(line);
|
Matcher questionMatcher = QUESTION_PREFIX.matcher(line);
|
||||||
Matcher answerMatcher = ANSWER_PREFIX.matcher(line);
|
Matcher answerMatcher = ANSWER_PREFIX.matcher(line);
|
||||||
if (questionMatcher.matches()) {
|
if (questionMatcher.matches()) {
|
||||||
qaIndex = flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
|
qaIndex = flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig);
|
||||||
currentQuestion = questionMatcher.group(2).trim();
|
currentQuestion = questionMatcher.group(2).trim();
|
||||||
questionBuilder = new StringBuilder(currentQuestion);
|
questionSlices = new ArrayList<LineSlice>();
|
||||||
answerBuilder = new StringBuilder();
|
answerSlices = new ArrayList<LineSlice>();
|
||||||
|
questionSlices.add(lineSlice);
|
||||||
|
answerStarted = false;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (answerMatcher.matches()) {
|
if (answerMatcher.matches()) {
|
||||||
if (answerBuilder.length() > 0) {
|
if (!StringUtil.hasText(currentQuestion)) {
|
||||||
answerBuilder.append('\n');
|
continue;
|
||||||
}
|
}
|
||||||
answerBuilder.append(answerMatcher.group(2).trim());
|
answerSlices.add(lineSlice);
|
||||||
|
answerStarted = true;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (answerBuilder.length() > 0) {
|
if (!StringUtil.hasText(currentQuestion)) {
|
||||||
answerBuilder.append('\n').append(rawLine.trim());
|
continue;
|
||||||
} else if (questionBuilder.length() > 0) {
|
}
|
||||||
questionBuilder.append('\n').append(rawLine.trim());
|
if (answerStarted) {
|
||||||
|
answerSlices.add(lineSlice);
|
||||||
|
} else if (!questionSlices.isEmpty()) {
|
||||||
|
questionSlices.add(lineSlice);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
flushQaChunk(result, currentQuestion, questionBuilder, answerBuilder, qaIndex, strategyConfig);
|
flushQaChunk(result, currentQuestion, questionSlices, answerSlices, qaIndex, strategyConfig);
|
||||||
return postProcess(result);
|
return postProcess(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
private int flushQaChunk(List<RagChunk> result,
|
private int flushQaChunk(List<RagChunk> result,
|
||||||
String currentQuestion,
|
String currentQuestion,
|
||||||
StringBuilder questionBuilder,
|
List<LineSlice> questionSlices,
|
||||||
StringBuilder answerBuilder,
|
List<LineSlice> answerSlices,
|
||||||
int qaIndex,
|
int qaIndex,
|
||||||
StrategyConfig strategyConfig) {
|
StrategyConfig strategyConfig) {
|
||||||
if (!StringUtil.hasText(currentQuestion)) {
|
if (!StringUtil.hasText(currentQuestion)) {
|
||||||
return qaIndex;
|
return qaIndex;
|
||||||
}
|
}
|
||||||
if (!StringUtil.hasText(answerBuilder.toString())) {
|
if (answerSlices == null || answerSlices.isEmpty()) {
|
||||||
return qaIndex;
|
return qaIndex;
|
||||||
}
|
}
|
||||||
String question = questionBuilder.toString().trim();
|
String question = joinLineSlices(questionSlices);
|
||||||
String answer = answerBuilder.toString().trim();
|
String answer = joinLineSlices(answerSlices);
|
||||||
String baseContent = "问题:" + question + "\n答案:" + answer;
|
String baseContent = "问题:" + question + "\n答案:" + answer;
|
||||||
List<String> subContents = baseContent.length() > safeChunkSize(strategyConfig)
|
List<String> subContents = baseContent.length() > safeChunkSize(strategyConfig)
|
||||||
? splitLongContent(baseContent, strategyConfig.getChunkSize())
|
? splitLongContent(baseContent, strategyConfig.getChunkSize())
|
||||||
: Collections.singletonList(baseContent);
|
: Collections.singletonList(baseContent);
|
||||||
int total = subContents.size();
|
int total = subContents.size();
|
||||||
|
List<TextRange> sourceRanges = buildQaSourceRanges(questionSlices, answerSlices);
|
||||||
for (int i = 0; i < subContents.size(); i++) {
|
for (int i = 0; i < subContents.size(); i++) {
|
||||||
RagChunk chunk = createChunk(RagChunkTypes.QA_PAIR, "Q" + qaIndex + " " + question, Collections.<String>emptyList(), subContents.get(i), result.size() + 1, i + 1, total);
|
RagChunk chunk = createChunk(
|
||||||
|
RagChunkTypes.QA_PAIR,
|
||||||
|
"Q" + qaIndex + " " + question,
|
||||||
|
Collections.<String>emptyList(),
|
||||||
|
subContents.get(i),
|
||||||
|
result.size() + 1,
|
||||||
|
i + 1,
|
||||||
|
total,
|
||||||
|
sourceRanges
|
||||||
|
);
|
||||||
chunk.setQuestion(question);
|
chunk.setQuestion(question);
|
||||||
chunk.setAnswer(answer);
|
chunk.setAnswer(answer);
|
||||||
chunk.getOptions().put(RagMetadataKeys.QA_GROUP_ID, "qa-" + qaIndex);
|
chunk.getOptions().put(RagMetadataKeys.QA_GROUP_ID, "qa-" + qaIndex);
|
||||||
@@ -193,11 +232,26 @@ public class RagSplitStrategyRegistry {
|
|||||||
|
|
||||||
private List<RagChunk> buildParagraphChunks(String content, StrategyConfig strategyConfig) {
|
private List<RagChunk> buildParagraphChunks(String content, StrategyConfig strategyConfig) {
|
||||||
List<RagChunk> result = new ArrayList<RagChunk>();
|
List<RagChunk> result = new ArrayList<RagChunk>();
|
||||||
DocumentSplitter splitter = new SimpleDocumentSplitter(safeChunkSize(strategyConfig), safeOverlap(strategyConfig));
|
|
||||||
List<Document> docs = splitter.split(new Document(content));
|
|
||||||
int index = 1;
|
int index = 1;
|
||||||
for (Document doc : docs) {
|
int currentIndex = 0;
|
||||||
result.add(createChunk(RagChunkTypes.PARAGRAPH, "分块 " + index, Collections.<String>emptyList(), doc.getContent(), index, 1, 1));
|
int maxIndex = content.length();
|
||||||
|
while (currentIndex < maxIndex) {
|
||||||
|
int endIndex = Math.min(currentIndex + safeChunkSize(strategyConfig), maxIndex);
|
||||||
|
TextRange range = trimRange(content, currentIndex, endIndex);
|
||||||
|
currentIndex = currentIndex + safeChunkSize(strategyConfig) - safeOverlap(strategyConfig);
|
||||||
|
if (range == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
result.add(createChunk(
|
||||||
|
RagChunkTypes.PARAGRAPH,
|
||||||
|
"分块 " + index,
|
||||||
|
Collections.<String>emptyList(),
|
||||||
|
content.substring(range.start, range.end),
|
||||||
|
index,
|
||||||
|
1,
|
||||||
|
1,
|
||||||
|
Collections.singletonList(range)
|
||||||
|
));
|
||||||
index++;
|
index++;
|
||||||
}
|
}
|
||||||
return postProcess(result);
|
return postProcess(result);
|
||||||
@@ -205,14 +259,16 @@ public class RagSplitStrategyRegistry {
|
|||||||
|
|
||||||
private List<RagChunk> buildRegexChunks(String content, StrategyConfig strategyConfig) {
|
private List<RagChunk> buildRegexChunks(String content, StrategyConfig strategyConfig) {
|
||||||
String regex = StringUtil.hasText(strategyConfig.getRegex()) ? strategyConfig.getRegex() : "\\n\\s*\\n";
|
String regex = StringUtil.hasText(strategyConfig.getRegex()) ? strategyConfig.getRegex() : "\\n\\s*\\n";
|
||||||
DocumentSplitter splitter = new RegexDocumentSplitter(regex);
|
|
||||||
List<Document> docs = splitter.split(new Document(content));
|
|
||||||
List<RagChunk> result = new ArrayList<RagChunk>();
|
List<RagChunk> result = new ArrayList<RagChunk>();
|
||||||
int index = 1;
|
int index = 1;
|
||||||
for (Document doc : docs) {
|
Pattern pattern = Pattern.compile(regex);
|
||||||
result.add(createChunk(RagChunkTypes.PARAGRAPH, "正则分块 " + index, Collections.<String>emptyList(), doc.getContent(), index, 1, 1));
|
Matcher matcher = pattern.matcher(content);
|
||||||
index++;
|
int segmentStart = 0;
|
||||||
|
while (matcher.find()) {
|
||||||
|
index = addRegexChunk(result, content, segmentStart, matcher.start(), index);
|
||||||
|
segmentStart = matcher.end();
|
||||||
}
|
}
|
||||||
|
addRegexChunk(result, content, segmentStart, content.length(), index);
|
||||||
return postProcess(result);
|
return postProcess(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -275,7 +331,8 @@ public class RagSplitStrategyRegistry {
|
|||||||
String content,
|
String content,
|
||||||
int index,
|
int index,
|
||||||
int partNo,
|
int partNo,
|
||||||
int partTotal) {
|
int partTotal,
|
||||||
|
List<TextRange> sourceRanges) {
|
||||||
RagChunk chunk = new RagChunk();
|
RagChunk chunk = new RagChunk();
|
||||||
chunk.setChunkId("chunk-" + index);
|
chunk.setChunkId("chunk-" + index);
|
||||||
chunk.setChunkType(chunkType);
|
chunk.setChunkType(chunkType);
|
||||||
@@ -290,9 +347,112 @@ public class RagSplitStrategyRegistry {
|
|||||||
if (RagChunkTypes.SECTION.equals(chunkType)) {
|
if (RagChunkTypes.SECTION.equals(chunkType)) {
|
||||||
chunk.getOptions().put(RagMetadataKeys.SOURCE_LABEL, sourceLabel);
|
chunk.getOptions().put(RagMetadataKeys.SOURCE_LABEL, sourceLabel);
|
||||||
}
|
}
|
||||||
|
if (sourceRanges != null && !sourceRanges.isEmpty()) {
|
||||||
|
chunk.getOptions().put(RagMetadataKeys.SOURCE_RANGES, toSourceRangeMaps(sourceRanges));
|
||||||
|
}
|
||||||
return chunk;
|
return chunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private int addRegexChunk(List<RagChunk> result, String content, int rawStart, int rawEnd, int index) {
|
||||||
|
TextRange range = trimRange(content, rawStart, rawEnd);
|
||||||
|
if (range == null) {
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
result.add(createChunk(
|
||||||
|
RagChunkTypes.PARAGRAPH,
|
||||||
|
"正则分块 " + index,
|
||||||
|
Collections.<String>emptyList(),
|
||||||
|
content.substring(range.start, range.end),
|
||||||
|
index,
|
||||||
|
1,
|
||||||
|
1,
|
||||||
|
Collections.singletonList(range)
|
||||||
|
));
|
||||||
|
return index + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<LineSlice> sliceLines(String content) {
|
||||||
|
List<LineSlice> result = new ArrayList<LineSlice>();
|
||||||
|
if (content == null || content.isEmpty()) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
int start = 0;
|
||||||
|
for (int i = 0; i <= content.length(); i++) {
|
||||||
|
if (i < content.length() && content.charAt(i) != '\n') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
String rawLine = content.substring(start, i);
|
||||||
|
result.add(new LineSlice(start, i, rawLine));
|
||||||
|
start = i + 1;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String joinLineSlices(List<LineSlice> slices) {
|
||||||
|
List<String> values = new ArrayList<String>();
|
||||||
|
for (LineSlice slice : slices) {
|
||||||
|
values.add(slice.trimmedLine);
|
||||||
|
}
|
||||||
|
return joinAndTrim(values);
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<TextRange> buildQaSourceRanges(List<LineSlice> questionSlices, List<LineSlice> answerSlices) {
|
||||||
|
List<TextRange> result = new ArrayList<TextRange>();
|
||||||
|
TextRange questionRange = mergeLineSlices(questionSlices);
|
||||||
|
if (questionRange != null) {
|
||||||
|
result.add(questionRange);
|
||||||
|
}
|
||||||
|
TextRange answerRange = mergeLineSlices(answerSlices);
|
||||||
|
if (answerRange != null) {
|
||||||
|
result.add(answerRange);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private TextRange mergeLineSlices(List<LineSlice> slices) {
|
||||||
|
if (slices == null || slices.isEmpty()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return new TextRange(slices.get(0).start, slices.get(slices.size() - 1).end);
|
||||||
|
}
|
||||||
|
|
||||||
|
private TextRange trimRange(String content, int rawStart, int rawEnd) {
|
||||||
|
int start = Math.max(0, rawStart);
|
||||||
|
int end = Math.min(content.length(), rawEnd);
|
||||||
|
while (start < end && Character.isWhitespace(content.charAt(start))) {
|
||||||
|
start++;
|
||||||
|
}
|
||||||
|
while (end > start && Character.isWhitespace(content.charAt(end - 1))) {
|
||||||
|
end--;
|
||||||
|
}
|
||||||
|
if (start >= end) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return new TextRange(start, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
private TextRange findOrderedRange(String baseContent, String chunkContent, int searchStart, String label) {
|
||||||
|
int index = baseContent.indexOf(chunkContent, Math.max(0, searchStart));
|
||||||
|
if (index < 0 && searchStart > 0) {
|
||||||
|
index = baseContent.indexOf(chunkContent);
|
||||||
|
}
|
||||||
|
if (index < 0) {
|
||||||
|
throw new IllegalStateException(label + "无法定位原文区间");
|
||||||
|
}
|
||||||
|
return new TextRange(index, index + chunkContent.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Map<String, Object>> toSourceRangeMaps(List<TextRange> sourceRanges) {
|
||||||
|
List<Map<String, Object>> result = new ArrayList<Map<String, Object>>();
|
||||||
|
for (TextRange sourceRange : sourceRanges) {
|
||||||
|
Map<String, Object> item = new LinkedHashMap<String, Object>();
|
||||||
|
item.put("start", Integer.valueOf(sourceRange.start));
|
||||||
|
item.put("end", Integer.valueOf(sourceRange.end));
|
||||||
|
result.add(item);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
private int safeChunkSize(StrategyConfig strategyConfig) {
|
private int safeChunkSize(StrategyConfig strategyConfig) {
|
||||||
Integer chunkSize = strategyConfig.getChunkSize();
|
Integer chunkSize = strategyConfig.getChunkSize();
|
||||||
return chunkSize == null || chunkSize.intValue() <= 0 ? RagDefaults.CHUNK_SIZE : chunkSize.intValue();
|
return chunkSize == null || chunkSize.intValue() <= 0 ? RagDefaults.CHUNK_SIZE : chunkSize.intValue();
|
||||||
@@ -320,11 +480,21 @@ public class RagSplitStrategyRegistry {
|
|||||||
private final List<String> headingPath;
|
private final List<String> headingPath;
|
||||||
private final String sourceLabel;
|
private final String sourceLabel;
|
||||||
private final List<String> lines = new ArrayList<String>();
|
private final List<String> lines = new ArrayList<String>();
|
||||||
|
private int start = -1;
|
||||||
|
private int end = -1;
|
||||||
|
|
||||||
private SectionChunk(List<String> headingPath, String sourceLabel) {
|
private SectionChunk(List<String> headingPath, String sourceLabel) {
|
||||||
this.headingPath = headingPath;
|
this.headingPath = headingPath;
|
||||||
this.sourceLabel = sourceLabel;
|
this.sourceLabel = sourceLabel;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void addLine(LineSlice lineSlice) {
|
||||||
|
if (start < 0) {
|
||||||
|
start = lineSlice.start;
|
||||||
|
}
|
||||||
|
end = lineSlice.end;
|
||||||
|
lines.add(lineSlice.rawLine);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class HeadingLevel {
|
private static class HeadingLevel {
|
||||||
@@ -385,4 +555,32 @@ public class RagSplitStrategyRegistry {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class LineSlice {
|
||||||
|
private final int start;
|
||||||
|
private final int end;
|
||||||
|
private final String rawLine;
|
||||||
|
private final String trimmedLine;
|
||||||
|
|
||||||
|
private LineSlice(int start, int end, String rawLine) {
|
||||||
|
this.start = start;
|
||||||
|
this.end = end;
|
||||||
|
this.rawLine = rawLine == null ? "" : rawLine;
|
||||||
|
this.trimmedLine = this.rawLine.trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class TextRange {
|
||||||
|
private final int start;
|
||||||
|
private final int end;
|
||||||
|
|
||||||
|
private TextRange(int start, int end) {
|
||||||
|
this.start = start;
|
||||||
|
this.end = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
private TextRange offset(int offset) {
|
||||||
|
return new TextRange(start + offset, end + offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import org.junit.Assert;
|
|||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
public class RagIngestionPipelineTest {
|
public class RagIngestionPipelineTest {
|
||||||
|
|
||||||
@@ -58,6 +59,7 @@ public class RagIngestionPipelineTest {
|
|||||||
Assert.assertEquals(3, chunks.size());
|
Assert.assertEquals(3, chunks.size());
|
||||||
Assert.assertEquals("第1章 总则", chunks.get(0).getSourceLabel());
|
Assert.assertEquals("第1章 总则", chunks.get(0).getSourceLabel());
|
||||||
Assert.assertEquals(2, chunks.get(1).getHeadingPath().size());
|
Assert.assertEquals(2, chunks.get(1).getHeadingPath().size());
|
||||||
|
assertHasValidSourceRanges(analysis, chunks.get(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -76,5 +78,38 @@ public class RagIngestionPipelineTest {
|
|||||||
Assert.assertEquals(RagChunkTypes.QA_PAIR, chunks.get(0).getChunkType());
|
Assert.assertEquals(RagChunkTypes.QA_PAIR, chunks.get(0).getChunkType());
|
||||||
Assert.assertTrue(chunks.get(0).getContent().contains("问题"));
|
Assert.assertTrue(chunks.get(0).getContent().contains("问题"));
|
||||||
Assert.assertTrue(chunks.get(1).getAnswer().contains("系统配置"));
|
Assert.assertTrue(chunks.get(1).getAnswer().contains("系统配置"));
|
||||||
|
assertHasValidSourceRanges(analysis, chunks.get(0));
|
||||||
|
Assert.assertEquals(2, ((List<?>) chunks.get(0).getOptions().get("sourceRanges")).size());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void shouldSplitParagraphDocumentWithSourceRanges() {
|
||||||
|
String content = "第一段内容用于测试原文映射。\n第二段内容继续补充,便于生成多个分块。\n第三段内容再长一点,确保范围映射稳定。";
|
||||||
|
AnalysisResult analysis = recommender.recommend(analyzer.analyze(content, "txt"));
|
||||||
|
StrategyConfig config = StrategyConfig.defaults();
|
||||||
|
config.setStrategyCode(RagStrategyCodes.PARAGRAPH_LENGTH);
|
||||||
|
config.setChunkSize(18);
|
||||||
|
config.setOverlapSize(4);
|
||||||
|
|
||||||
|
List<RagChunk> chunks = registry.split(analysis, config);
|
||||||
|
|
||||||
|
Assert.assertTrue(chunks.size() > 1);
|
||||||
|
assertHasValidSourceRanges(analysis, chunks.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
private void assertHasValidSourceRanges(AnalysisResult analysis, RagChunk chunk) {
|
||||||
|
Object rawRanges = chunk.getOptions().get("sourceRanges");
|
||||||
|
Assert.assertTrue(rawRanges instanceof List);
|
||||||
|
List<Map<String, Object>> ranges = (List<Map<String, Object>>) rawRanges;
|
||||||
|
Assert.assertFalse(ranges.isEmpty());
|
||||||
|
int normalizedLength = analysis.getNormalizedContent().length();
|
||||||
|
for (Map<String, Object> range : ranges) {
|
||||||
|
int start = ((Number) range.get("start")).intValue();
|
||||||
|
int end = ((Number) range.get("end")).intValue();
|
||||||
|
Assert.assertTrue(start >= 0);
|
||||||
|
Assert.assertTrue(end > start);
|
||||||
|
Assert.assertTrue(end <= normalizedLength);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user