feat: 完善统一文档解析与分块原文映射

- 兼容 MinerU docx 嵌套压缩工件与数组模型输出

- 补充异步解析日志与 sourceRanges 原文区间映射
This commit is contained in:
2026-04-15 19:27:22 +08:00
parent 0c7b362173
commit 547d4f6ee0
6 changed files with 427 additions and 77 deletions

View File

@@ -256,14 +256,29 @@ public class MineruMapper {
result.setPlainText(markdown);
ParseArtifacts artifacts = new ParseArtifacts();
JSONObject middleJson = firstJsonObject(bundle.entriesBySuffix, "_middle.json");
JSONArray contentList = firstJsonArray(bundle.entriesBySuffix, "_content_list.json");
JSONObject modelOutput = firstJsonObject(bundle.entriesBySuffix, "_model.json");
artifacts.setMiddleJson(middleJson);
artifacts.setContentList(contentList);
Object middleArtifact = firstJsonValue(bundle.entriesBySuffix, "_middle.json");
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
JSONObject middleJson = asObject(middleArtifact);
JSONArray contentList = asArray(contentListArtifact);
Object modelOutput = modelOutputArtifact;
// MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里,并且直接返回数组。
if (contentList == null && middleArtifact instanceof JSONArray) {
contentList = (JSONArray) middleArtifact;
middleJson = null;
middleArtifact = null;
}
if (contentList == null && modelOutputArtifact instanceof JSONArray) {
contentList = (JSONArray) modelOutputArtifact;
}
artifacts.setMiddleJson(middleArtifact);
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
artifacts.setModelOutput(modelOutput);
JSONArray contentListV2 = firstJsonArray(bundle.entriesBySuffix, "_content_list_v2.json");
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
if (contentListV2 != null) {
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
}
@@ -344,7 +359,11 @@ public class MineruMapper {
block.setType(item.getString("type"));
block.setPageIndex(item.getInteger("page_idx"));
block.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
block.setLevel(item.getInteger("text_level"));
Integer blockLevel = item.getInteger("text_level");
if (blockLevel == null) {
blockLevel = item.getInteger("level");
}
block.setLevel(blockLevel);
block.setText(extractBlockText(item));
block.setHtml(item.getString("table_body"));
block.setImagePath(item.getString("img_path"));
@@ -531,20 +550,16 @@ public class MineruMapper {
return null;
}
// NOTE(review): this span comes from a diff rendered without +/- markers, so lines of the
// earlier typed helpers (firstJsonObject / firstJsonArray) appear interleaved with the
// firstJsonValue helper. Which lines are removed vs. added is inferred from the hunk
// header and the surrounding call sites — confirm against the actual file.
//
// firstJsonValue: looks up the text of an entry for the given suffix via firstText,
// returns null when there is no text, otherwise parses it with JSON.parse and returns the
// parsed value as Object. Any exception raised while parsing is wrapped in a
// DocumentParseException whose message carries the suffix and whose cause is the original
// exception.
// Presumably the pre-change signature (typed JSONObject variant).
private JSONObject firstJsonObject(Map<String, byte[]> entries, String suffix) {
// Signature of the untyped helper that returns the parsed value as Object.
private Object firstJsonValue(Map<String, byte[]> entries, String suffix) {
String text = firstText(entries, suffix);
// No text for this suffix: report "absent" as null.
if (!StringUtil.hasText(text)) {
return null;
}
// Presumably pre-change: strict object parsing.
return JSON.parseObject(text);
}
// Presumably the pre-change typed JSONArray variant.
private JSONArray firstJsonArray(Map<String, byte[]> entries, String suffix) {
String text = firstText(entries, suffix);
if (!StringUtil.hasText(text)) {
return null;
// Generic parse; failures are rethrown with the artifact suffix for diagnosis.
try {
return JSON.parse(text);
} catch (Exception exception) {
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
}
// Presumably pre-change: strict array parsing.
return JSON.parseArray(text);
}
private JSONObject asObject(Object value) {
@@ -554,6 +569,9 @@ public class MineruMapper {
if (value == null) {
return null;
}
if (value instanceof JSONArray) {
return null;
}
return JSON.parseObject(JSON.toJSONString(value));
}
@@ -622,8 +640,9 @@ public class MineruMapper {
String type = item.getString("type");
if ("text".equals(type) || "header".equals(type) || "footer".equals(type)
|| "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type)
|| "equation".equals(type)) {
return item.getString("text");
|| "equation".equals(type) || "title".equals(type)) {
String text = item.getString("text");
return StringUtil.hasText(text) ? text : item.getString("content");
}
if ("list".equals(type)) {
return joinList(toStringList(item.getJSONArray("list_items")));
@@ -635,9 +654,11 @@ public class MineruMapper {
return joinList(toStringList(item.getJSONArray("image_caption")));
}
if ("table".equals(type)) {
return joinList(toStringList(item.getJSONArray("table_caption")));
String tableCaption = joinList(toStringList(item.getJSONArray("table_caption")));
return StringUtil.hasText(tableCaption) ? tableCaption : item.getString("content");
}
return item.getString("text");
String text = item.getString("text");
return StringUtil.hasText(text) ? text : item.getString("content");
}
private String extractTextFromMiddleBlock(JSONObject blockJson) {
@@ -768,6 +789,9 @@ public class MineruMapper {
return imageDataUrls.get(imagePath);
}
String baseName = baseName(imagePath);
if (!StringUtil.hasText(baseName)) {
return null;
}
for (Map.Entry<String, String> entry : imageDataUrls.entrySet()) {
if (baseName.equals(baseName(entry.getKey()))) {
return entry.getValue();

View File

@@ -7,6 +7,8 @@ import com.easyagents.document.core.model.ParseResponse;
import com.easyagents.document.core.model.ParseTaskInfo;
import com.easyagents.document.core.model.ParseTaskStatus;
import com.easyagents.document.pdf.PdfDocumentProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
@@ -19,6 +21,7 @@ import java.util.ArrayList;
public class MineruPdfDocumentParseService implements PdfDocumentProvider {
public static final String PROVIDER_NAME = "mineru";
private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class);
private final MineruProperties properties;
private final MineruPdfClient client;
@@ -64,7 +67,17 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
/**
 * Parses the request synchronously.
 *
 * <p>The request is first passed through {@code normalizeRequest}. A start log records the
 * provider name, the number of files (0 when the file list is null), the backend and the
 * parse method. The normalized request is sent through {@code client.parse} and the reply
 * is mapped with {@code mapper.toParseResponse}. A completion log records the file count
 * and the number of results (0 when the response or its result list is null).
 *
 * @param request the parse request to normalize and execute
 * @return the mapped parse response
 */
@Override
public ParseResponse parse(ParseRequest request) {
ParseRequest normalizedRequest = normalizeRequest(request);
// NOTE(review): presumably the pre-change one-line implementation shown by the diff;
// the statements below look like its replacement. If both were kept, everything after
// this return would be unreachable — confirm against the actual file.
return mapper.toParseResponse(client.parse(normalizedRequest));
LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}",
PROVIDER_NAME,
// Null file list is logged as a count of 0.
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
normalizedRequest.getBackend(),
normalizedRequest.getParseMethod());
ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
PROVIDER_NAME,
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
// Null response or null result list is logged as a count of 0.
response == null || response.getResults() == null ? 0 : response.getResults().size());
return response;
}
@Override
@@ -76,21 +89,41 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
normalizedRequest.setReturnContentList(true);
normalizedRequest.setReturnModelOutput(true);
normalizedRequest.setReturnImages(true);
return mapper.toParseTaskStatus(client.submit(normalizedRequest));
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}",
PROVIDER_NAME,
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
normalizedRequest.getBackend(),
normalizedRequest.getParseMethod());
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
PROVIDER_NAME,
taskStatus == null ? null : taskStatus.getTaskId(),
taskStatus == null ? null : taskStatus.getStatus());
return taskStatus;
}
/**
 * Queries the status of an asynchronous parse task.
 *
 * <p>The task id is checked with {@code validateTaskId}, the status is fetched through
 * {@code client.queryTask} and mapped with {@code mapper.toParseTaskStatus}, and the
 * outcome is logged (status is logged as null when the mapped status is null).
 *
 * @param taskId identifier of the task to query
 * @return the mapped task status
 */
@Override
public ParseTaskStatus queryTask(String taskId) {
validateTaskId(taskId);
// NOTE(review): presumably the pre-change one-line implementation shown by the diff;
// the statements below look like its replacement. If both were kept, everything after
// this return would be unreachable — confirm against the actual file.
return mapper.toParseTaskStatus(client.queryTask(taskId));
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
PROVIDER_NAME,
taskId,
// Guard the log argument so a null status does not throw here.
taskStatus == null ? null : taskStatus.getStatus());
return taskStatus;
}
/**
 * Fetches the parse result of an asynchronous task.
 *
 * <p>Validates the task id, calls {@code waitForTaskCompleted} for the task, downloads the
 * result archive through {@code client.queryResultZip}, maps it with {@code mapper.fromZip}
 * and enriches the response with the backend and version reported by the task status.
 * Start and completion are logged; the completion log reports 0 results when the response
 * or its result list is null.
 *
 * @param taskId identifier of the task whose result is requested
 * @return the mapped and enriched parse response
 */
@Override
public ParseResponse queryResult(String taskId) {
    validateTaskId(taskId);
    LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);

    MineruTaskStatus finishedTask = waitForTaskCompleted(taskId);
    ParseResponse parsed = mapper.fromZip(client.queryResultZip(taskId));
    mapper.enrichAsyncResponse(parsed, finishedTask.getBackend(), finishedTask.getVersion());

    // Only used for logging: treat a missing response or result list as zero results.
    int resultCount = 0;
    if (parsed != null && parsed.getResults() != null) {
        resultCount = parsed.getResults().size();
    }
    LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
            PROVIDER_NAME, taskId, resultCount);
    return parsed;
}
@@ -104,6 +137,11 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
taskInfo.setResult(response);
}
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
PROVIDER_NAME,
taskId,
taskInfo == null ? null : taskInfo.getStatus(),
taskInfo != null && taskInfo.getResult() != null);
return taskInfo;
}

View File

@@ -121,6 +121,21 @@ public class MineruMapperTest {
Assert.assertEquals("3.0.9", response.getVersion());
}
/**
 * A nested zip whose {@code _model.json} artifact is a JSON array must still map to a
 * single result named "demo" with its markdown, a non-empty block list, and array-typed
 * model output and content list artifacts.
 */
@Test
public void shouldMapNestedZipWhenModelArtifactIsArray() throws IOException {
    MineruMapper mapper = new MineruMapper(defaultProperties());
    byte[] nestedZip = buildNestedDocxZipWithArrayModel();

    ParseResponse mapped = mapper.fromZip(nestedZip);

    Assert.assertEquals(1, mapped.getResults().size());
    ParseResult onlyResult = mapped.getResults().get(0);
    Assert.assertEquals("demo", onlyResult.getFileName());
    Assert.assertEquals("# nested", onlyResult.getMarkdown());
    Assert.assertFalse(onlyResult.getBlocks().isEmpty());

    // Both artifacts are expected to be exposed as JSON arrays.
    Object modelOutput = onlyResult.getArtifacts().getModelOutput();
    Assert.assertTrue(modelOutput instanceof JSONArray);
    Object contentList = onlyResult.getArtifacts().getContentList();
    Assert.assertTrue(contentList instanceof JSONArray);
}
private MineruProperties defaultProperties() {
MineruProperties properties = new MineruProperties();
properties.setBaseUrl("http://127.0.0.1:8000");
@@ -354,6 +369,45 @@ public class MineruMapperTest {
return outputStream.toByteArray();
}
/**
 * Builds an in-memory zip with a nested {@code demo/vlm/...} layout: a markdown file, a
 * middle JSON artifact, a model artifact whose content is a JSON array, and one image.
 *
 * <p>The zip stream is managed with try-with-resources so it is closed (and the archive
 * finalized) even when writing an entry fails; the bytes are read only after the stream
 * has been closed.
 *
 * @return the bytes of the finished zip archive
 * @throws IOException if writing an entry or closing the zip stream fails
 */
private byte[] buildNestedDocxZipWithArrayModel() throws IOException {
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    try (ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream)) {
        addEntry(zipOutputStream, "demo/vlm/markdown/demo.md", "# nested");
        addEntry(zipOutputStream, "demo/vlm/layout/demo_middle.json", middleJson().toJSONString());
        // The model artifact deliberately carries an array instead of an object.
        addEntry(zipOutputStream, "demo/vlm/model/demo_model.json", nestedDocxContentList().toJSONString());
        addBinaryEntry(zipOutputStream, "demo/vlm/images/figure.png", "image".getBytes(StandardCharsets.UTF_8));
    }
    // Closing the zip stream writes the central directory, so read the bytes afterwards.
    return outputStream.toByteArray();
}
/**
 * Builds the array used as the model artifact in the nested DOCX fixture: a title block, a
 * text block and a table block, each carrying its text under the {@code content} key.
 *
 * @return a JSON array with the three fixture blocks in title, text, table order
 */
private JSONArray nestedDocxContentList() {
    JSONArray blocks = new JSONArray();
    blocks.add(nestedDocxBlock("title", "二、技术要求", null));
    blocks.add(nestedDocxBlock("text", "响应方式", null));
    blocks.add(nestedDocxBlock("table", "<table></table>", "<table></table>"));
    return blocks;
}

/**
 * Creates one fixture block on page 0 with the shared bounding box.
 *
 * @param type value of the {@code type} key
 * @param content value of the {@code content} key
 * @param tableBody value of the {@code table_body} key, or null to omit the key
 * @return the populated block object
 */
private JSONObject nestedDocxBlock(String type, String content, String tableBody) {
    JSONObject block = new JSONObject();
    block.put("type", type);
    block.put("content", content);
    if (tableBody != null) {
        block.put("table_body", tableBody);
    }
    block.put("page_idx", 0);
    block.put("bbox", bbox());
    return block;
}
/**
 * Adds a text entry to the zip by encoding the content as UTF-8 and delegating to
 * {@code addBinaryEntry}.
 *
 * @param zipOutputStream zip stream that receives the entry
 * @param name entry name passed to {@code addBinaryEntry}
 * @param content text content of the entry
 * @throws IOException if {@code addBinaryEntry} fails
 */
private void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException {
    byte[] utf8Payload = content.getBytes(StandardCharsets.UTF_8);
    addBinaryEntry(zipOutputStream, name, utf8Payload);
}