feat: 完善统一文档解析与分块原文映射

- 兼容 MinerU docx 嵌套压缩工件与数组模型输出

- 补充异步解析日志与 sourceRanges 原文区间映射
This commit is contained in:
2026-04-15 19:27:22 +08:00
parent 0c7b362173
commit 547d4f6ee0
6 changed files with 427 additions and 77 deletions

View File

@@ -256,14 +256,29 @@ public class MineruMapper {
result.setPlainText(markdown);
ParseArtifacts artifacts = new ParseArtifacts();
JSONObject middleJson = firstJsonObject(bundle.entriesBySuffix, "_middle.json");
JSONArray contentList = firstJsonArray(bundle.entriesBySuffix, "_content_list.json");
JSONObject modelOutput = firstJsonObject(bundle.entriesBySuffix, "_model.json");
artifacts.setMiddleJson(middleJson);
artifacts.setContentList(contentList);
Object middleArtifact = firstJsonValue(bundle.entriesBySuffix, "_middle.json");
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
JSONObject middleJson = asObject(middleArtifact);
JSONArray contentList = asArray(contentListArtifact);
Object modelOutput = modelOutputArtifact;
// MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里,并且直接返回数组。
if (contentList == null && middleArtifact instanceof JSONArray) {
contentList = (JSONArray) middleArtifact;
middleJson = null;
middleArtifact = null;
}
if (contentList == null && modelOutputArtifact instanceof JSONArray) {
contentList = (JSONArray) modelOutputArtifact;
}
artifacts.setMiddleJson(middleArtifact);
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
artifacts.setModelOutput(modelOutput);
JSONArray contentListV2 = firstJsonArray(bundle.entriesBySuffix, "_content_list_v2.json");
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
if (contentListV2 != null) {
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
}
@@ -344,7 +359,11 @@ public class MineruMapper {
block.setType(item.getString("type"));
block.setPageIndex(item.getInteger("page_idx"));
block.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
block.setLevel(item.getInteger("text_level"));
Integer blockLevel = item.getInteger("text_level");
if (blockLevel == null) {
blockLevel = item.getInteger("level");
}
block.setLevel(blockLevel);
block.setText(extractBlockText(item));
block.setHtml(item.getString("table_body"));
block.setImagePath(item.getString("img_path"));
@@ -531,20 +550,16 @@ public class MineruMapper {
return null;
}
private JSONObject firstJsonObject(Map<String, byte[]> entries, String suffix) {
private Object firstJsonValue(Map<String, byte[]> entries, String suffix) {
String text = firstText(entries, suffix);
if (!StringUtil.hasText(text)) {
return null;
}
return JSON.parseObject(text);
}
private JSONArray firstJsonArray(Map<String, byte[]> entries, String suffix) {
String text = firstText(entries, suffix);
if (!StringUtil.hasText(text)) {
return null;
try {
return JSON.parse(text);
} catch (Exception exception) {
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
}
return JSON.parseArray(text);
}
private JSONObject asObject(Object value) {
@@ -554,6 +569,9 @@ public class MineruMapper {
if (value == null) {
return null;
}
if (value instanceof JSONArray) {
return null;
}
return JSON.parseObject(JSON.toJSONString(value));
}
@@ -622,8 +640,9 @@ public class MineruMapper {
String type = item.getString("type");
if ("text".equals(type) || "header".equals(type) || "footer".equals(type)
|| "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type)
|| "equation".equals(type)) {
return item.getString("text");
|| "equation".equals(type) || "title".equals(type)) {
String text = item.getString("text");
return StringUtil.hasText(text) ? text : item.getString("content");
}
if ("list".equals(type)) {
return joinList(toStringList(item.getJSONArray("list_items")));
@@ -635,9 +654,11 @@ public class MineruMapper {
return joinList(toStringList(item.getJSONArray("image_caption")));
}
if ("table".equals(type)) {
return joinList(toStringList(item.getJSONArray("table_caption")));
String tableCaption = joinList(toStringList(item.getJSONArray("table_caption")));
return StringUtil.hasText(tableCaption) ? tableCaption : item.getString("content");
}
return item.getString("text");
String text = item.getString("text");
return StringUtil.hasText(text) ? text : item.getString("content");
}
private String extractTextFromMiddleBlock(JSONObject blockJson) {
@@ -768,6 +789,9 @@ public class MineruMapper {
return imageDataUrls.get(imagePath);
}
String baseName = baseName(imagePath);
if (!StringUtil.hasText(baseName)) {
return null;
}
for (Map.Entry<String, String> entry : imageDataUrls.entrySet()) {
if (baseName.equals(baseName(entry.getKey()))) {
return entry.getValue();

View File

@@ -7,6 +7,8 @@ import com.easyagents.document.core.model.ParseResponse;
import com.easyagents.document.core.model.ParseTaskInfo;
import com.easyagents.document.core.model.ParseTaskStatus;
import com.easyagents.document.pdf.PdfDocumentProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
@@ -19,6 +21,7 @@ import java.util.ArrayList;
public class MineruPdfDocumentParseService implements PdfDocumentProvider {
public static final String PROVIDER_NAME = "mineru";
private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class);
private final MineruProperties properties;
private final MineruPdfClient client;
@@ -64,7 +67,17 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
/**
 * Runs a synchronous parse: normalizes the request, sends it through the
 * client, maps the raw answer with the mapper, and logs before and after.
 *
 * @param request the incoming parse request; passed to {@code normalizeRequest}
 * @return the mapped response, exactly as produced by the mapper
 */
@Override
public ParseResponse parse(ParseRequest request) {
    ParseRequest normalizedRequest = normalizeRequest(request);
    // Computed once so both log lines report the same count for the request.
    int fileCount = normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size();
    LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}",
            PROVIDER_NAME,
            fileCount,
            normalizedRequest.getBackend(),
            normalizedRequest.getParseMethod());
    ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
    // The response or its result list may be absent; report zero in that case.
    int resultCount = response == null || response.getResults() == null ? 0 : response.getResults().size();
    LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
            PROVIDER_NAME,
            fileCount,
            resultCount);
    return response;
}
@Override
@@ -76,21 +89,41 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
normalizedRequest.setReturnContentList(true);
normalizedRequest.setReturnModelOutput(true);
normalizedRequest.setReturnImages(true);
return mapper.toParseTaskStatus(client.submit(normalizedRequest));
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}",
PROVIDER_NAME,
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
normalizedRequest.getBackend(),
normalizedRequest.getParseMethod());
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
PROVIDER_NAME,
taskStatus == null ? null : taskStatus.getTaskId(),
taskStatus == null ? null : taskStatus.getStatus());
return taskStatus;
}
/**
 * Looks up the current status of an asynchronous parse task and logs it.
 *
 * @param taskId identifier of the task; checked by {@code validateTaskId} first
 * @return the status mapped from the client's answer, as produced by the mapper
 */
@Override
public ParseTaskStatus queryTask(String taskId) {
    validateTaskId(taskId);
    ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
    // Guarded because the mapped status is not known to be non-null here.
    LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
            PROVIDER_NAME,
            taskId,
            taskStatus == null ? null : taskStatus.getStatus());
    return taskStatus;
}
/**
 * Fetches the result of an asynchronous parse task.
 * Validates the id, waits via {@code waitForTaskCompleted}, converts the
 * result ZIP with the mapper, enriches the response with the backend and
 * version taken from the task status, and logs the start and the outcome.
 *
 * @param taskId identifier of the asynchronous task
 * @return the response built from the task's result ZIP
 */
@Override
public ParseResponse queryResult(String taskId) {
    validateTaskId(taskId);
    LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);

    MineruTaskStatus completedTask = waitForTaskCompleted(taskId);
    ParseResponse parsed = mapper.fromZip(client.queryResultZip(taskId));
    mapper.enrichAsyncResponse(parsed, completedTask.getBackend(), completedTask.getVersion());

    // Report zero when there is no response or no result list.
    int resultCount = 0;
    if (parsed != null && parsed.getResults() != null) {
        resultCount = parsed.getResults().size();
    }
    LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
            PROVIDER_NAME,
            taskId,
            resultCount);
    return parsed;
}
@@ -104,6 +137,11 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
taskInfo.setResult(response);
}
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
PROVIDER_NAME,
taskId,
taskInfo == null ? null : taskInfo.getStatus(),
taskInfo != null && taskInfo.getResult() != null);
return taskInfo;
}