feat: 完善统一文档解析与分块原文映射
- 兼容 MinerU docx 嵌套压缩工件与数组模型输出
- 补充异步解析日志与 sourceRanges 原文区间映射
This commit is contained in:
@@ -256,14 +256,29 @@ public class MineruMapper {
|
||||
result.setPlainText(markdown);
|
||||
|
||||
ParseArtifacts artifacts = new ParseArtifacts();
|
||||
JSONObject middleJson = firstJsonObject(bundle.entriesBySuffix, "_middle.json");
|
||||
JSONArray contentList = firstJsonArray(bundle.entriesBySuffix, "_content_list.json");
|
||||
JSONObject modelOutput = firstJsonObject(bundle.entriesBySuffix, "_model.json");
|
||||
artifacts.setMiddleJson(middleJson);
|
||||
artifacts.setContentList(contentList);
|
||||
Object middleArtifact = firstJsonValue(bundle.entriesBySuffix, "_middle.json");
|
||||
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
|
||||
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
|
||||
|
||||
JSONObject middleJson = asObject(middleArtifact);
|
||||
JSONArray contentList = asArray(contentListArtifact);
|
||||
Object modelOutput = modelOutputArtifact;
|
||||
|
||||
// MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里,并且直接返回数组。
|
||||
if (contentList == null && middleArtifact instanceof JSONArray) {
|
||||
contentList = (JSONArray) middleArtifact;
|
||||
middleJson = null;
|
||||
middleArtifact = null;
|
||||
}
|
||||
if (contentList == null && modelOutputArtifact instanceof JSONArray) {
|
||||
contentList = (JSONArray) modelOutputArtifact;
|
||||
}
|
||||
|
||||
artifacts.setMiddleJson(middleArtifact);
|
||||
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
|
||||
artifacts.setModelOutput(modelOutput);
|
||||
|
||||
JSONArray contentListV2 = firstJsonArray(bundle.entriesBySuffix, "_content_list_v2.json");
|
||||
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
|
||||
if (contentListV2 != null) {
|
||||
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
|
||||
}
|
||||
@@ -344,7 +359,11 @@ public class MineruMapper {
|
||||
block.setType(item.getString("type"));
|
||||
block.setPageIndex(item.getInteger("page_idx"));
|
||||
block.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
|
||||
block.setLevel(item.getInteger("text_level"));
|
||||
Integer blockLevel = item.getInteger("text_level");
|
||||
if (blockLevel == null) {
|
||||
blockLevel = item.getInteger("level");
|
||||
}
|
||||
block.setLevel(blockLevel);
|
||||
block.setText(extractBlockText(item));
|
||||
block.setHtml(item.getString("table_body"));
|
||||
block.setImagePath(item.getString("img_path"));
|
||||
@@ -531,20 +550,16 @@ public class MineruMapper {
|
||||
return null;
|
||||
}
|
||||
|
||||
private JSONObject firstJsonObject(Map<String, byte[]> entries, String suffix) {
|
||||
private Object firstJsonValue(Map<String, byte[]> entries, String suffix) {
|
||||
String text = firstText(entries, suffix);
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseObject(text);
|
||||
}
|
||||
|
||||
private JSONArray firstJsonArray(Map<String, byte[]> entries, String suffix) {
|
||||
String text = firstText(entries, suffix);
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return null;
|
||||
try {
|
||||
return JSON.parse(text);
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
|
||||
}
|
||||
return JSON.parseArray(text);
|
||||
}
|
||||
|
||||
private JSONObject asObject(Object value) {
|
||||
@@ -554,6 +569,9 @@ public class MineruMapper {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
if (value instanceof JSONArray) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseObject(JSON.toJSONString(value));
|
||||
}
|
||||
|
||||
@@ -622,8 +640,9 @@ public class MineruMapper {
|
||||
String type = item.getString("type");
|
||||
if ("text".equals(type) || "header".equals(type) || "footer".equals(type)
|
||||
|| "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type)
|
||||
|| "equation".equals(type)) {
|
||||
return item.getString("text");
|
||||
|| "equation".equals(type) || "title".equals(type)) {
|
||||
String text = item.getString("text");
|
||||
return StringUtil.hasText(text) ? text : item.getString("content");
|
||||
}
|
||||
if ("list".equals(type)) {
|
||||
return joinList(toStringList(item.getJSONArray("list_items")));
|
||||
@@ -635,9 +654,11 @@ public class MineruMapper {
|
||||
return joinList(toStringList(item.getJSONArray("image_caption")));
|
||||
}
|
||||
if ("table".equals(type)) {
|
||||
return joinList(toStringList(item.getJSONArray("table_caption")));
|
||||
String tableCaption = joinList(toStringList(item.getJSONArray("table_caption")));
|
||||
return StringUtil.hasText(tableCaption) ? tableCaption : item.getString("content");
|
||||
}
|
||||
return item.getString("text");
|
||||
String text = item.getString("text");
|
||||
return StringUtil.hasText(text) ? text : item.getString("content");
|
||||
}
|
||||
|
||||
private String extractTextFromMiddleBlock(JSONObject blockJson) {
|
||||
@@ -768,6 +789,9 @@ public class MineruMapper {
|
||||
return imageDataUrls.get(imagePath);
|
||||
}
|
||||
String baseName = baseName(imagePath);
|
||||
if (!StringUtil.hasText(baseName)) {
|
||||
return null;
|
||||
}
|
||||
for (Map.Entry<String, String> entry : imageDataUrls.entrySet()) {
|
||||
if (baseName.equals(baseName(entry.getKey()))) {
|
||||
return entry.getValue();
|
||||
|
||||
@@ -7,6 +7,8 @@ import com.easyagents.document.core.model.ParseResponse;
|
||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
||||
import com.easyagents.document.pdf.PdfDocumentProvider;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
@@ -19,6 +21,7 @@ import java.util.ArrayList;
|
||||
public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
|
||||
public static final String PROVIDER_NAME = "mineru";
|
||||
private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class);
|
||||
|
||||
private final MineruProperties properties;
|
||||
private final MineruPdfClient client;
|
||||
@@ -64,7 +67,17 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||
return mapper.toParseResponse(client.parse(normalizedRequest));
|
||||
LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend(),
|
||||
normalizedRequest.getParseMethod());
|
||||
ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
|
||||
LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -76,21 +89,41 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
normalizedRequest.setReturnContentList(true);
|
||||
normalizedRequest.setReturnModelOutput(true);
|
||||
normalizedRequest.setReturnImages(true);
|
||||
return mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
||||
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend(),
|
||||
normalizedRequest.getParseMethod());
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
||||
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskStatus == null ? null : taskStatus.getTaskId(),
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
return mapper.toParseTaskStatus(client.queryTask(taskId));
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
|
||||
LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
/**
 * Fetches the parse result of an asynchronous MinerU task.
 *
 * <p>Flow as visible here: validate the task id, log the start, obtain the task status via
 * {@code waitForTaskCompleted}, download the result ZIP through the client and map it with
 * {@code mapper.fromZip}, enrich the response with the backend and version reported by the
 * task status, log the number of results, and return the response.
 *
 * @param taskId identifier of the asynchronous task; checked by {@code validateTaskId}
 * @return the response mapped from the result ZIP, after enrichment
 */
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);
|
||||
// NOTE(review): presumably blocks/polls until the task finishes — confirm in waitForTaskCompleted.
MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
|
||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||
// Backend and version come from the task status rather than from the ZIP contents.
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
// Null-safe count: logs 0 when the response or its result list is null.
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@@ -104,6 +137,11 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
taskInfo.setResult(response);
|
||||
}
|
||||
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskInfo == null ? null : taskInfo.getStatus(),
|
||||
taskInfo != null && taskInfo.getResult() != null);
|
||||
return taskInfo;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user