feat: 完善统一文档解析与分块原文映射
- 兼容 MinerU docx 嵌套压缩工件与数组模型输出
- 补充异步解析日志与 sourceRanges 原文区间映射
This commit is contained in:
@@ -256,14 +256,29 @@ public class MineruMapper {
|
||||
result.setPlainText(markdown);
|
||||
|
||||
ParseArtifacts artifacts = new ParseArtifacts();
|
||||
JSONObject middleJson = firstJsonObject(bundle.entriesBySuffix, "_middle.json");
|
||||
JSONArray contentList = firstJsonArray(bundle.entriesBySuffix, "_content_list.json");
|
||||
JSONObject modelOutput = firstJsonObject(bundle.entriesBySuffix, "_model.json");
|
||||
artifacts.setMiddleJson(middleJson);
|
||||
artifacts.setContentList(contentList);
|
||||
Object middleArtifact = firstJsonValue(bundle.entriesBySuffix, "_middle.json");
|
||||
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
|
||||
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
|
||||
|
||||
JSONObject middleJson = asObject(middleArtifact);
|
||||
JSONArray contentList = asArray(contentListArtifact);
|
||||
Object modelOutput = modelOutputArtifact;
|
||||
|
||||
// MinerU 在 DOCX 等场景下可能将结构化块列表放在 middle/model 工件里,并且直接返回数组。
|
||||
if (contentList == null && middleArtifact instanceof JSONArray) {
|
||||
contentList = (JSONArray) middleArtifact;
|
||||
middleJson = null;
|
||||
middleArtifact = null;
|
||||
}
|
||||
if (contentList == null && modelOutputArtifact instanceof JSONArray) {
|
||||
contentList = (JSONArray) modelOutputArtifact;
|
||||
}
|
||||
|
||||
artifacts.setMiddleJson(middleArtifact);
|
||||
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
|
||||
artifacts.setModelOutput(modelOutput);
|
||||
|
||||
JSONArray contentListV2 = firstJsonArray(bundle.entriesBySuffix, "_content_list_v2.json");
|
||||
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
|
||||
if (contentListV2 != null) {
|
||||
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
|
||||
}
|
||||
@@ -344,7 +359,11 @@ public class MineruMapper {
|
||||
block.setType(item.getString("type"));
|
||||
block.setPageIndex(item.getInteger("page_idx"));
|
||||
block.setBoundingBox(toDoubleList(item.getJSONArray("bbox")));
|
||||
block.setLevel(item.getInteger("text_level"));
|
||||
Integer blockLevel = item.getInteger("text_level");
|
||||
if (blockLevel == null) {
|
||||
blockLevel = item.getInteger("level");
|
||||
}
|
||||
block.setLevel(blockLevel);
|
||||
block.setText(extractBlockText(item));
|
||||
block.setHtml(item.getString("table_body"));
|
||||
block.setImagePath(item.getString("img_path"));
|
||||
@@ -531,20 +550,16 @@ public class MineruMapper {
|
||||
return null;
|
||||
}
|
||||
|
||||
private JSONObject firstJsonObject(Map<String, byte[]> entries, String suffix) {
|
||||
private Object firstJsonValue(Map<String, byte[]> entries, String suffix) {
|
||||
String text = firstText(entries, suffix);
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseObject(text);
|
||||
}
|
||||
|
||||
private JSONArray firstJsonArray(Map<String, byte[]> entries, String suffix) {
|
||||
String text = firstText(entries, suffix);
|
||||
if (!StringUtil.hasText(text)) {
|
||||
return null;
|
||||
try {
|
||||
return JSON.parse(text);
|
||||
} catch (Exception exception) {
|
||||
throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, exception);
|
||||
}
|
||||
return JSON.parseArray(text);
|
||||
}
|
||||
|
||||
private JSONObject asObject(Object value) {
|
||||
@@ -554,6 +569,9 @@ public class MineruMapper {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
if (value instanceof JSONArray) {
|
||||
return null;
|
||||
}
|
||||
return JSON.parseObject(JSON.toJSONString(value));
|
||||
}
|
||||
|
||||
@@ -622,8 +640,9 @@ public class MineruMapper {
|
||||
String type = item.getString("type");
|
||||
if ("text".equals(type) || "header".equals(type) || "footer".equals(type)
|
||||
|| "page_number".equals(type) || "aside_text".equals(type) || "page_footnote".equals(type)
|
||||
|| "equation".equals(type)) {
|
||||
return item.getString("text");
|
||||
|| "equation".equals(type) || "title".equals(type)) {
|
||||
String text = item.getString("text");
|
||||
return StringUtil.hasText(text) ? text : item.getString("content");
|
||||
}
|
||||
if ("list".equals(type)) {
|
||||
return joinList(toStringList(item.getJSONArray("list_items")));
|
||||
@@ -635,9 +654,11 @@ public class MineruMapper {
|
||||
return joinList(toStringList(item.getJSONArray("image_caption")));
|
||||
}
|
||||
if ("table".equals(type)) {
|
||||
return joinList(toStringList(item.getJSONArray("table_caption")));
|
||||
String tableCaption = joinList(toStringList(item.getJSONArray("table_caption")));
|
||||
return StringUtil.hasText(tableCaption) ? tableCaption : item.getString("content");
|
||||
}
|
||||
return item.getString("text");
|
||||
String text = item.getString("text");
|
||||
return StringUtil.hasText(text) ? text : item.getString("content");
|
||||
}
|
||||
|
||||
private String extractTextFromMiddleBlock(JSONObject blockJson) {
|
||||
@@ -768,6 +789,9 @@ public class MineruMapper {
|
||||
return imageDataUrls.get(imagePath);
|
||||
}
|
||||
String baseName = baseName(imagePath);
|
||||
if (!StringUtil.hasText(baseName)) {
|
||||
return null;
|
||||
}
|
||||
for (Map.Entry<String, String> entry : imageDataUrls.entrySet()) {
|
||||
if (baseName.equals(baseName(entry.getKey()))) {
|
||||
return entry.getValue();
|
||||
|
||||
@@ -7,6 +7,8 @@ import com.easyagents.document.core.model.ParseResponse;
|
||||
import com.easyagents.document.core.model.ParseTaskInfo;
|
||||
import com.easyagents.document.core.model.ParseTaskStatus;
|
||||
import com.easyagents.document.pdf.PdfDocumentProvider;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
@@ -19,6 +21,7 @@ import java.util.ArrayList;
|
||||
public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
|
||||
public static final String PROVIDER_NAME = "mineru";
|
||||
private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class);
|
||||
|
||||
private final MineruProperties properties;
|
||||
private final MineruPdfClient client;
|
||||
@@ -64,7 +67,17 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
@Override
|
||||
public ParseResponse parse(ParseRequest request) {
|
||||
ParseRequest normalizedRequest = normalizeRequest(request);
|
||||
return mapper.toParseResponse(client.parse(normalizedRequest));
|
||||
LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend(),
|
||||
normalizedRequest.getParseMethod());
|
||||
ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));
|
||||
LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -76,21 +89,41 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
normalizedRequest.setReturnContentList(true);
|
||||
normalizedRequest.setReturnModelOutput(true);
|
||||
normalizedRequest.setReturnImages(true);
|
||||
return mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
||||
LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}",
|
||||
PROVIDER_NAME,
|
||||
normalizedRequest.getFiles() == null ? 0 : normalizedRequest.getFiles().size(),
|
||||
normalizedRequest.getBackend(),
|
||||
normalizedRequest.getParseMethod());
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));
|
||||
LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskStatus == null ? null : taskStatus.getTaskId(),
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseTaskStatus queryTask(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
return mapper.toParseTaskStatus(client.queryTask(taskId));
|
||||
ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));
|
||||
LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskStatus == null ? null : taskStatus.getStatus());
|
||||
return taskStatus;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ParseResponse queryResult(String taskId) {
|
||||
validateTaskId(taskId);
|
||||
LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);
|
||||
MineruTaskStatus taskStatus = waitForTaskCompleted(taskId);
|
||||
ParseResponse response = mapper.fromZip(client.queryResultZip(taskId));
|
||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
response == null || response.getResults() == null ? 0 : response.getResults().size());
|
||||
return response;
|
||||
}
|
||||
|
||||
@@ -104,6 +137,11 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
|
||||
mapper.enrichAsyncResponse(response, taskStatus.getBackend(), taskStatus.getVersion());
|
||||
taskInfo.setResult(response);
|
||||
}
|
||||
LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
|
||||
PROVIDER_NAME,
|
||||
taskId,
|
||||
taskInfo == null ? null : taskInfo.getStatus(),
|
||||
taskInfo != null && taskInfo.getResult() != null);
|
||||
return taskInfo;
|
||||
}
|
||||
|
||||
|
||||
@@ -121,6 +121,21 @@ public class MineruMapperTest {
|
||||
Assert.assertEquals("3.0.9", response.getVersion());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void shouldMapNestedZipWhenModelArtifactIsArray() throws IOException {
|
||||
MineruMapper mapper = new MineruMapper(defaultProperties());
|
||||
|
||||
ParseResponse response = mapper.fromZip(buildNestedDocxZipWithArrayModel());
|
||||
|
||||
Assert.assertEquals(1, response.getResults().size());
|
||||
ParseResult result = response.getResults().get(0);
|
||||
Assert.assertEquals("demo", result.getFileName());
|
||||
Assert.assertEquals("# nested", result.getMarkdown());
|
||||
Assert.assertFalse(result.getBlocks().isEmpty());
|
||||
Assert.assertTrue(result.getArtifacts().getModelOutput() instanceof JSONArray);
|
||||
Assert.assertTrue(result.getArtifacts().getContentList() instanceof JSONArray);
|
||||
}
|
||||
|
||||
private MineruProperties defaultProperties() {
|
||||
MineruProperties properties = new MineruProperties();
|
||||
properties.setBaseUrl("http://127.0.0.1:8000");
|
||||
@@ -354,6 +369,45 @@ public class MineruMapperTest {
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private byte[] buildNestedDocxZipWithArrayModel() throws IOException {
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream);
|
||||
addEntry(zipOutputStream, "demo/vlm/markdown/demo.md", "# nested");
|
||||
addEntry(zipOutputStream, "demo/vlm/layout/demo_middle.json", middleJson().toJSONString());
|
||||
addEntry(zipOutputStream, "demo/vlm/model/demo_model.json", nestedDocxContentList().toJSONString());
|
||||
addBinaryEntry(zipOutputStream, "demo/vlm/images/figure.png", "image".getBytes(StandardCharsets.UTF_8));
|
||||
zipOutputStream.close();
|
||||
return outputStream.toByteArray();
|
||||
}
|
||||
|
||||
private JSONArray nestedDocxContentList() {
|
||||
JSONArray contentList = new JSONArray();
|
||||
|
||||
JSONObject title = new JSONObject();
|
||||
title.put("type", "title");
|
||||
title.put("content", "二、技术要求");
|
||||
title.put("page_idx", 0);
|
||||
title.put("bbox", bbox());
|
||||
contentList.add(title);
|
||||
|
||||
JSONObject text = new JSONObject();
|
||||
text.put("type", "text");
|
||||
text.put("content", "响应方式");
|
||||
text.put("page_idx", 0);
|
||||
text.put("bbox", bbox());
|
||||
contentList.add(text);
|
||||
|
||||
JSONObject table = new JSONObject();
|
||||
table.put("type", "table");
|
||||
table.put("content", "<table></table>");
|
||||
table.put("table_body", "<table></table>");
|
||||
table.put("page_idx", 0);
|
||||
table.put("bbox", bbox());
|
||||
contentList.add(table);
|
||||
|
||||
return contentList;
|
||||
}
|
||||
|
||||
private void addEntry(ZipOutputStream zipOutputStream, String name, String content) throws IOException {
|
||||
addBinaryEntry(zipOutputStream, name, content.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user