feat: 扩展 Office 文档解析能力

- 重构 document-core 与 MinerU 公共层,补齐 Office 异步任务基础设施

- 新增 PPTX/XLSX 解析模块与 starter 自动装配

- 补充 README 与相关测试覆盖
This commit is contained in:
2026-04-16 21:51:16 +08:00
parent 547d4f6ee0
commit b66876d0fd
66 changed files with 4015 additions and 296 deletions

View File

@@ -1,6 +1,7 @@
package com.easyagents.document.pdf;
import com.easyagents.document.core.DocumentParseService;
import com.easyagents.document.core.entity.PdfParseRequest;
/**
* PDF 文档解析服务。
@@ -8,5 +9,5 @@ import com.easyagents.document.core.DocumentParseService;
* @author Codex
* @since 2026-04-14
*/
public interface PdfDocumentParseService extends DocumentParseService {
public interface PdfDocumentParseService extends DocumentParseService<PdfParseRequest> {
}

View File

@@ -1,854 +0,0 @@
package com.easyagents.document.pdf.mineru;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.exception.DocumentParseException;
import com.easyagents.document.core.model.DocumentBlock;
import com.easyagents.document.core.model.DocumentImage;
import com.easyagents.document.core.model.DocumentPage;
import com.easyagents.document.core.model.DocumentTable;
import com.easyagents.document.core.model.ParseArtifacts;
import com.easyagents.document.core.model.ParseRequest;
import com.easyagents.document.core.model.ParseResponse;
import com.easyagents.document.core.model.ParseResult;
import com.easyagents.document.core.model.ParseTaskStatus;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Base64;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
/**
* MinerU 原始协议与统一模型之间的映射器。
*
* @author Codex
* @since 2026-04-14
*/
public class MineruMapper {
private final MineruProperties properties;
/**
 * Creates a mapper bound to the given MinerU configuration.
 *
 * @param properties MinerU configuration; stored as-is and consulted later for request defaults
 */
public MineruMapper(MineruProperties properties) {
this.properties = properties;
}
/**
 * Builds the form fields for a synchronous parse call.
 *
 * <p>Every {@code return_*} flag mirrors the matching request option (an unset option
 * counts as {@code false}); {@code response_format_zip} is always {@code "false"}.
 *
 * @param request parse request
 * @return form fields keyed by MinerU parameter name
 */
public Map<String, List<String>> buildSyncFormFields(ParseRequest request) {
    Map<String, List<String>> formFields = buildBaseFormFields(request);

    boolean markdown = isTrue(request.getReturnMarkdown());
    putSingleValue(formFields, "return_md", Boolean.toString(markdown));

    boolean middleJson = isTrue(request.getReturnMiddleJson());
    putSingleValue(formFields, "return_middle_json", Boolean.toString(middleJson));

    boolean contentList = isTrue(request.getReturnContentList());
    putSingleValue(formFields, "return_content_list", Boolean.toString(contentList));

    boolean modelOutput = isTrue(request.getReturnModelOutput());
    putSingleValue(formFields, "return_model_output", Boolean.toString(modelOutput));

    boolean images = isTrue(request.getReturnImages());
    putSingleValue(formFields, "return_images", Boolean.toString(images));

    putSingleValue(formFields, "response_format_zip", "false");
    return formFields;
}
/**
 * Builds the form fields for an asynchronous parse submission.
 *
 * @param request parse request
 * @return form fields keyed by MinerU parameter name
 */
public Map<String, List<String>> buildAsyncFormFields(ParseRequest request) {
    Map<String, List<String>> formFields = buildBaseFormFields(request);
    // Async results are always requested as a complete ZIP so that very large
    // results do not have to travel inside a JSON response.
    String[] alwaysEnabled = {
        "return_md",
        "return_middle_json",
        "return_content_list",
        "return_model_output",
        "return_images",
        "response_format_zip"
    };
    for (String flagName : alwaysEnabled) {
        putSingleValue(formFields, flagName, "true");
    }
    return formFields;
}
/**
 * Copies the fields of a raw MinerU task JSON object into a {@link MineruTaskStatus}.
 *
 * @param jsonObject raw JSON returned by MinerU
 * @return populated task status DTO
 */
public MineruTaskStatus toTaskStatus(JSONObject jsonObject) {
    MineruTaskStatus dto = new MineruTaskStatus();

    // Identity and state.
    dto.setTaskId(jsonObject.getString("task_id"));
    dto.setStatus(jsonObject.getString("status"));
    dto.setBackend(jsonObject.getString("backend"));
    dto.setFileNames(toStringList(jsonObject.getJSONArray("file_names")));

    // Timestamps are kept as the raw strings MinerU sent.
    dto.setCreatedAt(jsonObject.getString("created_at"));
    dto.setStartedAt(jsonObject.getString("started_at"));
    dto.setCompletedAt(jsonObject.getString("completed_at"));

    // Diagnostics and follow-up links.
    dto.setError(jsonObject.getString("error"));
    dto.setStatusUrl(jsonObject.getString("status_url"));
    dto.setResultUrl(jsonObject.getString("result_url"));
    dto.setQueuedAhead(jsonObject.getInteger("queued_ahead"));
    dto.setVersion(jsonObject.getString("version"));
    dto.setMessage(jsonObject.getString("message"));
    return dto;
}
/**
 * Converts a raw MinerU result JSON object into a {@link MineruResultPayload}.
 *
 * <p>The {@code results} object is flattened into an insertion-ordered map from its
 * keys to the per-key JSON objects; a missing {@code results} yields an empty map.
 *
 * @param jsonObject raw JSON returned by MinerU
 * @return populated result DTO
 */
public MineruResultPayload toResultPayload(JSONObject jsonObject) {
    MineruResultPayload payload = new MineruResultPayload();
    payload.setBackend(jsonObject.getString("backend"));
    payload.setVersion(jsonObject.getString("version"));

    Map<String, JSONObject> perFile = new LinkedHashMap<String, JSONObject>();
    JSONObject rawResults = jsonObject.getJSONObject("results");
    if (rawResults != null) {
        for (String fileName : rawResults.keySet()) {
            JSONObject fileResult = rawResults.getJSONObject(fileName);
            perFile.put(fileName, fileResult);
        }
    }
    payload.setResults(perFile);
    return payload;
}
/**
 * Converts a MinerU task status DTO into the unified {@link ParseTaskStatus} model.
 *
 * <p>The DTO's {@code version} and {@code message} are not carried over.
 *
 * @param taskStatus raw MinerU task status
 * @return unified task status
 */
public ParseTaskStatus toParseTaskStatus(MineruTaskStatus taskStatus) {
    ParseTaskStatus unified = new ParseTaskStatus();

    unified.setTaskId(taskStatus.getTaskId());
    unified.setStatus(taskStatus.getStatus());
    unified.setBackend(taskStatus.getBackend());
    unified.setFileNames(taskStatus.getFileNames());

    unified.setCreatedAt(taskStatus.getCreatedAt());
    unified.setStartedAt(taskStatus.getStartedAt());
    unified.setCompletedAt(taskStatus.getCompletedAt());

    unified.setError(taskStatus.getError());
    unified.setStatusUrl(taskStatus.getStatusUrl());
    unified.setResultUrl(taskStatus.getResultUrl());
    unified.setQueuedAhead(taskStatus.getQueuedAhead());
    return unified;
}
/**
 * Converts a synchronous JSON result into the unified response.
 *
 * @param payload MinerU result DTO
 * @return unified response with one {@link ParseResult} per entry of the payload's result map
 */
public ParseResponse toParseResponse(MineruResultPayload payload) {
    ParseResponse response = new ParseResponse();
    response.setBackend(payload.getBackend());
    response.setVersion(payload.getVersion());

    List<ParseResult> mapped = new ArrayList<ParseResult>();
    for (Map.Entry<String, JSONObject> fileEntry : payload.getResults().entrySet()) {
        String fileName = fileEntry.getKey();
        JSONObject fileResult = fileEntry.getValue();
        mapped.add(mapSingleResult(fileName, fileResult));
    }
    response.setResults(mapped);
    return response;
}
/**
 * Converts a ZIP result archive into the unified response.
 *
 * @param zipBytes raw ZIP bytes
 * @return unified response with one {@link ParseResult} per bundle found in the archive
 * @throws DocumentParseException if the archive yields no bundles
 */
public ParseResponse fromZip(byte[] zipBytes) {
    Map<String, ZipArtifactBundle> bundlesByFile = unzip(zipBytes);
    if (bundlesByFile.isEmpty()) {
        throw new DocumentParseException("MinerU ZIP result does not contain any parse artifacts");
    }

    List<ParseResult> mapped = new ArrayList<ParseResult>();
    for (Map.Entry<String, ZipArtifactBundle> bundleEntry : bundlesByFile.entrySet()) {
        String fileName = bundleEntry.getKey();
        ZipArtifactBundle bundle = bundleEntry.getValue();
        mapped.add(mapZipBundle(fileName, bundle));
    }

    ParseResponse response = new ParseResponse();
    response.setResults(mapped);
    return response;
}
/**
 * Fills in the response's backend and version from the async task status, falling back
 * to values recovered from the parsed results when the task status has no text for them.
 *
 * @param response unified response; a {@code null} response is ignored
 * @param backend  backend reported by the task status
 * @param version  version reported by the task status
 */
public void enrichAsyncResponse(ParseResponse response, String backend, String version) {
    if (response == null) {
        return;
    }

    String effectiveBackend = backend;
    if (!StringUtil.hasText(effectiveBackend)) {
        effectiveBackend = resolveBackendFromResults(response);
    }
    response.setBackend(effectiveBackend);

    String effectiveVersion = version;
    if (!StringUtil.hasText(effectiveVersion)) {
        effectiveVersion = resolveVersionFromResults(response);
    }
    response.setVersion(effectiveVersion);
}
/**
 * Builds the form fields shared by synchronous and asynchronous requests.
 *
 * <p>Request values win; configured defaults are used where the request leaves a value
 * unset. The page range defaults to {@code 0}..{@code 99999}.
 *
 * @param request parse request
 * @return insertion-ordered form fields
 */
private Map<String, List<String>> buildBaseFormFields(ParseRequest request) {
    Map<String, List<String>> formFields = new LinkedHashMap<String, List<String>>();

    String backend = StringUtil.hasText(request.getBackend())
        ? request.getBackend()
        : properties.getDefaultBackend();
    putSingleValue(formFields, "backend", backend);

    String parseMethod = StringUtil.hasText(request.getParseMethod())
        ? request.getParseMethod()
        : properties.getDefaultParseMethod();
    putSingleValue(formFields, "parse_method", parseMethod);

    boolean formulaEnabled = boolOrDefault(request.getFormulaEnabled(), properties.getDefaultFormulaEnable());
    putSingleValue(formFields, "formula_enable", Boolean.toString(formulaEnabled));

    boolean tableEnabled = boolOrDefault(request.getTableEnabled(), properties.getDefaultTableEnable());
    putSingleValue(formFields, "table_enable", Boolean.toString(tableEnabled));

    int firstPage = intOrDefault(request.getStartPageIndex(), 0);
    putSingleValue(formFields, "start_page_id", Integer.toString(firstPage));

    int lastPage = intOrDefault(request.getEndPageIndex(), 99999);
    putSingleValue(formFields, "end_page_id", Integer.toString(lastPage));

    List<String> requested = request.getLanguages();
    boolean requestHasLanguages = requested != null && !requested.isEmpty();
    List<String> effective = requestHasLanguages ? requested : properties.getDefaultLangList();
    if (effective == null || effective.isEmpty()) {
        return formFields;
    }
    // MinerU receives multiple languages as a repeated lang_list form field.
    formFields.put("lang_list", new ArrayList<String>(effective));
    return formFields;
}
/**
 * Stores {@code value} under {@code key} as a fresh one-element mutable list,
 * replacing any previous values for that key.
 */
private void putSingleValue(Map<String, List<String>> fields, String key, String value) {
    List<String> holder = new ArrayList<String>(1);
    fields.put(key, holder);
    holder.add(value);
}
/**
 * Maps one file's entry of a synchronous JSON result to a {@link ParseResult}.
 *
 * <p>The markdown doubles as the plain text. A warning is recorded on the result when
 * neither markdown, {@code middle_json} nor {@code content_list} is present.
 *
 * @param fileName   key of the entry in the result map
 * @param fileResult raw JSON for that file
 * @return mapped result
 */
private ParseResult mapSingleResult(String fileName, JSONObject fileResult) {
    ParseArtifacts rawArtifacts = new ParseArtifacts();
    rawArtifacts.setMiddleJson(fileResult.get("middle_json"));
    rawArtifacts.setContentList(fileResult.get("content_list"));
    rawArtifacts.setModelOutput(fileResult.get("model_output"));

    ParseResult result = new ParseResult();
    result.setFileName(fileName);
    result.setMarkdown(fileResult.getString("md_content"));
    result.setPlainText(result.getMarkdown());
    result.setArtifacts(rawArtifacts);

    applyStructuredArtifacts(result, toStringMap(fileResult.getJSONObject("images")));

    boolean hasUsableOutput = result.getMarkdown() != null
        || result.getArtifacts().getMiddleJson() != null
        || result.getArtifacts().getContentList() != null;
    if (!hasUsableOutput) {
        result.getWarnings().add("MinerU did not return markdown, middle_json or content_list");
    }
    return result;
}
/**
 * Maps the artifacts collected for one file of a ZIP result to a {@link ParseResult}.
 *
 * <p>The first {@code .md} entry becomes both markdown and plain text. JSON artifacts are
 * looked up by suffix; remaining entries are exposed as extra binary artifacts, and image
 * entries are turned into data URLs for the structured mapping step.
 *
 * @param fileName bundle key produced while unzipping
 * @param bundle   artifacts grouped for that key
 * @return mapped result
 * @throws DocumentParseException if markdown, middle JSON object and content list are all absent
 */
private ParseResult mapZipBundle(String fileName, ZipArtifactBundle bundle) {
ParseResult result = new ParseResult();
result.setFileName(fileName);
String markdown = firstText(bundle.entriesBySuffix, ".md");
result.setMarkdown(markdown);
result.setPlainText(markdown);
ParseArtifacts artifacts = new ParseArtifacts();
Object middleArtifact = firstJsonValue(bundle.entriesBySuffix, "_middle.json");
Object contentListArtifact = firstJsonValue(bundle.entriesBySuffix, "_content_list.json");
Object modelOutputArtifact = firstJsonValue(bundle.entriesBySuffix, "_model.json");
JSONObject middleJson = asObject(middleArtifact);
JSONArray contentList = asArray(contentListArtifact);
Object modelOutput = modelOutputArtifact;
// For DOCX and similar inputs MinerU may put the structured block list into the
// middle/model artifact and return it directly as an array.
if (contentList == null && middleArtifact instanceof JSONArray) {
// The middle artifact is really the content list: promote it and clear the middle slot.
contentList = (JSONArray) middleArtifact;
middleJson = null;
middleArtifact = null;
}
if (contentList == null && modelOutputArtifact instanceof JSONArray) {
// Last resort: reuse the model output array as content list (model output is still stored below).
contentList = (JSONArray) modelOutputArtifact;
}
artifacts.setMiddleJson(middleArtifact);
// Keep the raw content-list artifact when no array form could be obtained.
artifacts.setContentList(contentList == null ? contentListArtifact : contentList);
artifacts.setModelOutput(modelOutput);
JSONArray contentListV2 = asArray(firstJsonValue(bundle.entriesBySuffix, "_content_list_v2.json"));
if (contentListV2 != null) {
artifacts.getExtraJsonArtifacts().put("contentListV2", contentListV2);
}
for (Map.Entry<String, byte[]> entry : bundle.otherBinaryEntries.entrySet()) {
artifacts.getExtraBinaryArtifacts().put(entry.getKey(), entry.getValue());
}
result.setArtifacts(artifacts);
// Images are handed to the structured mapping as data URLs keyed by their ZIP entry name.
Map<String, String> imageDataUrls = new LinkedHashMap<String, String>();
for (Map.Entry<String, byte[]> imageEntry : bundle.images.entrySet()) {
imageDataUrls.put(imageEntry.getKey(), toDataUrl(imageEntry.getKey(), imageEntry.getValue()));
}
applyStructuredArtifacts(result, imageDataUrls);
// Validation runs last, after the mapping above has already been performed.
if (markdown == null && middleJson == null && contentList == null) {
throw new DocumentParseException("MinerU ZIP result missing critical artifacts for file: " + fileName);
}
return result;
}
/**
 * Derives pages, blocks, tables and images from the artifacts already stored on the result.
 *
 * <p>The content list is preferred for blocks; the middle JSON is only used for blocks
 * when no content list is available. If no image was produced by either path, every
 * supplied data URL is exposed as a standalone image.
 *
 * @param result        result whose artifacts are read and whose structured fields are filled
 * @param imageDataUrls image data URLs keyed by source path; may be {@code null} or empty
 */
private void applyStructuredArtifacts(ParseResult result, Map<String, String> imageDataUrls) {
    JSONObject middle = asObject(result.getArtifacts().getMiddleJson());
    JSONArray blocksList = asArray(result.getArtifacts().getContentList());

    if (middle != null) {
        fillPages(result, middle);
        result.getMetadata().put("middleBackend", middle.getString("_backend"));
        result.getMetadata().put("middleVersion", middle.getString("_version_name"));
    }

    if (blocksList != null) {
        fillFromContentList(result, blocksList, imageDataUrls);
    } else if (middle != null) {
        fillFromMiddleJson(result, middle, imageDataUrls);
    }

    boolean imagesAlreadyMapped = !(result.getImages() == null || result.getImages().isEmpty());
    if (imagesAlreadyMapped || imageDataUrls == null || imageDataUrls.isEmpty()) {
        return;
    }
    for (Map.Entry<String, String> dataUrlEntry : imageDataUrls.entrySet()) {
        String sourcePath = dataUrlEntry.getKey();
        DocumentImage standalone = new DocumentImage();
        standalone.setName(baseName(sourcePath));
        standalone.setSourcePath(sourcePath);
        standalone.setDataUrl(dataUrlEntry.getValue());
        standalone.setMimeType(detectMimeType(sourcePath));
        result.getImages().add(standalone);
    }
}
/**
 * Builds the page list of the result from the {@code pdf_info} array of the middle JSON.
 *
 * <p>Each page keeps its raw JSON under the {@code raw} metadata key. Width and height
 * are only set when {@code page_size} has at least two elements. Null array elements
 * are skipped, matching how the block mappers treat null entries.
 *
 * @param result     result that receives the pages; left untouched when {@code pdf_info} is absent
 * @param middleJson middle JSON object
 */
private void fillPages(ParseResult result, JSONObject middleJson) {
    JSONArray pdfInfo = middleJson.getJSONArray("pdf_info");
    if (pdfInfo == null) {
        return;
    }
    List<DocumentPage> pages = new ArrayList<DocumentPage>(pdfInfo.size());
    for (int index = 0; index < pdfInfo.size(); index++) {
        JSONObject pageJson = pdfInfo.getJSONObject(index);
        if (pageJson == null) {
            // A null slot carries no page data; skipping avoids a NullPointerException below.
            continue;
        }
        DocumentPage page = new DocumentPage();
        page.setPageIndex(pageJson.getInteger("page_idx"));
        JSONArray pageSize = pageJson.getJSONArray("page_size");
        if (pageSize != null && pageSize.size() >= 2) {
            page.setWidth(pageSize.getDouble(0));
            page.setHeight(pageSize.getDouble(1));
        }
        page.getMetadata().put("raw", pageJson);
        pages.add(page);
    }
    result.setPages(pages);
}
/**
 * Fills blocks, tables and images of the result from a content-list array.
 *
 * <p>Every non-null item becomes a block. Items of type {@code table} additionally
 * become a table, and items of a visual type additionally become an image whose data
 * URL is looked up by image path.
 *
 * @param result        result to append to
 * @param contentList   content-list array
 * @param imageDataUrls image data URLs keyed by source path
 */
private void fillFromContentList(ParseResult result, JSONArray contentList, Map<String, String> imageDataUrls) {
    for (int position = 0; position < contentList.size(); position++) {
        JSONObject entry = contentList.getJSONObject(position);
        if (entry == null) {
            continue;
        }
        String entryType = entry.getString("type");

        DocumentBlock block = new DocumentBlock();
        block.setType(entryType);
        block.setPageIndex(entry.getInteger("page_idx"));
        block.setBoundingBox(toDoubleList(entry.getJSONArray("bbox")));
        // "text_level" takes precedence; "level" is only consulted when it is missing.
        Integer headingLevel = entry.getInteger("text_level");
        block.setLevel(headingLevel != null ? headingLevel : entry.getInteger("level"));
        block.setText(extractBlockText(entry));
        block.setHtml(entry.getString("table_body"));
        block.setImagePath(entry.getString("img_path"));
        block.getMetadata().put("raw", entry);
        result.getBlocks().add(block);

        if ("table".equals(entryType)) {
            DocumentTable table = new DocumentTable();
            table.setPageIndex(entry.getInteger("page_idx"));
            table.setBoundingBox(toDoubleList(entry.getJSONArray("bbox")));
            table.setHtml(entry.getString("table_body"));
            table.setImagePath(entry.getString("img_path"));
            table.setCaptions(toStringList(entry.getJSONArray("table_caption")));
            table.setFootnotes(toStringList(entry.getJSONArray("table_footnote")));
            result.getTables().add(table);
        }

        if (!isVisualType(entryType)) {
            continue;
        }
        String imagePath = entry.getString("img_path");
        DocumentImage image = new DocumentImage();
        image.setPageIndex(entry.getInteger("page_idx"));
        image.setBoundingBox(toDoubleList(entry.getJSONArray("bbox")));
        image.setSourcePath(imagePath);
        image.setName(baseName(imagePath));
        image.setMimeType(detectMimeType(imagePath));
        image.setCaptions(extractCaptions(entry));
        image.setFootnotes(extractFootnotes(entry));
        image.setDataUrl(matchDataUrl(imagePath, imageDataUrls));
        result.getImages().add(image);
    }
}
/**
 * Fills blocks, tables and images of the result from the {@code pdf_info} pages of the
 * middle JSON, reading {@code para_blocks}, {@code tables} and {@code images} per page.
 *
 * <p>Null page elements are skipped, matching how the block mappers treat null entries.
 *
 * @param result        result to append to; untouched when {@code pdf_info} is absent
 * @param middleJson    middle JSON object
 * @param imageDataUrls image data URLs keyed by source path
 */
private void fillFromMiddleJson(ParseResult result, JSONObject middleJson, Map<String, String> imageDataUrls) {
    JSONArray pages = middleJson.getJSONArray("pdf_info");
    if (pages == null) {
        return;
    }
    for (int pageIndex = 0; pageIndex < pages.size(); pageIndex++) {
        JSONObject page = pages.getJSONObject(pageIndex);
        if (page == null) {
            // A null slot carries no content; skipping avoids a NullPointerException below.
            continue;
        }
        // The page's own page_idx is used rather than the loop position.
        Integer pageIdx = page.getInteger("page_idx");
        fillBlocksFromMiddlePage(result, page.getJSONArray("para_blocks"), pageIdx);
        fillVisualsFromMiddlePage(result, page.getJSONArray("tables"), pageIdx, true, imageDataUrls);
        fillVisualsFromMiddlePage(result, page.getJSONArray("images"), pageIdx, false, imageDataUrls);
    }
}
/**
 * Appends one {@link DocumentBlock} per non-null element of a middle-JSON block array.
 *
 * @param result    result to append to
 * @param blocks    block array of one page; {@code null} means nothing to add
 * @param pageIndex page index assigned to every created block
 */
private void fillBlocksFromMiddlePage(ParseResult result, JSONArray blocks, Integer pageIndex) {
    int blockCount = blocks == null ? 0 : blocks.size();
    for (int position = 0; position < blockCount; position++) {
        JSONObject rawBlock = blocks.getJSONObject(position);
        if (rawBlock != null) {
            DocumentBlock mapped = new DocumentBlock();
            mapped.setType(rawBlock.getString("type"));
            mapped.setPageIndex(pageIndex);
            mapped.setBoundingBox(toDoubleList(rawBlock.getJSONArray("bbox")));
            mapped.setText(extractTextFromMiddleBlock(rawBlock));
            mapped.setImagePath(extractImagePathFromMiddleBlock(rawBlock));
            mapped.getMetadata().put("raw", rawBlock);
            result.getBlocks().add(mapped);
        }
    }
}
/**
 * Appends tables or images found in a middle-JSON visual block array.
 *
 * @param result        result to append to
 * @param blocks        visual block array of one page; {@code null} means nothing to add
 * @param pageIndex     page index assigned to every created element
 * @param table         {@code true} to create tables, {@code false} to create images
 * @param imageDataUrls image data URLs keyed by source path (only used for images)
 */
private void fillVisualsFromMiddlePage(ParseResult result, JSONArray blocks, Integer pageIndex, boolean table, Map<String, String> imageDataUrls) {
    if (blocks == null) {
        return;
    }
    for (int position = 0; position < blocks.size(); position++) {
        JSONObject visual = blocks.getJSONObject(position);
        if (visual == null) {
            continue;
        }
        if (table) {
            DocumentTable mappedTable = new DocumentTable();
            mappedTable.setPageIndex(pageIndex);
            mappedTable.setBoundingBox(toDoubleList(visual.getJSONArray("bbox")));
            mappedTable.setCaptions(extractTextsByType(visual, "table_caption"));
            mappedTable.setFootnotes(extractTextsByType(visual, "table_footnote"));
            mappedTable.setImagePath(extractImagePathByType(visual, "table_body"));
            result.getTables().add(mappedTable);
            continue;
        }
        DocumentImage mappedImage = new DocumentImage();
        mappedImage.setPageIndex(pageIndex);
        mappedImage.setBoundingBox(toDoubleList(visual.getJSONArray("bbox")));
        mappedImage.setCaptions(extractTextsByType(visual, "image_caption"));
        mappedImage.setFootnotes(extractTextsByType(visual, "image_footnote"));
        mappedImage.setSourcePath(extractImagePathByType(visual, "image_body"));
        // Name, MIME type and data URL all derive from the source path set just above.
        mappedImage.setName(baseName(mappedImage.getSourcePath()));
        mappedImage.setMimeType(detectMimeType(mappedImage.getSourcePath()));
        mappedImage.setDataUrl(matchDataUrl(mappedImage.getSourcePath(), imageDataUrls));
        result.getImages().add(mappedImage);
    }
}
/**
 * Returns the first non-blank {@code middleBackend} metadata value among the results,
 * or the configured default backend when there is none.
 */
private String resolveBackendFromResults(ParseResponse response) {
    boolean hasResults = response.getResults() != null && !response.getResults().isEmpty();
    if (hasResults) {
        for (ParseResult candidate : response.getResults()) {
            Object recorded = candidate.getMetadata().get("middleBackend");
            if (!(recorded instanceof String)) {
                continue;
            }
            String backendName = (String) recorded;
            if (StringUtil.hasText(backendName)) {
                return backendName;
            }
        }
    }
    return properties.getDefaultBackend();
}
/**
 * Returns the first non-blank {@code middleVersion} metadata value among the results,
 * or {@code null} when there is none.
 */
private String resolveVersionFromResults(ParseResponse response) {
    boolean hasResults = response.getResults() != null && !response.getResults().isEmpty();
    if (hasResults) {
        for (ParseResult candidate : response.getResults()) {
            Object recorded = candidate.getMetadata().get("middleVersion");
            if (!(recorded instanceof String)) {
                continue;
            }
            String versionName = (String) recorded;
            if (StringUtil.hasText(versionName)) {
                return versionName;
            }
        }
    }
    return null;
}
/**
 * Reads every non-directory entry of the ZIP into memory and groups the entries into
 * bundles keyed by the name {@code resolveFileName} derives from the entry path.
 *
 * <p>Entries whose path contains {@code /images/} are stored as images; entries ending
 * in one of the known artifact suffixes are stored as suffix-addressable artifacts;
 * everything else is kept as an opaque binary entry.
 *
 * @param zipBytes raw ZIP bytes
 * @return insertion-ordered bundles
 * @throws DocumentParseException if reading the archive fails
 */
private Map<String, ZipArtifactBundle> unzip(byte[] zipBytes) {
    Map<String, ZipArtifactBundle> grouped = new LinkedHashMap<String, ZipArtifactBundle>();
    try (ZipInputStream zip = new ZipInputStream(new ByteArrayInputStream(zipBytes))) {
        for (ZipEntry current = zip.getNextEntry(); current != null; current = zip.getNextEntry()) {
            if (current.isDirectory()) {
                continue;
            }
            byte[] content = readBytes(zip);
            String entryName = current.getName();
            String owner = resolveFileName(entryName);

            ZipArtifactBundle target = grouped.get(owner);
            if (target == null) {
                target = new ZipArtifactBundle();
                grouped.put(owner, target);
            }

            Map<String, byte[]> destination;
            if (entryName.contains("/images/")) {
                destination = target.images;
            } else {
                boolean knownArtifact = entryName.endsWith(".md")
                    || entryName.endsWith("_middle.json")
                    || entryName.endsWith("_content_list.json")
                    || entryName.endsWith("_content_list_v2.json")
                    || entryName.endsWith("_model.json");
                destination = knownArtifact ? target.entriesBySuffix : target.otherBinaryEntries;
            }
            destination.put(entryName, content);
        }
    } catch (IOException exception) {
        throw new DocumentParseException("Failed to unzip MinerU result", exception);
    }
    return grouped;
}
/**
 * Reads the remaining bytes of the current ZIP entry into a byte array.
 *
 * @param zipInputStream stream positioned at an entry
 * @return the bytes read until the stream reports end of entry
 * @throws IOException if reading fails
 */
private byte[] readBytes(ZipInputStream zipInputStream) throws IOException {
    ByteArrayOutputStream collected = new ByteArrayOutputStream();
    byte[] chunk = new byte[8192];
    for (;;) {
        int read = zipInputStream.read(chunk);
        if (read < 0) {
            break;
        }
        collected.write(chunk, 0, read);
    }
    return collected.toByteArray();
}
/**
 * Derives the bundle key for a ZIP entry.
 *
 * <p>The first {@code /}-separated segment is used when it has text. Otherwise the
 * entry's base name is used, cut at its first dot when that dot is not the first character.
 */
private String resolveFileName(String entryName) {
    String[] pathParts = entryName.split("/");
    boolean hasLeadingSegment = pathParts.length > 0 && StringUtil.hasText(pathParts[0]);
    if (hasLeadingSegment) {
        return pathParts[0];
    }
    String leaf = baseName(entryName);
    int firstDot = leaf.indexOf('.');
    if (firstDot > 0) {
        return leaf.substring(0, firstDot);
    }
    return leaf;
}
/**
 * Returns the content of the first entry whose name ends with {@code suffix}, decoded as UTF-8.
 *
 * <p>Decoding is pinned to UTF-8 instead of the platform default charset so that
 * non-ASCII markdown and JSON artifacts decode the same way on every host.
 *
 * @param entries entry name to raw bytes, in iteration order
 * @param suffix  name suffix to look for
 * @return decoded text, or {@code null} when no entry matches
 */
private String firstText(Map<String, byte[]> entries, String suffix) {
    for (Map.Entry<String, byte[]> entry : entries.entrySet()) {
        if (entry.getKey().endsWith(suffix)) {
            return new String(entry.getValue(), java.nio.charset.StandardCharsets.UTF_8);
        }
    }
    return null;
}
/**
 * Parses the first entry whose name ends with {@code suffix} as JSON.
 *
 * @param entries entry name to raw bytes
 * @param suffix  name suffix to look for
 * @return parsed JSON value, or {@code null} when no entry matches or its text is blank
 * @throws DocumentParseException if the text cannot be parsed
 */
private Object firstJsonValue(Map<String, byte[]> entries, String suffix) {
    String rawText = firstText(entries, suffix);
    if (StringUtil.hasText(rawText)) {
        try {
            return JSON.parse(rawText);
        } catch (Exception parseFailure) {
            throw new DocumentParseException("Failed to parse MinerU JSON artifact: suffix=" + suffix, parseFailure);
        }
    }
    return null;
}
/**
 * Views a value as a JSON object.
 *
 * @param value arbitrary value
 * @return the value itself when it already is a {@link JSONObject}; {@code null} for
 *         {@code null} or a {@link JSONArray}; otherwise the value serialized to JSON
 *         and parsed back as an object
 */
private JSONObject asObject(Object value) {
    if (value == null || value instanceof JSONArray) {
        return null;
    }
    if (value instanceof JSONObject) {
        return (JSONObject) value;
    }
    String serialized = JSON.toJSONString(value);
    return JSON.parseObject(serialized);
}
/**
 * Views a value as a JSON array.
 *
 * <p>Map-shaped values (including {@link JSONObject}) yield {@code null} rather than
 * being pushed through {@code parseArray}, which cannot parse an object. This mirrors
 * {@code asObject}, which returns {@code null} for arrays, and lets callers fall back
 * to the raw artifact.
 *
 * @param value arbitrary value
 * @return the value itself when it already is a {@link JSONArray}; {@code null} for
 *         {@code null} or a map-shaped value; otherwise the value serialized to JSON
 *         and parsed back as an array
 */
private JSONArray asArray(Object value) {
    if (value instanceof JSONArray) {
        return (JSONArray) value;
    }
    if (value == null || value instanceof Map) {
        return null;
    }
    return JSON.parseArray(JSON.toJSONString(value));
}
/**
 * Copies a JSON array into a new mutable list of strings.
 *
 * @param jsonArray source array; may be {@code null}
 * @return a fresh list, empty when the array is {@code null} or empty
 */
private List<String> toStringList(JSONArray jsonArray) {
    List<String> copied = new ArrayList<String>();
    if (jsonArray != null) {
        int length = jsonArray.size();
        for (int position = 0; position < length; position++) {
            copied.add(jsonArray.getString(position));
        }
    }
    return copied;
}
/**
 * Copies a JSON object into a new insertion-ordered map of string values.
 *
 * @param jsonObject source object; may be {@code null}
 * @return a fresh map, empty when the object is {@code null} or empty
 */
private Map<String, String> toStringMap(JSONObject jsonObject) {
    Map<String, String> copied = new LinkedHashMap<String, String>();
    if (jsonObject != null) {
        for (String name : jsonObject.keySet()) {
            copied.put(name, jsonObject.getString(name));
        }
    }
    return copied;
}
/**
 * Copies a JSON array into a new mutable list of doubles.
 *
 * @param jsonArray source array; may be {@code null}
 * @return a fresh list, empty when the array is {@code null} or empty
 */
private List<Double> toDoubleList(JSONArray jsonArray) {
    List<Double> copied = new ArrayList<Double>();
    if (jsonArray != null) {
        int length = jsonArray.size();
        for (int position = 0; position < length; position++) {
            copied.add(jsonArray.getDouble(position));
        }
    }
    return copied;
}
/**
 * Collects the item's {@code image_caption} entries followed by its {@code table_caption} entries.
 */
private List<String> extractCaptions(JSONObject item) {
    List<String> captions = new ArrayList<String>(toStringList(item.getJSONArray("image_caption")));
    captions.addAll(toStringList(item.getJSONArray("table_caption")));
    return captions;
}
/**
 * Collects the item's {@code image_footnote} entries followed by its {@code table_footnote} entries.
 */
private List<String> extractFootnotes(JSONObject item) {
    List<String> footnotes = new ArrayList<String>(toStringList(item.getJSONArray("image_footnote")));
    footnotes.addAll(toStringList(item.getJSONArray("table_footnote")));
    return footnotes;
}
/**
 * Tells whether a content-list item type is mapped to an image entry:
 * {@code image}, {@code table}, {@code chart} or {@code seal}.
 */
private boolean isVisualType(String type) {
    if (type == null) {
        return false;
    }
    switch (type) {
        case "image":
        case "table":
        case "chart":
        case "seal":
            return true;
        default:
            return false;
    }
}
/**
 * Picks the text of a content-list item according to its type.
 *
 * <ul>
 *   <li>{@code list}: the {@code list_items} joined by newlines</li>
 *   <li>{@code code}: {@code code_body}</li>
 *   <li>{@code image}: the {@code image_caption} entries joined by newlines</li>
 *   <li>{@code table}: the joined {@code table_caption}, else {@code content}</li>
 *   <li>anything else: {@code text}, else {@code content}</li>
 * </ul>
 */
private String extractBlockText(JSONObject item) {
    String kind = item.getString("type");
    if (kind != null) {
        switch (kind) {
            case "list":
                return joinList(toStringList(item.getJSONArray("list_items")));
            case "code":
                return item.getString("code_body");
            case "image":
                return joinList(toStringList(item.getJSONArray("image_caption")));
            case "table": {
                String caption = joinList(toStringList(item.getJSONArray("table_caption")));
                if (StringUtil.hasText(caption)) {
                    return caption;
                }
                return item.getString("content");
            }
            default:
                // Plain-text types and unknown types share the generic lookup below.
                break;
        }
    }
    String plain = item.getString("text");
    if (StringUtil.hasText(plain)) {
        return plain;
    }
    return item.getString("content");
}
/**
 * Collects the span contents of a middle-JSON block, joined by newlines.
 *
 * <p>Spans are read from the block's own {@code lines} and from the {@code lines} of
 * each nested entry in {@code blocks}. Reading the block's own lines matters for blocks
 * that carry their lines directly instead of through nested blocks
 * (NOTE(review): per the MinerU middle.json layout plain text blocks are shaped this
 * way — confirm against a real sample). Null elements at any level are skipped.
 *
 * @param blockJson middle-JSON block
 * @return joined text, or {@code null} when no span content was found
 */
private String extractTextFromMiddleBlock(JSONObject blockJson) {
    List<String> texts = new ArrayList<String>();
    collectSpanContents(blockJson.getJSONArray("lines"), texts);
    JSONArray blocks = blockJson.getJSONArray("blocks");
    if (blocks != null) {
        for (int blockIndex = 0; blockIndex < blocks.size(); blockIndex++) {
            JSONObject childBlock = blocks.getJSONObject(blockIndex);
            if (childBlock != null) {
                collectSpanContents(childBlock.getJSONArray("lines"), texts);
            }
        }
    }
    return joinList(texts);
}

/** Appends the {@code content} of every span found in the given lines array to {@code texts}. */
private void collectSpanContents(JSONArray lines, List<String> texts) {
    if (lines == null) {
        return;
    }
    for (int lineIndex = 0; lineIndex < lines.size(); lineIndex++) {
        JSONObject line = lines.getJSONObject(lineIndex);
        JSONArray spans = line == null ? null : line.getJSONArray("spans");
        if (spans == null) {
            continue;
        }
        for (int spanIndex = 0; spanIndex < spans.size(); spanIndex++) {
            JSONObject span = spans.getJSONObject(spanIndex);
            if (span != null && span.containsKey("content")) {
                texts.add(span.getString("content"));
            }
        }
    }
}
/**
 * Returns the {@code img_path} of the first span that has one, searching the nested
 * {@code blocks} → {@code lines} → {@code spans} of a middle-JSON block.
 *
 * @param blockJson middle-JSON block
 * @return image path, or {@code null} when there are no nested blocks or no span has a path
 */
private String extractImagePathFromMiddleBlock(JSONObject blockJson) {
    JSONArray children = blockJson.getJSONArray("blocks");
    int childCount = children == null ? 0 : children.size();
    for (int c = 0; c < childCount; c++) {
        JSONArray lines = children.getJSONObject(c).getJSONArray("lines");
        int lineCount = lines == null ? 0 : lines.size();
        for (int l = 0; l < lineCount; l++) {
            JSONArray spans = lines.getJSONObject(l).getJSONArray("spans");
            int spanCount = spans == null ? 0 : spans.size();
            for (int s = 0; s < spanCount; s++) {
                JSONObject span = spans.getJSONObject(s);
                if (span.containsKey("img_path")) {
                    return span.getString("img_path");
                }
            }
        }
    }
    return null;
}
/**
 * Collects span contents from the nested blocks of a visual block whose {@code type}
 * equals {@code expectedType}.
 *
 * @param visualBlock  table or image block from the middle JSON
 * @param expectedType nested block type to read, e.g. a caption or footnote type
 * @return the collected contents in document order; empty when nothing matches
 */
private List<String> extractTextsByType(JSONObject visualBlock, String expectedType) {
    List<String> collected = new ArrayList<String>();
    JSONArray children = visualBlock.getJSONArray("blocks");
    int childCount = children == null ? 0 : children.size();
    for (int c = 0; c < childCount; c++) {
        JSONObject child = children.getJSONObject(c);
        boolean typeMatches = expectedType.equals(child.getString("type"));
        JSONArray lines = typeMatches ? child.getJSONArray("lines") : null;
        int lineCount = lines == null ? 0 : lines.size();
        for (int l = 0; l < lineCount; l++) {
            JSONArray spans = lines.getJSONObject(l).getJSONArray("spans");
            int spanCount = spans == null ? 0 : spans.size();
            for (int s = 0; s < spanCount; s++) {
                JSONObject span = spans.getJSONObject(s);
                if (span.containsKey("content")) {
                    collected.add(span.getString("content"));
                }
            }
        }
    }
    return collected;
}
/**
 * Returns the {@code img_path} of the first span that has one inside the nested blocks
 * of a visual block whose {@code type} equals {@code expectedType}.
 *
 * @param visualBlock  table or image block from the middle JSON
 * @param expectedType nested block type to read, e.g. a body type
 * @return image path, or {@code null} when nothing matches
 */
private String extractImagePathByType(JSONObject visualBlock, String expectedType) {
    JSONArray children = visualBlock.getJSONArray("blocks");
    int childCount = children == null ? 0 : children.size();
    for (int c = 0; c < childCount; c++) {
        JSONObject child = children.getJSONObject(c);
        boolean typeMatches = expectedType.equals(child.getString("type"));
        JSONArray lines = typeMatches ? child.getJSONArray("lines") : null;
        int lineCount = lines == null ? 0 : lines.size();
        for (int l = 0; l < lineCount; l++) {
            JSONArray spans = lines.getJSONObject(l).getJSONArray("spans");
            int spanCount = spans == null ? 0 : spans.size();
            for (int s = 0; s < spanCount; s++) {
                JSONObject span = spans.getJSONObject(s);
                if (span.containsKey("img_path")) {
                    return span.getString("img_path");
                }
            }
        }
    }
    return null;
}
/**
 * Finds the data URL for an image path.
 *
 * <p>An exact key match wins; otherwise the first entry whose key has the same base
 * name as the path is used.
 *
 * @param imagePath     image path referenced by a block; may be blank
 * @param imageDataUrls data URLs keyed by source path; may be {@code null}
 * @return matching data URL, or {@code null} when none is found
 */
private String matchDataUrl(String imagePath, Map<String, String> imageDataUrls) {
    boolean nothingToSearch = imageDataUrls == null || imageDataUrls.isEmpty();
    if (nothingToSearch) {
        return null;
    }
    boolean exactHit = StringUtil.hasText(imagePath) && imageDataUrls.containsKey(imagePath);
    if (exactHit) {
        return imageDataUrls.get(imagePath);
    }
    String wantedLeaf = baseName(imagePath);
    if (StringUtil.hasText(wantedLeaf)) {
        for (Map.Entry<String, String> candidate : imageDataUrls.entrySet()) {
            String candidateLeaf = baseName(candidate.getKey());
            if (wantedLeaf.equals(candidateLeaf)) {
                return candidate.getValue();
            }
        }
    }
    return null;
}
/**
 * Returns the part of the path after its last {@code /}, the whole path when it has no
 * slash, or {@code null} when the path has no text.
 */
private String baseName(String path) {
    if (!StringUtil.hasText(path)) {
        return null;
    }
    int leafStart = path.lastIndexOf('/') + 1;
    if (leafStart > 0) {
        return path.substring(leafStart);
    }
    return path;
}
/**
 * Guesses the MIME type from the file name.
 *
 * @return the guessed type, {@code application/octet-stream} when no guess is
 *         available, or {@code null} when the path has no text
 */
private String detectMimeType(String path) {
    if (!StringUtil.hasText(path)) {
        return null;
    }
    String guessed = URLConnection.guessContentTypeFromName(path);
    if (StringUtil.hasText(guessed)) {
        return guessed;
    }
    return "application/octet-stream";
}
/**
 * Encodes binary content as a base64 {@code data:} URL whose MIME type is derived from the path.
 */
private String toDataUrl(String path, byte[] content) {
    String encoded = Base64.getEncoder().encodeToString(content);
    StringBuilder dataUrl = new StringBuilder("data:");
    dataUrl.append(detectMimeType(path));
    dataUrl.append(";base64,");
    dataUrl.append(encoded);
    return dataUrl.toString();
}
/**
 * Joins the values with a newline between consecutive elements.
 *
 * @return the joined text, or {@code null} when the list is {@code null} or empty
 */
private String joinList(List<String> values) {
    if (values == null || values.isEmpty()) {
        return null;
    }
    return String.join("\n", values);
}
/**
 * Returns {@code value} when it is set; otherwise the default, with a {@code null}
 * default counting as {@code false}.
 */
private boolean boolOrDefault(Boolean value, Boolean defaultValue) {
    if (value != null) {
        return value.booleanValue();
    }
    return isTrue(defaultValue);
}
/** Null-safe truth test: {@code null} counts as {@code false}. */
private boolean isTrue(Boolean value) {
    return Boolean.TRUE.equals(value);
}
/** Returns {@code value} when it is set, otherwise {@code defaultValue}. */
private int intOrDefault(Integer value, int defaultValue) {
    if (value != null) {
        return value.intValue();
    }
    return defaultValue;
}
/**
 * In-memory grouping of the ZIP entries that belong to one bundle key.
 * All maps are keyed by the full ZIP entry name and keep insertion order.
 */
private static class ZipArtifactBundle {
// Markdown and JSON artifacts; looked up later by entry-name suffix.
private final Map<String, byte[]> entriesBySuffix = new LinkedHashMap<String, byte[]>();
// Entries whose path contains "/images/".
private final Map<String, byte[]> images = new LinkedHashMap<String, byte[]>();
// Everything else, passed through as opaque binary artifacts.
private final Map<String, byte[]> otherBinaryEntries = new LinkedHashMap<String, byte[]>();
}
}

View File

@@ -1,211 +0,0 @@
package com.easyagents.document.pdf.mineru;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.exception.DocumentParseException;
import com.easyagents.document.core.model.ParseFile;
import com.easyagents.document.core.model.ParseRequest;
import okhttp3.MediaType;
import okhttp3.MultipartBody;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
import okhttp3.ResponseBody;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
* MinerU HTTP 客户端。
*
* @author Codex
* @since 2026-04-14
*/
public class MineruPdfClient {
private static final MediaType DEFAULT_PDF_MEDIA_TYPE = MediaType.parse("application/pdf");
private final String baseUrl;
private final OkHttpClient okHttpClient;
private final MineruMapper mineruMapper;
/**
 * Creates a client backed by a new {@link OkHttpClient} whose connect, read and write
 * timeouts are taken from the configuration.
 *
 * @param properties   MinerU configuration
 * @param mineruMapper DTO mapper
 * @throws IllegalArgumentException if {@code properties} is {@code null} or has no base URL
 */
public MineruPdfClient(MineruProperties properties, MineruMapper mineruMapper) {
    this(properties, newDefaultHttpClient(properties), mineruMapper);
}

/**
 * Builds the default HTTP client from the configured timeouts.
 *
 * <p>The null check runs here because this method executes before the delegated
 * constructor's validation; without it a {@code null} configuration would surface as a
 * {@link NullPointerException} instead of the {@link IllegalArgumentException} the
 * other constructor raises.
 */
private static OkHttpClient newDefaultHttpClient(MineruProperties properties) {
    if (properties == null) {
        throw new IllegalArgumentException("MinerU baseUrl must not be empty");
    }
    return new OkHttpClient.Builder()
        .connectTimeout(properties.getConnectTimeoutMs(), TimeUnit.MILLISECONDS)
        .readTimeout(properties.getReadTimeoutMs(), TimeUnit.MILLISECONDS)
        .writeTimeout(properties.getWriteTimeoutMs(), TimeUnit.MILLISECONDS)
        .build();
}
/**
 * Creates a client that uses the supplied HTTP client.
 *
 * @param properties   MinerU configuration; its base URL is normalized and stored
 * @param okHttpClient HTTP client used for every call
 * @param mineruMapper DTO mapper
 * @throws IllegalArgumentException if {@code properties} is {@code null} or has no base URL
 */
public MineruPdfClient(MineruProperties properties, OkHttpClient okHttpClient, MineruMapper mineruMapper) {
    boolean baseUrlConfigured = properties != null && StringUtil.hasText(properties.getBaseUrl());
    if (!baseUrlConfigured) {
        throw new IllegalArgumentException("MinerU baseUrl must not be empty");
    }
    this.mineruMapper = mineruMapper;
    this.okHttpClient = okHttpClient;
    this.baseUrl = normalizeBaseUrl(properties.getBaseUrl());
}
/**
 * Calls the synchronous parse endpoint ({@code /file_parse}).
 *
 * @param request parse request
 * @return raw result mapped to a DTO
 */
public MineruResultPayload parse(ParseRequest request) {
    Map<String, List<String>> formFields = buildSyncFormFields(request);
    JSONObject rawResponse = executeJsonMultipart("/file_parse", request, formFields);
    return mineruMapper.toResultPayload(rawResponse);
}
/**
 * Submits an asynchronous parse task ({@code /tasks}).
 *
 * @param request parse request
 * @return raw task status mapped to a DTO
 */
public MineruTaskStatus submit(ParseRequest request) {
    Map<String, List<String>> formFields = buildAsyncFormFields(request);
    JSONObject rawResponse = executeJsonMultipart("/tasks", request, formFields);
    return mineruMapper.toTaskStatus(rawResponse);
}
/**
* 查询任务状态。
*
* @param taskId 任务 ID
* @return 原始任务状态
*/
public MineruTaskStatus queryTask(String taskId) {
return mineruMapper.toTaskStatus(executeJsonGet("/tasks/" + taskId));
}
/**
* 下载异步结果 ZIP。
*
* @param taskId 任务 ID
* @return ZIP 二进制
*/
public byte[] queryResultZip(String taskId) {
String path = "/tasks/" + taskId + "/result";
Request request = new Request.Builder().url(baseUrl + path).get().build();
try (Response response = okHttpClient.newCall(request).execute()) {
ResponseBody body = response.body();
byte[] responseBytes = body == null ? new byte[0] : body.bytes();
if (!response.isSuccessful()) {
throw buildHttpException(path, response.code(), responseBytes);
}
String contentType = response.header("Content-Type");
if (contentType != null && contentType.contains("application/json")) {
JSONObject jsonObject = JSON.parseObject(new String(responseBytes));
throw new DocumentParseException("MinerU async result is not ready: " + jsonObject.toJSONString());
}
if (responseBytes.length < 2 || responseBytes[0] != 'P' || responseBytes[1] != 'K') {
throw new DocumentParseException("MinerU async result is not a valid ZIP payload");
}
return responseBytes;
} catch (IOException exception) {
throw new DocumentParseException("Failed to query MinerU result ZIP", exception);
}
}
protected JSONObject executeJsonMultipart(String path, ParseRequest request, Map<String, List<String>> fields) {
MultipartBody.Builder formBuilder = new MultipartBody.Builder().setType(MultipartBody.FORM);
appendFiles(formBuilder, request.getFiles());
appendStringFields(formBuilder, fields);
Request httpRequest = new Request.Builder()
.url(baseUrl + path)
.post(formBuilder.build())
.build();
return executeJsonRequest(path, httpRequest);
}
protected JSONObject executeJsonGet(String path) {
Request request = new Request.Builder().url(baseUrl + path).get().build();
return executeJsonRequest(path, request);
}
protected JSONObject executeJsonRequest(String path, Request request) {
try (Response response = okHttpClient.newCall(request).execute()) {
ResponseBody body = response.body();
String bodyText = body == null ? "" : body.string();
if (!response.isSuccessful()) {
throw buildHttpException(path, response.code(), bodyText == null ? new byte[0] : bodyText.getBytes());
}
return JSON.parseObject(bodyText);
} catch (IOException exception) {
throw new DocumentParseException("Failed to call MinerU endpoint: " + path, exception);
}
}
private void appendFiles(MultipartBody.Builder formBuilder, List<ParseFile> files) {
if (files == null || files.isEmpty()) {
throw new IllegalArgumentException("Parse request must contain at least one file");
}
for (ParseFile file : files) {
if (file == null || !StringUtil.hasText(file.getFileName()) || file.getContent() == null) {
throw new IllegalArgumentException("Parse request contains an invalid file");
}
MediaType mediaType = StringUtil.hasText(file.getContentType())
? MediaType.parse(file.getContentType())
: DEFAULT_PDF_MEDIA_TYPE;
formBuilder.addFormDataPart(
"files",
file.getFileName(),
RequestBody.create(file.getContent(), mediaType)
);
}
}
private void appendStringFields(MultipartBody.Builder formBuilder, Map<String, List<String>> fields) {
for (Map.Entry<String, List<String>> entry : fields.entrySet()) {
if (entry.getValue() == null) {
continue;
}
for (String value : entry.getValue()) {
if (value != null) {
formBuilder.addFormDataPart(entry.getKey(), value);
}
}
}
}
private Map<String, List<String>> buildSyncFormFields(ParseRequest request) {
return mineruMapper.buildSyncFormFields(request);
}
private Map<String, List<String>> buildAsyncFormFields(ParseRequest request) {
return mineruMapper.buildAsyncFormFields(request);
}
private DocumentParseException buildHttpException(String path, int statusCode, byte[] bodyBytes) {
String bodyText = bodyBytes == null ? "" : new String(bodyBytes);
return new DocumentParseException(
"MinerU request failed: path=" + path + ", status=" + statusCode + ", body=" + bodyText
);
}
private String normalizeBaseUrl(String baseUrl) {
if (baseUrl.endsWith("/")) {
return baseUrl.substring(0, baseUrl.length() - 1);
}
return baseUrl;
}
}

View File

@@ -1,31 +1,23 @@
package com.easyagents.document.pdf.mineru;
import com.easyagents.core.util.StringUtil;
import com.easyagents.document.core.exception.DocumentParseException;
import com.easyagents.document.core.model.ParseRequest;
import com.easyagents.document.core.model.ParseResponse;
import com.easyagents.document.core.model.ParseTaskInfo;
import com.easyagents.document.core.model.ParseTaskStatus;
import com.easyagents.document.core.mineru.MineruClient;
import com.easyagents.document.core.mineru.MineruDocumentParseService;
import com.easyagents.document.core.mineru.MineruMapper;
import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.core.entity.ParseRequest;
import com.easyagents.document.core.entity.PdfParseRequest;
import com.easyagents.document.pdf.PdfDocumentProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import com.easyagents.core.util.StringUtil;
/**
* 基于 MinerU API 的 PDF 解析服务。
*
* @author Codex
* @since 2026-04-14
* @since 2026-04-16
*/
public class MineruPdfDocumentParseService implements PdfDocumentProvider {
public class MineruPdfDocumentParseService extends MineruDocumentParseService<PdfParseRequest> implements PdfDocumentProvider {
public static final String PROVIDER_NAME = "mineru";
private static final Logger LOG = LoggerFactory.getLogger(MineruPdfDocumentParseService.class);
private final MineruProperties properties;
private final MineruPdfClient client;
private final MineruMapper mapper;
/**
* 创建默认服务实例。
@@ -33,7 +25,7 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
* @param properties MinerU 配置
*/
public MineruPdfDocumentParseService(MineruProperties properties) {
this(properties, new MineruMapper(properties));
super(properties);
}
/**
@@ -43,7 +35,7 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
* @param mapper 结果映射器
*/
public MineruPdfDocumentParseService(MineruProperties properties, MineruMapper mapper) {
this(properties, new MineruPdfClient(properties, mapper), mapper);
super(properties, mapper);
}
/**
@@ -53,10 +45,8 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
* @param client HTTP 客户端
* @param mapper 结果映射器
*/
public MineruPdfDocumentParseService(MineruProperties properties, MineruPdfClient client, MineruMapper mapper) {
this.properties = properties;
this.client = client;
this.mapper = mapper;
public MineruPdfDocumentParseService(MineruProperties properties, MineruClient client, MineruMapper mapper) {
super(properties, client, mapper);
}
@Override
@@ -65,145 +55,21 @@ public class MineruPdfDocumentParseService implements PdfDocumentProvider {
}
@Override
public ParseResponse parse(ParseRequest request) {
    // Apply configured defaults before talking to MinerU.
    ParseRequest normalizedRequest = normalizeRequest(request);

    int submittedFileCount = 0;
    if (normalizedRequest.getFiles() != null) {
        submittedFileCount = normalizedRequest.getFiles().size();
    }
    LOG.info("MinerU 开始同步解析: provider={}, fileCount={}, backend={}, parseMethod={}",
            PROVIDER_NAME,
            submittedFileCount,
            normalizedRequest.getBackend(),
            normalizedRequest.getParseMethod());

    ParseResponse response = mapper.toParseResponse(client.parse(normalizedRequest));

    // Both counts are read again after the call, exactly when the completion log is written.
    int parsedFileCount = 0;
    if (normalizedRequest.getFiles() != null) {
        parsedFileCount = normalizedRequest.getFiles().size();
    }
    int resultCount = 0;
    if (response != null && response.getResults() != null) {
        resultCount = response.getResults().size();
    }
    LOG.info("MinerU 同步解析完成: provider={}, fileCount={}, resultCount={}",
            PROVIDER_NAME,
            parsedFileCount,
            resultCount);
    return response;
}
@Override
public ParseTaskStatus submit(ParseRequest request) {
    ParseRequest normalizedRequest = normalizeRequest(request);

    // Async results are always delivered as the full ZIP, so callers need not pass trimming options.
    normalizedRequest.setReturnMarkdown(true);
    normalizedRequest.setReturnMiddleJson(true);
    normalizedRequest.setReturnContentList(true);
    normalizedRequest.setReturnModelOutput(true);
    normalizedRequest.setReturnImages(true);

    int fileCount = 0;
    if (normalizedRequest.getFiles() != null) {
        fileCount = normalizedRequest.getFiles().size();
    }
    LOG.info("MinerU 开始提交异步解析任务: provider={}, fileCount={}, backend={}, parseMethod={}",
            PROVIDER_NAME,
            fileCount,
            normalizedRequest.getBackend(),
            normalizedRequest.getParseMethod());

    ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.submit(normalizedRequest));

    // A null status is logged as null fields rather than failing here.
    Object submittedTaskId = null;
    Object submittedStatus = null;
    if (taskStatus != null) {
        submittedTaskId = taskStatus.getTaskId();
        submittedStatus = taskStatus.getStatus();
    }
    LOG.info("MinerU 异步解析任务提交完成: provider={}, taskId={}, status={}",
            PROVIDER_NAME,
            submittedTaskId,
            submittedStatus);
    return taskStatus;
}
@Override
public ParseTaskStatus queryTask(String taskId) {
    validateTaskId(taskId);
    ParseTaskStatus taskStatus = mapper.toParseTaskStatus(client.queryTask(taskId));

    // A null status is logged as null rather than failing here.
    Object currentStatus = null;
    if (taskStatus != null) {
        currentStatus = taskStatus.getStatus();
    }
    LOG.info("MinerU 查询异步任务状态: provider={}, taskId={}, status={}",
            PROVIDER_NAME,
            taskId,
            currentStatus);
    return taskStatus;
}
@Override
public ParseResponse queryResult(String taskId) {
    validateTaskId(taskId);
    LOG.info("MinerU 开始获取异步解析结果: provider={}, taskId={}", PROVIDER_NAME, taskId);

    // Block until the task has completed, then download and map the result ZIP.
    MineruTaskStatus completedStatus = waitForTaskCompleted(taskId);
    byte[] resultZip = client.queryResultZip(taskId);
    ParseResponse response = mapper.fromZip(resultZip);
    mapper.enrichAsyncResponse(response, completedStatus.getBackend(), completedStatus.getVersion());

    int resultCount = 0;
    if (response != null && response.getResults() != null) {
        resultCount = response.getResults().size();
    }
    LOG.info("MinerU 获取异步解析结果完成: provider={}, taskId={}, resultCount={}",
            PROVIDER_NAME,
            taskId,
            resultCount);
    return response;
}
@Override
public ParseTaskInfo queryTaskInfo(String taskId) {
    validateTaskId(taskId);
    MineruTaskStatus rawStatus = client.queryTask(taskId);
    ParseTaskInfo taskInfo = ParseTaskInfo.fromStatus(mapper.toParseTaskStatus(rawStatus));

    // The result is attached only once the task reports completion; no waiting happens here.
    boolean completed = "completed".equalsIgnoreCase(rawStatus.getStatus());
    if (completed) {
        byte[] resultZip = client.queryResultZip(taskId);
        ParseResponse response = mapper.fromZip(resultZip);
        mapper.enrichAsyncResponse(response, rawStatus.getBackend(), rawStatus.getVersion());
        taskInfo.setResult(response);
    }

    Object loggedStatus = null;
    boolean hasResult = false;
    if (taskInfo != null) {
        loggedStatus = taskInfo.getStatus();
        hasResult = taskInfo.getResult() != null;
    }
    LOG.info("MinerU 查询任务聚合信息: provider={}, taskId={}, status={}, hasResult={}",
            PROVIDER_NAME,
            taskId,
            loggedStatus,
            hasResult);
    return taskInfo;
}
private ParseRequest normalizeRequest(ParseRequest request) {
if (request == null) {
throw new IllegalArgumentException("ParseRequest must not be null");
}
if (request.getFiles() == null || request.getFiles().isEmpty()) {
throw new IllegalArgumentException("ParseRequest files must not be empty");
}
ParseRequest normalizedRequest = new ParseRequest();
normalizedRequest.setFiles(new ArrayList<>(request.getFiles()));
normalizedRequest.setBackend(StringUtil.hasText(request.getBackend()) ? request.getBackend() : properties.getDefaultBackend());
normalizedRequest.setParseMethod(StringUtil.hasText(request.getParseMethod()) ? request.getParseMethod() : properties.getDefaultParseMethod());
normalizedRequest.setLanguages(
request.getLanguages() == null || request.getLanguages().isEmpty()
? new ArrayList<String>(properties.getDefaultLangList())
: new ArrayList<String>(request.getLanguages())
protected ParseRequest normalizeRequest(ParseRequest request) {
PdfParseRequest normalizedRequest = PdfParseRequest.from(request);
ParseRequest commonRequest = super.normalizeRequest(normalizedRequest);
commonRequest.copyCommonFieldsTo(normalizedRequest);
normalizedRequest.setParseMethod(
StringUtil.hasText(normalizedRequest.getParseMethod()) ? normalizedRequest.getParseMethod() : getProperties().getDefaultParseMethod()
);
normalizedRequest.setFormulaEnabled(request.getFormulaEnabled() == null ? properties.getDefaultFormulaEnable() : request.getFormulaEnabled());
normalizedRequest.setTableEnabled(request.getTableEnabled() == null ? properties.getDefaultTableEnable() : request.getTableEnabled());
normalizedRequest.setStartPageIndex(request.getStartPageIndex() == null ? 0 : request.getStartPageIndex());
normalizedRequest.setEndPageIndex(request.getEndPageIndex() == null ? 99999 : request.getEndPageIndex());
normalizedRequest.setReturnMarkdown(request.getReturnMarkdown());
normalizedRequest.setReturnMiddleJson(request.getReturnMiddleJson());
normalizedRequest.setReturnContentList(request.getReturnContentList());
normalizedRequest.setReturnModelOutput(request.getReturnModelOutput());
normalizedRequest.setReturnImages(request.getReturnImages());
normalizedRequest.setFormulaEnabled(
normalizedRequest.getFormulaEnabled() == null ? getProperties().getDefaultFormulaEnable() : normalizedRequest.getFormulaEnabled()
);
normalizedRequest.setTableEnabled(
normalizedRequest.getTableEnabled() == null ? getProperties().getDefaultTableEnable() : normalizedRequest.getTableEnabled()
);
normalizedRequest.setStartPageIndex(normalizedRequest.getStartPageIndex() == null ? 0 : normalizedRequest.getStartPageIndex());
normalizedRequest.setEndPageIndex(normalizedRequest.getEndPageIndex() == null ? 99999 : normalizedRequest.getEndPageIndex());
return normalizedRequest;
}
/**
 * Rejects a task ID that has no text.
 *
 * @param taskId task ID to check
 * @throws IllegalArgumentException if {@code taskId} has no text
 */
private void validateTaskId(String taskId) {
    if (StringUtil.hasText(taskId)) {
        return;
    }
    throw new IllegalArgumentException("taskId must not be empty");
}
/**
 * Polls the task status until the task completes or fails.
 * <p>
 * Status values are compared case-insensitively, matching the check in
 * {@code queryTaskInfo}. The wait between polls is capped at the time remaining, so the
 * configured result timeout is not overshot by a full poll interval.
 *
 * @param taskId task ID
 * @return status of the completed task
 * @throws DocumentParseException if the task fails, the result timeout elapses, or the
 *                                waiting thread is interrupted
 */
private MineruTaskStatus waitForTaskCompleted(String taskId) {
    long deadline = System.currentTimeMillis() + properties.getResultTimeoutMs();
    while (true) {
        MineruTaskStatus taskStatus = client.queryTask(taskId);
        String status = taskStatus.getStatus();
        if ("completed".equalsIgnoreCase(status)) {
            return taskStatus;
        }
        if ("failed".equalsIgnoreCase(status)) {
            throw new DocumentParseException("MinerU task failed: " + taskStatus.getError());
        }
        long remainingMs = deadline - System.currentTimeMillis();
        if (remainingMs <= 0) {
            throw new DocumentParseException("MinerU task result timeout: " + taskId);
        }
        try {
            // Never sleep past the deadline; the next poll is the final one in that case.
            Thread.sleep(Math.min(properties.getPollIntervalMs(), remainingMs));
        } catch (InterruptedException exception) {
            // Restore the interrupt flag so callers further up can still observe it.
            Thread.currentThread().interrupt();
            throw new DocumentParseException("Interrupted while waiting for MinerU task: " + taskId, exception);
        }
    }
}
}

View File

@@ -1,116 +0,0 @@
package com.easyagents.document.pdf.mineru;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* MinerU PDF 解析配置。
*
* @author Codex
* @since 2026-04-14
*/
/**
 * Configuration for MinerU PDF parsing.
 * <p>
 * The timeout and interval setters fall back to the built-in default when given
 * {@code null}, the same way {@link #setDefaultLangList(List)} does, so the values can be
 * unboxed safely by consumers.
 *
 * @author Codex
 * @since 2026-04-14
 */
public class MineruProperties {

    private static final int DEFAULT_CONNECT_TIMEOUT_MS = 3000;
    private static final int DEFAULT_READ_TIMEOUT_MS = 600000;
    private static final int DEFAULT_WRITE_TIMEOUT_MS = 600000;
    private static final int DEFAULT_POLL_INTERVAL_MS = 1000;
    private static final int DEFAULT_RESULT_TIMEOUT_MS = 1800000;

    /** Base URL of the MinerU service. */
    private String baseUrl;
    /** HTTP connect timeout in milliseconds. */
    private Integer connectTimeoutMs = DEFAULT_CONNECT_TIMEOUT_MS;
    /** HTTP read timeout in milliseconds. */
    private Integer readTimeoutMs = DEFAULT_READ_TIMEOUT_MS;
    /** HTTP write timeout in milliseconds. */
    private Integer writeTimeoutMs = DEFAULT_WRITE_TIMEOUT_MS;
    /** Pause between two task status polls, in milliseconds. */
    private Integer pollIntervalMs = DEFAULT_POLL_INTERVAL_MS;
    /** Maximum time to wait for an asynchronous task result, in milliseconds. */
    private Integer resultTimeoutMs = DEFAULT_RESULT_TIMEOUT_MS;
    /** Backend used when a request does not name one. */
    private String defaultBackend = "vlm-http-client";
    /** Parse method used when a request does not name one. */
    private String defaultParseMethod = "auto";
    /** Languages used when a request lists none. */
    private List<String> defaultLangList = newDefaultLangList();
    /** Formula switch used when a request leaves it unset. */
    private Boolean defaultFormulaEnable = true;
    /** Table switch used when a request leaves it unset. */
    private Boolean defaultTableEnable = true;

    /** Creates a fresh, mutable copy of the built-in language default. */
    private static List<String> newDefaultLangList() {
        return new ArrayList<String>(Arrays.asList("ch"));
    }

    public String getBaseUrl() {
        return baseUrl;
    }

    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    public Integer getConnectTimeoutMs() {
        return connectTimeoutMs;
    }

    /**
     * @param connectTimeoutMs connect timeout in milliseconds; {@code null} restores the default
     */
    public void setConnectTimeoutMs(Integer connectTimeoutMs) {
        this.connectTimeoutMs = connectTimeoutMs == null ? DEFAULT_CONNECT_TIMEOUT_MS : connectTimeoutMs;
    }

    public Integer getReadTimeoutMs() {
        return readTimeoutMs;
    }

    /**
     * @param readTimeoutMs read timeout in milliseconds; {@code null} restores the default
     */
    public void setReadTimeoutMs(Integer readTimeoutMs) {
        this.readTimeoutMs = readTimeoutMs == null ? DEFAULT_READ_TIMEOUT_MS : readTimeoutMs;
    }

    public Integer getWriteTimeoutMs() {
        return writeTimeoutMs;
    }

    /**
     * @param writeTimeoutMs write timeout in milliseconds; {@code null} restores the default
     */
    public void setWriteTimeoutMs(Integer writeTimeoutMs) {
        this.writeTimeoutMs = writeTimeoutMs == null ? DEFAULT_WRITE_TIMEOUT_MS : writeTimeoutMs;
    }

    public Integer getPollIntervalMs() {
        return pollIntervalMs;
    }

    /**
     * @param pollIntervalMs poll interval in milliseconds; {@code null} restores the default
     */
    public void setPollIntervalMs(Integer pollIntervalMs) {
        this.pollIntervalMs = pollIntervalMs == null ? DEFAULT_POLL_INTERVAL_MS : pollIntervalMs;
    }

    public Integer getResultTimeoutMs() {
        return resultTimeoutMs;
    }

    /**
     * @param resultTimeoutMs result timeout in milliseconds; {@code null} restores the default
     */
    public void setResultTimeoutMs(Integer resultTimeoutMs) {
        this.resultTimeoutMs = resultTimeoutMs == null ? DEFAULT_RESULT_TIMEOUT_MS : resultTimeoutMs;
    }

    public String getDefaultBackend() {
        return defaultBackend;
    }

    public void setDefaultBackend(String defaultBackend) {
        this.defaultBackend = defaultBackend;
    }

    public String getDefaultParseMethod() {
        return defaultParseMethod;
    }

    public void setDefaultParseMethod(String defaultParseMethod) {
        this.defaultParseMethod = defaultParseMethod;
    }

    public List<String> getDefaultLangList() {
        return defaultLangList;
    }

    /**
     * @param defaultLangList default languages; {@code null} restores the built-in default.
     *                        The list is copied, so later changes to the argument do not
     *                        leak into this configuration.
     */
    public void setDefaultLangList(List<String> defaultLangList) {
        this.defaultLangList = defaultLangList == null
                ? newDefaultLangList()
                : new ArrayList<String>(defaultLangList);
    }

    public Boolean getDefaultFormulaEnable() {
        return defaultFormulaEnable;
    }

    public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) {
        this.defaultFormulaEnable = defaultFormulaEnable;
    }

    public Boolean getDefaultTableEnable() {
        return defaultTableEnable;
    }

    public void setDefaultTableEnable(Boolean defaultTableEnable) {
        this.defaultTableEnable = defaultTableEnable;
    }
}

View File

@@ -1,43 +0,0 @@
package com.easyagents.document.pdf.mineru;
import com.alibaba.fastjson2.JSONObject;
import java.util.LinkedHashMap;
import java.util.Map;
/**
* MinerU 结果载荷。
*
* @author Codex
* @since 2026-04-14
*/
/**
 * Result payload as returned by MinerU: the backend and version that produced it, plus the
 * raw JSON results by key.
 *
 * @author Codex
 * @since 2026-04-14
 */
public class MineruResultPayload {

    private String backend;
    private String version;
    /** Raw results in insertion order; never {@code null}. */
    private Map<String, JSONObject> results = new LinkedHashMap<String, JSONObject>();

    public String getBackend() {
        return backend;
    }

    public void setBackend(String backend) {
        this.backend = backend;
    }

    public String getVersion() {
        return version;
    }

    public void setVersion(String version) {
        this.version = version;
    }

    public Map<String, JSONObject> getResults() {
        return results;
    }

    /**
     * @param results raw results; {@code null} is replaced by an empty map
     */
    public void setResults(Map<String, JSONObject> results) {
        if (results == null) {
            this.results = new LinkedHashMap<String, JSONObject>();
            return;
        }
        this.results = results;
    }
}

View File

@@ -1,131 +0,0 @@
package com.easyagents.document.pdf.mineru;
import java.util.ArrayList;
import java.util.List;
/**
* MinerU 原始任务状态。
*
* @author Codex
* @since 2026-04-14
*/
/**
 * Task status in the raw form reported by MinerU.
 *
 * @author Codex
 * @since 2026-04-14
 */
public class MineruTaskStatus {

    private String taskId;
    private String status;
    private String backend;
    /** File names of the task; never {@code null}. */
    private List<String> fileNames = new ArrayList<String>();
    private String createdAt;
    private String startedAt;
    private String completedAt;
    private String error;
    private String statusUrl;
    private String resultUrl;
    private Integer queuedAhead;
    private String version;
    private String message;

    public String getTaskId() {
        return taskId;
    }

    public void setTaskId(String taskId) {
        this.taskId = taskId;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBackend() {
        return backend;
    }

    public void setBackend(String backend) {
        this.backend = backend;
    }

    public List<String> getFileNames() {
        return fileNames;
    }

    /**
     * @param fileNames file names; {@code null} is replaced by an empty list
     */
    public void setFileNames(List<String> fileNames) {
        if (fileNames != null) {
            this.fileNames = fileNames;
        } else {
            this.fileNames = new ArrayList<String>();
        }
    }

    public String getCreatedAt() {
        return createdAt;
    }

    public void setCreatedAt(String createdAt) {
        this.createdAt = createdAt;
    }

    public String getStartedAt() {
        return startedAt;
    }

    public void setStartedAt(String startedAt) {
        this.startedAt = startedAt;
    }

    public String getCompletedAt() {
        return completedAt;
    }

    public void setCompletedAt(String completedAt) {
        this.completedAt = completedAt;
    }

    public String getError() {
        return error;
    }

    public void setError(String error) {
        this.error = error;
    }

    public String getStatusUrl() {
        return statusUrl;
    }

    public void setStatusUrl(String statusUrl) {
        this.statusUrl = statusUrl;
    }

    public String getResultUrl() {
        return resultUrl;
    }

    public void setResultUrl(String resultUrl) {
        this.resultUrl = resultUrl;
    }

    public Integer getQueuedAhead() {
        return queuedAhead;
    }

    public void setQueuedAhead(Integer queuedAhead) {
        this.queuedAhead = queuedAhead;
    }

    public String getVersion() {
        return version;
    }

    public void setVersion(String version) {
        this.version = version;
    }

    public String getMessage() {
        return message;
    }

    public void setMessage(String message) {
        this.message = message;
    }
}

View File

@@ -2,10 +2,13 @@ package com.easyagents.document.pdf.mineru;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.document.core.mineru.MineruMapper;
import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.core.mineru.MineruResultPayload;
import com.easyagents.document.core.exception.DocumentParseException;
import com.easyagents.document.core.model.ParseRequest;
import com.easyagents.document.core.model.ParseResponse;
import com.easyagents.document.core.model.ParseResult;
import com.easyagents.document.core.entity.ParseRequest;
import com.easyagents.document.core.entity.ParseResponse;
import com.easyagents.document.core.entity.ParseResult;
import org.junit.Assert;
import org.junit.Test;
@@ -41,6 +44,7 @@ public class MineruMapperTest {
Assert.assertFalse(result.getBlocks().isEmpty());
Assert.assertEquals(1, result.getTables().size());
Assert.assertEquals(2, result.getImages().size());
Assert.assertNotNull(result.getImages().get(0).getContent());
Assert.assertNotNull(result.getArtifacts().getMiddleJson());
Assert.assertNotNull(result.getArtifacts().getContentList());
}
@@ -56,6 +60,7 @@ public class MineruMapperTest {
Assert.assertEquals("# title", result.getPlainText());
Assert.assertEquals(1, result.getTables().size());
Assert.assertEquals(2, result.getImages().size());
Assert.assertNotNull(result.getImages().get(0).getContent());
Assert.assertNotNull(result.getArtifacts().getExtraJsonArtifacts().get("contentListV2"));
}

View File

@@ -1,11 +1,16 @@
package com.easyagents.document.pdf.mineru;
import com.alibaba.fastjson2.JSONObject;
import com.easyagents.document.core.model.ParseFile;
import com.easyagents.document.core.model.ParseRequest;
import com.easyagents.document.core.model.ParseResponse;
import com.easyagents.document.core.model.ParseTaskInfo;
import com.easyagents.document.core.model.ParseTaskStatus;
import com.easyagents.document.core.mineru.MineruClient;
import com.easyagents.document.core.mineru.MineruMapper;
import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.core.mineru.MineruResultPayload;
import com.easyagents.document.core.mineru.MineruTaskStatus;
import com.easyagents.document.core.entity.ParseFile;
import com.easyagents.document.core.entity.ParseRequest;
import com.easyagents.document.core.entity.ParseResponse;
import com.easyagents.document.core.entity.ParseTaskInfo;
import com.easyagents.document.core.entity.ParseTaskStatus;
import okhttp3.Request;
import okio.Buffer;
import org.junit.Assert;
@@ -147,7 +152,7 @@ public class MineruPdfDocumentParseServiceTest {
return count;
}
private static class RecordingClient extends MineruPdfClient {
private static class RecordingClient extends MineruClient {
private ParseRequest lastParseRequest;
private ParseRequest lastSubmitRequest;
@@ -248,7 +253,7 @@ public class MineruPdfDocumentParseServiceTest {
}
}
private static class InspectingMultipartClient extends MineruPdfClient {
private static class InspectingMultipartClient extends MineruClient {
private String lastMultipartBody;