refactor: 收敛文档 OCR 配置并补齐 Office 解析兼容

- 统一 MinerU OCR 配置结构并移除分模块冗余属性类

- 补齐 JSON 字符串化结果拆包、XLSX 图片兼容与对应回归测试
This commit is contained in:
2026-04-18 13:01:17 +08:00
parent b66876d0fd
commit 56ee149e7c
15 changed files with 559 additions and 272 deletions

View File

@@ -12,7 +12,7 @@ import java.util.List;
* @author Codex
* @since 2026-04-16
*/
@ConfigurationProperties(prefix = "easy-agents.document.mineru")
@ConfigurationProperties(prefix = "easy-agents.document.ocr.mineru")
public class CommonMineruDocumentProperties {
private String baseUrl;

View File

@@ -1,119 +0,0 @@
package com.easyagents.spring.boot.document.pdf.mineru;
import org.springframework.boot.context.properties.ConfigurationProperties;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Spring Boot configuration properties for the MinerU PDF integration,
 * bound from the {@code easy-agents.document.pdf.mineru} prefix.
 *
 * @author Codex
 * @since 2026-04-14
 */
@ConfigurationProperties(prefix = "easy-agents.document.pdf.mineru")
public class MineruDocumentProperties {

    /** Has no default; remains {@code null} until it is bound or set. */
    private String baseUrl;

    // Timing settings; the "Ms" suffix suggests milliseconds.
    private Integer connectTimeoutMs = 3_000;
    private Integer readTimeoutMs = 600_000;
    private Integer writeTimeoutMs = 600_000;
    private Integer pollIntervalMs = 1_000;
    private Integer resultTimeoutMs = 1_800_000;

    // Defaults applied when the corresponding property is not configured.
    private String defaultBackend = "vlm-http-client";
    private String defaultParseMethod = "auto";
    private List<String> defaultLangList = newDefaultLangList();
    private Boolean defaultFormulaEnable = Boolean.TRUE;
    private Boolean defaultTableEnable = Boolean.TRUE;

    /**
     * Creates a fresh, mutable list holding the single default language code {@code "ch"}.
     *
     * @return a new list instance on every call
     */
    private static List<String> newDefaultLangList() {
        return new ArrayList<String>(Arrays.asList("ch"));
    }

    /** @return the configured base URL, or {@code null} when none was set */
    public String getBaseUrl() {
        return this.baseUrl;
    }

    /** @param baseUrl the base URL to store */
    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /** @return the connect timeout (3000 unless overridden) */
    public Integer getConnectTimeoutMs() {
        return this.connectTimeoutMs;
    }

    /** @param connectTimeoutMs the connect timeout to store */
    public void setConnectTimeoutMs(Integer connectTimeoutMs) {
        this.connectTimeoutMs = connectTimeoutMs;
    }

    /** @return the read timeout (600000 unless overridden) */
    public Integer getReadTimeoutMs() {
        return this.readTimeoutMs;
    }

    /** @param readTimeoutMs the read timeout to store */
    public void setReadTimeoutMs(Integer readTimeoutMs) {
        this.readTimeoutMs = readTimeoutMs;
    }

    /** @return the write timeout (600000 unless overridden) */
    public Integer getWriteTimeoutMs() {
        return this.writeTimeoutMs;
    }

    /** @param writeTimeoutMs the write timeout to store */
    public void setWriteTimeoutMs(Integer writeTimeoutMs) {
        this.writeTimeoutMs = writeTimeoutMs;
    }

    /** @return the poll interval (1000 unless overridden) */
    public Integer getPollIntervalMs() {
        return this.pollIntervalMs;
    }

    /** @param pollIntervalMs the poll interval to store */
    public void setPollIntervalMs(Integer pollIntervalMs) {
        this.pollIntervalMs = pollIntervalMs;
    }

    /** @return the result timeout (1800000 unless overridden) */
    public Integer getResultTimeoutMs() {
        return this.resultTimeoutMs;
    }

    /** @param resultTimeoutMs the result timeout to store */
    public void setResultTimeoutMs(Integer resultTimeoutMs) {
        this.resultTimeoutMs = resultTimeoutMs;
    }

    /** @return the default backend ({@code "vlm-http-client"} unless overridden) */
    public String getDefaultBackend() {
        return this.defaultBackend;
    }

    /** @param defaultBackend the default backend to store */
    public void setDefaultBackend(String defaultBackend) {
        this.defaultBackend = defaultBackend;
    }

    /** @return the default parse method ({@code "auto"} unless overridden) */
    public String getDefaultParseMethod() {
        return this.defaultParseMethod;
    }

    /** @param defaultParseMethod the default parse method to store */
    public void setDefaultParseMethod(String defaultParseMethod) {
        this.defaultParseMethod = defaultParseMethod;
    }

    /** @return the default language list; never {@code null} when set through the setter */
    public List<String> getDefaultLangList() {
        return this.defaultLangList;
    }

    /**
     * Stores the given language list as-is; a {@code null} argument resets the
     * value to a fresh list containing only {@code "ch"}.
     *
     * @param defaultLangList the language list to store, may be {@code null}
     */
    public void setDefaultLangList(List<String> defaultLangList) {
        if (defaultLangList == null) {
            this.defaultLangList = newDefaultLangList();
            return;
        }
        this.defaultLangList = defaultLangList;
    }

    /** @return the formula flag ({@code true} unless overridden) */
    public Boolean getDefaultFormulaEnable() {
        return this.defaultFormulaEnable;
    }

    /** @param defaultFormulaEnable the formula flag to store */
    public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) {
        this.defaultFormulaEnable = defaultFormulaEnable;
    }

    /** @return the table flag ({@code true} unless overridden) */
    public Boolean getDefaultTableEnable() {
        return this.defaultTableEnable;
    }

    /** @param defaultTableEnable the table flag to store */
    public void setDefaultTableEnable(Boolean defaultTableEnable) {
        this.defaultTableEnable = defaultTableEnable;
    }
}

View File

@@ -10,8 +10,11 @@ import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.beans.factory.config.BeanFactoryPostProcessor;
import org.springframework.beans.factory.support.BeanDefinitionRegistry;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Primary;
/**
* MinerU PDF 文档解析自动装配。
@@ -21,50 +24,94 @@ import org.springframework.context.annotation.Configuration;
*/
@Configuration(proxyBeanMethods = false)
@ConditionalOnClass(MineruPdfDocumentParseService.class)
@ConditionalOnProperty(prefix = "easy-agents.document.pdf", name = "provider", havingValue = "mineru")
@EnableConfigurationProperties({MineruDocumentProperties.class, CommonMineruDocumentProperties.class})
@ConditionalOnProperty(prefix = "easy-agents.document.ocr", name = "provider", havingValue = "mineru")
@EnableConfigurationProperties(CommonMineruDocumentProperties.class)
public class MineruPdfAutoConfiguration {
public static final String DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME = "documentParseService";
/**
* Registers the unified PDF parse service, implemented by {@link MineruPdfDocumentParseService}.
* The bean is marked primary and is only created when no other
* {@link PdfDocumentParseService} bean is present.
*
* @param properties Spring Boot configuration
* @param commonProperties Spring Boot configuration
* @return the PDF parse service
*/
@Bean
@Primary
@ConditionalOnMissingBean(PdfDocumentParseService.class)
public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties,
CommonMineruDocumentProperties commonProperties) {
return new MineruPdfDocumentParseService(toMineruProperties(properties, commonProperties));
public PdfDocumentParseService pdfDocumentParseService(CommonMineruDocumentProperties commonProperties) {
return new MineruPdfDocumentParseService(toMineruProperties(commonProperties));
}
/**
* 将 PDF 服务以统一文档解析服务类型暴露,便于调用方直接按抽象注入
* 将默认文档解析服务名注册为 PDF 服务别名,避免重复创建同类型 Bean
* 这里显式走 alias而不是第二个 {@link DocumentParseService} Bean
* 这样既能保持默认契约,也不会破坏按 {@link PdfDocumentParseService} 类型的唯一注入。
*
* @param pdfDocumentParseService PDF 解析服务
* @return 统一文档解析服务
* @return BeanFactory 后置处理器
*/
@Bean
@ConditionalOnMissingBean(DocumentParseService.class)
public DocumentParseService documentParseService(PdfDocumentParseService pdfDocumentParseService) {
return pdfDocumentParseService;
public static BeanFactoryPostProcessor defaultDocumentParseServiceAliasPostProcessor() {
    return factory -> {
        // Aliases and primary flags are only editable through the registry view of the factory.
        if (!(factory instanceof BeanDefinitionRegistry)) {
            return;
        }
        BeanDefinitionRegistry definitions = (BeanDefinitionRegistry) factory;

        String pdfBeanName = resolveAliasTarget(factory, definitions);
        // Stop when no single PDF service could be selected, or when the default
        // name is already taken by a bean definition or by an existing alias.
        if (!StringUtil.hasText(pdfBeanName)
                || definitions.containsBeanDefinition(DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME)
                || definitions.isAlias(DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME)) {
            return;
        }

        // Mark the target as primary when its definition lives in this registry
        // and has not been flagged yet.
        boolean lacksPrimaryFlag = definitions.containsBeanDefinition(pdfBeanName)
                && !definitions.getBeanDefinition(pdfBeanName).isPrimary();
        if (lacksPrimaryFlag) {
            definitions.getBeanDefinition(pdfBeanName).setPrimary(true);
        }

        definitions.registerAlias(pdfBeanName, DEFAULT_DOCUMENT_PARSE_SERVICE_BEAN_NAME);
    };
}
private MineruProperties toMineruProperties(MineruDocumentProperties properties,
CommonMineruDocumentProperties commonProperties) {
/**
 * Picks the bean name that the default document-parse-service alias should point to.
 *
 * <p>A lone {@link PdfDocumentParseService} candidate is returned directly. With several
 * candidates, the one whose bean definition in {@code registry} is flagged primary wins;
 * if none or more than one is primary, no target is chosen.
 *
 * @param beanFactory factory queried for {@link PdfDocumentParseService} bean names
 * @param registry registry consulted for the primary flag of each candidate
 * @return the chosen bean name, or {@code null} when no unambiguous target exists
 */
private static String resolveAliasTarget(org.springframework.beans.factory.config.ConfigurableListableBeanFactory beanFactory,
        BeanDefinitionRegistry registry) {
    String[] pdfBeanNames = beanFactory.getBeanNamesForType(PdfDocumentParseService.class, true, false);
    int candidateCount = pdfBeanNames == null ? 0 : pdfBeanNames.length;
    if (candidateCount <= 1) {
        return candidateCount == 1 ? pdfBeanNames[0] : null;
    }

    String chosen = null;
    for (String name : pdfBeanNames) {
        boolean flaggedPrimary = registry.containsBeanDefinition(name)
                && registry.getBeanDefinition(name).isPrimary();
        if (flaggedPrimary) {
            if (chosen != null) {
                // A second primary candidate makes the choice ambiguous.
                return null;
            }
            chosen = name;
        }
    }
    return chosen;
}
// Builds the MineruProperties passed to the parse service from the bound
// CommonMineruDocumentProperties. Values are copied only when commonProperties is
// non-null and StringUtil.hasText accepts its base URL (useCommon).
// NOTE(review): in the assignments that do not read `properties`, every setter receives
// null when useCommon is false, so any defaults MineruProperties may define would be
// overwritten — confirm this is intended.
// NOTE(review): the assignments that read `properties` use a name that is not a parameter
// of this signature; they look like pre-change diff lines shown beside their
// replacements — confirm against the real file.
private MineruProperties toMineruProperties(CommonMineruDocumentProperties commonProperties) {
MineruProperties mineruProperties = new MineruProperties();
boolean useCommon = commonProperties != null && StringUtil.hasText(commonProperties.getBaseUrl());
mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : properties.getBaseUrl());
mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : properties.getConnectTimeoutMs());
mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : properties.getReadTimeoutMs());
mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : properties.getWriteTimeoutMs());
mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : properties.getPollIntervalMs());
mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : properties.getResultTimeoutMs());
mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : properties.getDefaultBackend());
mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : properties.getDefaultParseMethod());
mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : properties.getDefaultLangList());
mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : properties.getDefaultFormulaEnable());
mineruProperties.setDefaultTableEnable(useCommon ? commonProperties.getDefaultTableEnable() : properties.getDefaultTableEnable());
mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : null);
mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : null);
mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : null);
mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : null);
mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : null);
mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : null);
mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : null);
mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : null);
mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : null);
mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : null);
mineruProperties.setDefaultTableEnable(useCommon ? commonProperties.getDefaultTableEnable() : null);
return mineruProperties;
}
}

View File

@@ -6,6 +6,7 @@ import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.pptx.PptxDocumentParseService;
import com.easyagents.document.pptx.mineru.MineruPptxDocumentParseService;
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
@@ -24,21 +25,24 @@ import java.util.concurrent.Executors;
*/
@Configuration(proxyBeanMethods = false)
@ConditionalOnClass(MineruPptxDocumentParseService.class)
@ConditionalOnProperty(prefix = "easy-agents.document.pptx", name = "enabled", havingValue = "true")
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, PptxDocumentProperties.class})
@ConditionalOnProperty(prefix = "easy-agents.document.ocr", name = "provider", havingValue = "mineru")
@EnableConfigurationProperties(CommonMineruDocumentProperties.class)
public class MineruPptxAutoConfiguration {
public static final String PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME = "pptxDocumentAsyncTaskManager";
private static final int DEFAULT_ASYNC_THREADS = 2;
// Registers the DocumentAsyncTaskManager used for PPTX parsing unless a bean with the
// same name already exists. It combines an InMemoryDocumentAsyncTaskRepository with a
// fixed-size thread pool.
// NOTE(review): two variants of the condition, signature and pool-size lines appear here
// (property-driven thread count vs. DEFAULT_ASYNC_THREADS) — presumably the old and new
// sides of the diff; confirm which one the real file contains.
// NOTE(review): nothing visible here shuts the ExecutorService down — verify that
// DocumentAsyncTaskManager releases it when the context closes.
@Bean
@ConditionalOnMissingBean(name = "pptxDocumentAsyncTaskManager")
public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager(PptxDocumentProperties properties) {
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads();
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
@ConditionalOnMissingBean(name = PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager() {
ExecutorService executorService = Executors.newFixedThreadPool(DEFAULT_ASYNC_THREADS);
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
}
// Registers the PPTX parse service unless another PptxDocumentParseService bean exists.
// The @Qualifier pins the injected task manager to the bean named by
// PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME, so a DocumentAsyncTaskManager registered
// under a different name is not picked up here.
@Bean
@ConditionalOnMissingBean(PptxDocumentParseService.class)
public PptxDocumentParseService pptxDocumentParseService(CommonMineruDocumentProperties commonProperties,
@Qualifier(PPTX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
DocumentAsyncTaskManager pptxDocumentAsyncTaskManager) {
// The bound properties are converted by toMineruProperties before being handed to the service.
return new MineruPptxDocumentParseService(toMineruProperties(commonProperties), pptxDocumentAsyncTaskManager);
}

View File

@@ -1,32 +0,0 @@
package com.easyagents.spring.boot.document.pptx;
import org.springframework.boot.context.properties.ConfigurationProperties;
/**
 * Configuration properties for PPTX document handling, bound from the
 * {@code easy-agents.document.pptx} prefix.
 *
 * @author Codex
 * @since 2026-04-16
 */
@ConfigurationProperties(prefix = "easy-agents.document.pptx")
public class PptxDocumentProperties {

    /** Feature switch; {@code false} unless configured. */
    private Boolean enabled = Boolean.FALSE;

    /** Thread count setting; {@code 2} unless configured. */
    private Integer asyncThreads = 2;

    /** @return the current value of the feature switch */
    public Boolean getEnabled() {
        return this.enabled;
    }

    /** @return the current thread count setting */
    public Integer getAsyncThreads() {
        return this.asyncThreads;
    }

    /** @param enabled the feature switch value to store */
    public void setEnabled(Boolean enabled) {
        this.enabled = enabled;
    }

    /** @param asyncThreads the thread count setting to store */
    public void setAsyncThreads(Integer asyncThreads) {
        this.asyncThreads = asyncThreads;
    }
}

View File

@@ -6,6 +6,7 @@ import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.xlsx.XlsxDocumentParseService;
import com.easyagents.document.xlsx.mineru.MineruXlsxDocumentParseService;
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
@@ -24,21 +25,24 @@ import java.util.concurrent.Executors;
*/
@Configuration(proxyBeanMethods = false)
@ConditionalOnClass(MineruXlsxDocumentParseService.class)
@ConditionalOnProperty(prefix = "easy-agents.document.xlsx", name = "enabled", havingValue = "true")
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, XlsxDocumentProperties.class})
@ConditionalOnProperty(prefix = "easy-agents.document.ocr", name = "provider", havingValue = "mineru")
@EnableConfigurationProperties(CommonMineruDocumentProperties.class)
public class MineruXlsxAutoConfiguration {
public static final String XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME = "xlsxDocumentAsyncTaskManager";
private static final int DEFAULT_ASYNC_THREADS = 2;
// Registers the DocumentAsyncTaskManager used for XLSX parsing unless a bean with the
// same name already exists. It combines an InMemoryDocumentAsyncTaskRepository with a
// fixed-size thread pool.
// NOTE(review): two variants of the condition, signature and pool-size lines appear here
// (property-driven thread count vs. DEFAULT_ASYNC_THREADS) — presumably the old and new
// sides of the diff; confirm which one the real file contains.
// NOTE(review): nothing visible here shuts the ExecutorService down — verify that
// DocumentAsyncTaskManager releases it when the context closes.
@Bean
@ConditionalOnMissingBean(name = "xlsxDocumentAsyncTaskManager")
public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager(XlsxDocumentProperties properties) {
int threadCount = properties.getAsyncThreads() == null || properties.getAsyncThreads() <= 0 ? 2 : properties.getAsyncThreads();
ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
@ConditionalOnMissingBean(name = XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager() {
ExecutorService executorService = Executors.newFixedThreadPool(DEFAULT_ASYNC_THREADS);
return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), executorService);
}
// Registers the XLSX parse service unless another XlsxDocumentParseService bean exists.
// The @Qualifier pins the injected task manager to the bean named by
// XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME, so a DocumentAsyncTaskManager registered
// under a different name is not picked up here.
@Bean
@ConditionalOnMissingBean(XlsxDocumentParseService.class)
public XlsxDocumentParseService xlsxDocumentParseService(CommonMineruDocumentProperties commonProperties,
@Qualifier(XLSX_DOCUMENT_ASYNC_TASK_MANAGER_BEAN_NAME)
DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager) {
// The bound properties are converted by toMineruProperties before being handed to the service.
return new MineruXlsxDocumentParseService(toMineruProperties(commonProperties), xlsxDocumentAsyncTaskManager);
}

View File

@@ -1,32 +0,0 @@
package com.easyagents.spring.boot.document.xlsx;
import org.springframework.boot.context.properties.ConfigurationProperties;
/**
 * Configuration properties for XLSX document handling, bound from the
 * {@code easy-agents.document.xlsx} prefix.
 *
 * @author Codex
 * @since 2026-04-16
 */
@ConfigurationProperties(prefix = "easy-agents.document.xlsx")
public class XlsxDocumentProperties {

    /** Feature switch; {@code false} unless configured. */
    private Boolean enabled = Boolean.FALSE;

    /** Thread count setting; {@code 2} unless configured. */
    private Integer asyncThreads = 2;

    /** @return the current value of the feature switch */
    public Boolean getEnabled() {
        return this.enabled;
    }

    /** @return the current thread count setting */
    public Integer getAsyncThreads() {
        return this.asyncThreads;
    }

    /** @param enabled the feature switch value to store */
    public void setEnabled(Boolean enabled) {
        this.enabled = enabled;
    }

    /** @param asyncThreads the thread count setting to store */
    public void setAsyncThreads(Integer asyncThreads) {
        this.asyncThreads = asyncThreads;
    }
}

View File

@@ -1,6 +1,10 @@
package com.easyagents.spring.boot.autoconfigure;
import com.easyagents.document.core.DocumentParseService;
import com.easyagents.document.core.entity.ParseRequest;
import com.easyagents.document.core.entity.ParseResponse;
import com.easyagents.document.core.entity.ParseTaskInfo;
import com.easyagents.document.core.entity.ParseTaskStatus;
import com.easyagents.document.pdf.PdfDocumentParseService;
import com.easyagents.document.pptx.PptxDocumentParseService;
import com.easyagents.document.xlsx.XlsxDocumentParseService;
@@ -13,6 +17,8 @@ import com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration;
import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration;
import org.junit.Assert;
import org.junit.Test;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.boot.test.context.runner.ApplicationContextRunner;
public class StarterConditionalAutoConfigurationTest {
@@ -49,27 +55,105 @@ public class StarterConditionalAutoConfigurationTest {
// Verifies that PDF, PPTX, XLSX and generic DocumentParseService beans can all be
// resolved once the MinerU provider and base URL are configured.
// NOTE(review): both the `document.pdf.*` and the `document.ocr.*` property pairs are
// listed and no comma separates them — presumably the old and new sides of the diff;
// confirm which pair the real test uses.
public void shouldCreateMineruDocumentBeansWhenConfigured() {
contextRunner
.withPropertyValues(
"easy-agents.document.pdf.provider=mineru",
"easy-agents.document.pdf.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
"easy-agents.document.ocr.provider=mineru",
"easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
)
.run(context -> {
Assert.assertNotNull(context.getBean(PdfDocumentParseService.class));
Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
Assert.assertNotNull(context.getBean(DocumentParseService.class));
});
}
// Verifies that the PPTX and XLSX parse services are registered for the configured
// properties.
// NOTE(review): two method signatures, two property sets and two contradictory
// expectations about "documentParseService"/DocumentParseService appear here —
// presumably the old and new sides of the diff; confirm which lines the real test keeps.
@Test
public void shouldCreatePptxAndXlsxBeansWhenEnabled() {
public void shouldCreatePptxAndXlsxBeansWhenMineruOcrConfigured() {
contextRunner
.withPropertyValues(
"easy-agents.document.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api",
"easy-agents.document.pptx.enabled=true",
"easy-agents.document.xlsx.enabled=true"
"easy-agents.document.ocr.provider=mineru",
"easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
)
.run(context -> {
Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
Assert.assertFalse(context.containsBean("documentParseService"));
Assert.assertNotNull(context.getBean(DocumentParseService.class));
});
}
/**
 * With the MinerU OCR provider configured, the PDF service must be the instance
 * returned both for the {@code documentParseService} bean name and for a lookup by
 * {@link DocumentParseService} type, while the PPTX and XLSX services still exist.
 */
@Test
public void shouldKeepPdfAsDefaultDocumentParseServiceWhenMineruOcrConfigured() {
    String[] mineruOcrSettings = {
            "easy-agents.document.ocr.provider=mineru",
            "easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
    };

    contextRunner.withPropertyValues(mineruOcrSettings).run(context -> {
        PdfDocumentParseService pdfService = context.getBean(PdfDocumentParseService.class);
        Assert.assertNotNull(pdfService);

        // The Office services are registered alongside the PDF one.
        Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
        Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));

        // Lookup by default name and by the shared abstraction both yield the PDF service.
        Object byDefaultName = context.getBean("documentParseService");
        Assert.assertSame(pdfService, byDefaultName);
        DocumentParseService byAbstractType = context.getBean(DocumentParseService.class);
        Assert.assertSame(pdfService, byAbstractType);
    });
}
/**
 * A user-supplied PDF service registered under a custom bean name must still be
 * reachable through the {@code documentParseService} name and through a lookup by
 * {@link DocumentParseService} type.
 */
@Test
public void shouldAliasCustomNamedPdfServiceAsDefaultDocumentParseService() {
    // The user configuration is added first, followed by the configurations under test.
    ApplicationContextRunner customRunner = new ApplicationContextRunner()
            .withUserConfiguration(CustomPdfParseServiceConfiguration.class)
            .withUserConfiguration(
                    RagIngestionAutoConfiguration.class,
                    OllamaAutoConfiguration.class,
                    OpenSearchAutoConfiguration.class,
                    MineruPdfAutoConfiguration.class,
                    MineruPptxAutoConfiguration.class,
                    MineruXlsxAutoConfiguration.class
            )
            .withPropertyValues(
                    "easy-agents.document.ocr.provider=mineru",
                    "easy-agents.document.ocr.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api"
            );

    customRunner.run(context -> {
        PdfDocumentParseService pdfService = context.getBean(PdfDocumentParseService.class);

        Object byDefaultName = context.getBean("documentParseService");
        Assert.assertSame(pdfService, byDefaultName);

        DocumentParseService byAbstractType = context.getBean(DocumentParseService.class);
        Assert.assertSame(pdfService, byAbstractType);
    });
}
/**
 * Test-only configuration that contributes a PDF parse service under the
 * non-default bean name {@code customPdfService}.
 */
@Configuration(proxyBeanMethods = false)
static class CustomPdfParseServiceConfiguration {

    /** @return a no-op PDF parse service registered as {@code customPdfService} */
    @Bean("customPdfService")
    PdfDocumentParseService customPdfService() {
        NoopPdfDocumentParseService stub = new NoopPdfDocumentParseService();
        return stub;
    }
}
/**
 * Stub {@link PdfDocumentParseService} for tests: every operation ignores its
 * argument and returns a freshly constructed, otherwise untouched result object.
 */
static class NoopPdfDocumentParseService implements PdfDocumentParseService {

    // --- operations returning a ParseResponse ---

    /** @return a new empty {@link ParseResponse}; {@code request} is not inspected */
    @Override
    public ParseResponse parse(ParseRequest request) {
        return new ParseResponse();
    }

    /** @return a new empty {@link ParseResponse}; {@code taskId} is not inspected */
    @Override
    public ParseResponse queryResult(String taskId) {
        return new ParseResponse();
    }

    // --- operations returning a ParseTaskStatus ---

    /** @return a new empty {@link ParseTaskStatus}; {@code request} is not inspected */
    @Override
    public ParseTaskStatus submit(ParseRequest request) {
        return new ParseTaskStatus();
    }

    /** @return a new empty {@link ParseTaskStatus}; {@code taskId} is not inspected */
    @Override
    public ParseTaskStatus queryTask(String taskId) {
        return new ParseTaskStatus();
    }

    // --- operations returning a ParseTaskInfo ---

    /** @return a new empty {@link ParseTaskInfo}; {@code taskId} is not inspected */
    @Override
    public ParseTaskInfo queryTaskInfo(String taskId) {
        return new ParseTaskInfo();
    }
}
}