feat: 扩展 Office 文档解析能力

- 重构 document-core 与 MinerU 公共层,补齐 Office 异步任务基础设施

- 新增 PPTX/XLSX 解析模块与 starter 自动装配

- 补充 README 与相关测试覆盖
This commit is contained in:
2026-04-16 21:51:16 +08:00
parent 547d4f6ee0
commit b66876d0fd
66 changed files with 4015 additions and 296 deletions

View File

@@ -61,6 +61,16 @@
<artifactId>easy-agents-document-pdf</artifactId>
</dependency>
<dependency>
<groupId>com.easyagents</groupId>
<artifactId>easy-agents-document-pptx</artifactId>
</dependency>
<dependency>
<groupId>com.easyagents</groupId>
<artifactId>easy-agents-document-xlsx</artifactId>
</dependency>
<dependency>
<groupId>com.easyagents</groupId>
<artifactId>easy-agents-rag-ingestion</artifactId>

View File

@@ -0,0 +1,119 @@
package com.easyagents.spring.boot.document.mineru;
import org.springframework.boot.context.properties.ConfigurationProperties;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * MinerU settings shared by the document parsers, bound from the
 * {@code easy-agents.document.mineru} configuration prefix.
 *
 * <p>All values are exposed through plain getter/setter pairs. Fields with an
 * initializer keep that value when the corresponding property is not configured.
 *
 * @author Codex
 * @since 2026-04-16
 */
@ConfigurationProperties(prefix = "easy-agents.document.mineru")
public class CommonMineruDocumentProperties {

    /** Base URL of the MinerU service; has no default. */
    private String baseUrl;

    /** Connect timeout, in milliseconds. */
    private Integer connectTimeoutMs = 3_000;

    /** Read timeout, in milliseconds. */
    private Integer readTimeoutMs = 600_000;

    /** Write timeout, in milliseconds. */
    private Integer writeTimeoutMs = 600_000;

    /** Poll interval, in milliseconds. */
    private Integer pollIntervalMs = 1_000;

    /** Result timeout, in milliseconds. */
    private Integer resultTimeoutMs = 1_800_000;

    /** Backend name used when none is given explicitly. */
    private String defaultBackend = "vlm-http-client";

    /** Parse method used when none is given explicitly. */
    private String defaultParseMethod = "auto";

    /** Language list used when none is given explicitly. */
    private List<String> defaultLangList = newDefaultLangList();

    /** Default for the formula switch. */
    private Boolean defaultFormulaEnable = true;

    /** Default for the table switch. */
    private Boolean defaultTableEnable = true;

    /**
     * Builds a fresh, mutable language list holding the single default entry.
     *
     * @return a new list containing {@code "ch"}
     */
    private static List<String> newDefaultLangList() {
        return new ArrayList<>(Arrays.asList("ch"));
    }

    public String getBaseUrl() {
        return this.baseUrl;
    }

    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    public Integer getConnectTimeoutMs() {
        return this.connectTimeoutMs;
    }

    public void setConnectTimeoutMs(Integer connectTimeoutMs) {
        this.connectTimeoutMs = connectTimeoutMs;
    }

    public Integer getReadTimeoutMs() {
        return this.readTimeoutMs;
    }

    public void setReadTimeoutMs(Integer readTimeoutMs) {
        this.readTimeoutMs = readTimeoutMs;
    }

    public Integer getWriteTimeoutMs() {
        return this.writeTimeoutMs;
    }

    public void setWriteTimeoutMs(Integer writeTimeoutMs) {
        this.writeTimeoutMs = writeTimeoutMs;
    }

    public Integer getPollIntervalMs() {
        return this.pollIntervalMs;
    }

    public void setPollIntervalMs(Integer pollIntervalMs) {
        this.pollIntervalMs = pollIntervalMs;
    }

    public Integer getResultTimeoutMs() {
        return this.resultTimeoutMs;
    }

    public void setResultTimeoutMs(Integer resultTimeoutMs) {
        this.resultTimeoutMs = resultTimeoutMs;
    }

    public String getDefaultBackend() {
        return this.defaultBackend;
    }

    public void setDefaultBackend(String defaultBackend) {
        this.defaultBackend = defaultBackend;
    }

    public String getDefaultParseMethod() {
        return this.defaultParseMethod;
    }

    public void setDefaultParseMethod(String defaultParseMethod) {
        this.defaultParseMethod = defaultParseMethod;
    }

    public List<String> getDefaultLangList() {
        return this.defaultLangList;
    }

    /**
     * Replaces the default language list.
     *
     * @param defaultLangList the new list, stored as-is; {@code null} restores the
     *                        built-in single-entry default
     */
    public void setDefaultLangList(List<String> defaultLangList) {
        if (defaultLangList == null) {
            this.defaultLangList = newDefaultLangList();
        } else {
            this.defaultLangList = defaultLangList;
        }
    }

    public Boolean getDefaultFormulaEnable() {
        return this.defaultFormulaEnable;
    }

    public void setDefaultFormulaEnable(Boolean defaultFormulaEnable) {
        this.defaultFormulaEnable = defaultFormulaEnable;
    }

    public Boolean getDefaultTableEnable() {
        return this.defaultTableEnable;
    }

    public void setDefaultTableEnable(Boolean defaultTableEnable) {
        this.defaultTableEnable = defaultTableEnable;
    }
}

View File

@@ -1,9 +1,11 @@
package com.easyagents.spring.boot.document.pdf.mineru;
import com.easyagents.document.core.DocumentParseService;
import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.pdf.PdfDocumentParseService;
import com.easyagents.document.pdf.mineru.MineruPdfDocumentParseService;
import com.easyagents.document.pdf.mineru.MineruProperties;
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
import com.easyagents.core.util.StringUtil;
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
@@ -20,7 +22,7 @@ import org.springframework.context.annotation.Configuration;
@Configuration(proxyBeanMethods = false)
@ConditionalOnClass(MineruPdfDocumentParseService.class)
@ConditionalOnProperty(prefix = "easy-agents.document.pdf", name = "provider", havingValue = "mineru")
@EnableConfigurationProperties(MineruDocumentProperties.class)
@EnableConfigurationProperties({MineruDocumentProperties.class, CommonMineruDocumentProperties.class})
public class MineruPdfAutoConfiguration {
/**
@@ -31,8 +33,9 @@ public class MineruPdfAutoConfiguration {
*/
@Bean
@ConditionalOnMissingBean(PdfDocumentParseService.class)
public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties) {
return new MineruPdfDocumentParseService(toMineruProperties(properties));
public PdfDocumentParseService pdfDocumentParseService(MineruDocumentProperties properties,
CommonMineruDocumentProperties commonProperties) {
return new MineruPdfDocumentParseService(toMineruProperties(properties, commonProperties));
}
/**
@@ -47,19 +50,21 @@ public class MineruPdfAutoConfiguration {
return pdfDocumentParseService;
}
private MineruProperties toMineruProperties(MineruDocumentProperties properties) {
private MineruProperties toMineruProperties(MineruDocumentProperties properties,
CommonMineruDocumentProperties commonProperties) {
MineruProperties mineruProperties = new MineruProperties();
mineruProperties.setBaseUrl(properties.getBaseUrl());
mineruProperties.setConnectTimeoutMs(properties.getConnectTimeoutMs());
mineruProperties.setReadTimeoutMs(properties.getReadTimeoutMs());
mineruProperties.setWriteTimeoutMs(properties.getWriteTimeoutMs());
mineruProperties.setPollIntervalMs(properties.getPollIntervalMs());
mineruProperties.setResultTimeoutMs(properties.getResultTimeoutMs());
mineruProperties.setDefaultBackend(properties.getDefaultBackend());
mineruProperties.setDefaultParseMethod(properties.getDefaultParseMethod());
mineruProperties.setDefaultLangList(properties.getDefaultLangList());
mineruProperties.setDefaultFormulaEnable(properties.getDefaultFormulaEnable());
mineruProperties.setDefaultTableEnable(properties.getDefaultTableEnable());
boolean useCommon = commonProperties != null && StringUtil.hasText(commonProperties.getBaseUrl());
mineruProperties.setBaseUrl(useCommon ? commonProperties.getBaseUrl() : properties.getBaseUrl());
mineruProperties.setConnectTimeoutMs(useCommon ? commonProperties.getConnectTimeoutMs() : properties.getConnectTimeoutMs());
mineruProperties.setReadTimeoutMs(useCommon ? commonProperties.getReadTimeoutMs() : properties.getReadTimeoutMs());
mineruProperties.setWriteTimeoutMs(useCommon ? commonProperties.getWriteTimeoutMs() : properties.getWriteTimeoutMs());
mineruProperties.setPollIntervalMs(useCommon ? commonProperties.getPollIntervalMs() : properties.getPollIntervalMs());
mineruProperties.setResultTimeoutMs(useCommon ? commonProperties.getResultTimeoutMs() : properties.getResultTimeoutMs());
mineruProperties.setDefaultBackend(useCommon ? commonProperties.getDefaultBackend() : properties.getDefaultBackend());
mineruProperties.setDefaultParseMethod(useCommon ? commonProperties.getDefaultParseMethod() : properties.getDefaultParseMethod());
mineruProperties.setDefaultLangList(useCommon ? commonProperties.getDefaultLangList() : properties.getDefaultLangList());
mineruProperties.setDefaultFormulaEnable(useCommon ? commonProperties.getDefaultFormulaEnable() : properties.getDefaultFormulaEnable());
mineruProperties.setDefaultTableEnable(useCommon ? commonProperties.getDefaultTableEnable() : properties.getDefaultTableEnable());
return mineruProperties;
}
}

View File

@@ -0,0 +1,61 @@
package com.easyagents.spring.boot.document.pptx;
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.pptx.PptxDocumentParseService;
import com.easyagents.document.pptx.mineru.MineruPptxDocumentParseService;
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Auto-configuration for MinerU-backed PPTX parsing.
 *
 * <p>Applies only when {@link MineruPptxDocumentParseService} is on the classpath and
 * {@code easy-agents.document.pptx.enabled} is set to {@code true}.
 *
 * @author Codex
 * @since 2026-04-16
 */
@Configuration(proxyBeanMethods = false)
@ConditionalOnClass(MineruPptxDocumentParseService.class)
@ConditionalOnProperty(prefix = "easy-agents.document.pptx", name = "enabled", havingValue = "true")
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, PptxDocumentProperties.class})
public class MineruPptxAutoConfiguration {

    /** Pool size used when {@code asyncThreads} is unset or not positive. */
    private static final int FALLBACK_ASYNC_THREADS = 2;

    /**
     * Creates the task manager for asynchronous PPTX parsing, backed by an in-memory
     * repository and a fixed-size thread pool.
     *
     * @param properties PPTX settings supplying the requested pool size
     * @return the task manager, unless a bean named {@code pptxDocumentAsyncTaskManager}
     *         already exists
     */
    @Bean
    @ConditionalOnMissingBean(name = "pptxDocumentAsyncTaskManager")
    public DocumentAsyncTaskManager pptxDocumentAsyncTaskManager(PptxDocumentProperties properties) {
        Integer requested = properties.getAsyncThreads();
        int poolSize = (requested != null && requested > 0) ? requested : FALLBACK_ASYNC_THREADS;
        // NOTE(review): nothing in this class shuts the pool down; confirm that
        // DocumentAsyncTaskManager (or its bean destroy callback) releases it on context close.
        ExecutorService workerPool = Executors.newFixedThreadPool(poolSize);
        return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), workerPool);
    }

    /**
     * Creates the MinerU PPTX parse service from the shared MinerU settings.
     *
     * <p>The task-manager parameter is named after its bean on purpose: when several
     * {@link DocumentAsyncTaskManager} beans exist, the parameter name is what lets
     * Spring select the PPTX one.
     *
     * @param commonProperties shared MinerU settings
     * @param pptxDocumentAsyncTaskManager task manager handed to the service
     * @return the parse service, unless a {@link PptxDocumentParseService} bean already exists
     */
    @Bean
    @ConditionalOnMissingBean(PptxDocumentParseService.class)
    public PptxDocumentParseService pptxDocumentParseService(CommonMineruDocumentProperties commonProperties,
                                                             DocumentAsyncTaskManager pptxDocumentAsyncTaskManager) {
        MineruProperties mineruSettings = toMineruProperties(commonProperties);
        return new MineruPptxDocumentParseService(mineruSettings, pptxDocumentAsyncTaskManager);
    }

    /**
     * Copies every shared starter setting into a new core {@link MineruProperties}.
     *
     * @param source shared MinerU settings bound from configuration
     * @return a new, fully populated {@link MineruProperties}
     */
    private MineruProperties toMineruProperties(CommonMineruDocumentProperties source) {
        MineruProperties target = new MineruProperties();
        // Connection and timing settings.
        target.setBaseUrl(source.getBaseUrl());
        target.setConnectTimeoutMs(source.getConnectTimeoutMs());
        target.setReadTimeoutMs(source.getReadTimeoutMs());
        target.setWriteTimeoutMs(source.getWriteTimeoutMs());
        target.setPollIntervalMs(source.getPollIntervalMs());
        target.setResultTimeoutMs(source.getResultTimeoutMs());
        // Parsing defaults.
        target.setDefaultBackend(source.getDefaultBackend());
        target.setDefaultParseMethod(source.getDefaultParseMethod());
        target.setDefaultLangList(source.getDefaultLangList());
        target.setDefaultFormulaEnable(source.getDefaultFormulaEnable());
        target.setDefaultTableEnable(source.getDefaultTableEnable());
        return target;
    }
}

View File

@@ -0,0 +1,32 @@
package com.easyagents.spring.boot.document.pptx;
import org.springframework.boot.context.properties.ConfigurationProperties;
/**
 * PPTX document settings, bound from the {@code easy-agents.document.pptx}
 * configuration prefix.
 *
 * @author Codex
 * @since 2026-04-16
 */
@ConfigurationProperties(prefix = "easy-agents.document.pptx")
public class PptxDocumentProperties {

    /** Whether PPTX parsing is enabled; off by default. */
    private Boolean enabled = Boolean.FALSE;

    /** Requested number of threads for asynchronous PPTX work; defaults to 2. */
    private Integer asyncThreads = 2;

    public Boolean getEnabled() {
        return this.enabled;
    }

    public void setEnabled(Boolean enabled) {
        this.enabled = enabled;
    }

    public Integer getAsyncThreads() {
        return this.asyncThreads;
    }

    public void setAsyncThreads(Integer asyncThreads) {
        this.asyncThreads = asyncThreads;
    }
}

View File

@@ -0,0 +1,61 @@
package com.easyagents.spring.boot.document.xlsx;
import com.easyagents.document.core.async.DocumentAsyncTaskManager;
import com.easyagents.document.core.async.InMemoryDocumentAsyncTaskRepository;
import com.easyagents.document.core.mineru.MineruProperties;
import com.easyagents.document.xlsx.XlsxDocumentParseService;
import com.easyagents.document.xlsx.mineru.MineruXlsxDocumentParseService;
import com.easyagents.spring.boot.document.mineru.CommonMineruDocumentProperties;
import org.springframework.boot.autoconfigure.condition.ConditionalOnClass;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Auto-configuration for MinerU-backed XLSX parsing.
 *
 * <p>Applies only when {@link MineruXlsxDocumentParseService} is on the classpath and
 * {@code easy-agents.document.xlsx.enabled} is set to {@code true}.
 *
 * @author Codex
 * @since 2026-04-16
 */
@Configuration(proxyBeanMethods = false)
@ConditionalOnClass(MineruXlsxDocumentParseService.class)
@ConditionalOnProperty(prefix = "easy-agents.document.xlsx", name = "enabled", havingValue = "true")
@EnableConfigurationProperties({CommonMineruDocumentProperties.class, XlsxDocumentProperties.class})
public class MineruXlsxAutoConfiguration {

    /** Pool size used when {@code asyncThreads} is unset or not positive. */
    private static final int FALLBACK_ASYNC_THREADS = 2;

    /**
     * Creates the task manager for asynchronous XLSX parsing, backed by an in-memory
     * repository and a fixed-size thread pool.
     *
     * @param properties XLSX settings supplying the requested pool size
     * @return the task manager, unless a bean named {@code xlsxDocumentAsyncTaskManager}
     *         already exists
     */
    @Bean
    @ConditionalOnMissingBean(name = "xlsxDocumentAsyncTaskManager")
    public DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager(XlsxDocumentProperties properties) {
        Integer requested = properties.getAsyncThreads();
        int poolSize = (requested != null && requested > 0) ? requested : FALLBACK_ASYNC_THREADS;
        // NOTE(review): nothing in this class shuts the pool down; confirm that
        // DocumentAsyncTaskManager (or its bean destroy callback) releases it on context close.
        ExecutorService workerPool = Executors.newFixedThreadPool(poolSize);
        return new DocumentAsyncTaskManager(new InMemoryDocumentAsyncTaskRepository(), workerPool);
    }

    /**
     * Creates the MinerU XLSX parse service from the shared MinerU settings.
     *
     * <p>The task-manager parameter is named after its bean on purpose: when several
     * {@link DocumentAsyncTaskManager} beans exist, the parameter name is what lets
     * Spring select the XLSX one.
     *
     * @param commonProperties shared MinerU settings
     * @param xlsxDocumentAsyncTaskManager task manager handed to the service
     * @return the parse service, unless an {@link XlsxDocumentParseService} bean already exists
     */
    @Bean
    @ConditionalOnMissingBean(XlsxDocumentParseService.class)
    public XlsxDocumentParseService xlsxDocumentParseService(CommonMineruDocumentProperties commonProperties,
                                                             DocumentAsyncTaskManager xlsxDocumentAsyncTaskManager) {
        MineruProperties mineruSettings = toMineruProperties(commonProperties);
        return new MineruXlsxDocumentParseService(mineruSettings, xlsxDocumentAsyncTaskManager);
    }

    /**
     * Copies every shared starter setting into a new core {@link MineruProperties}.
     *
     * @param source shared MinerU settings bound from configuration
     * @return a new, fully populated {@link MineruProperties}
     */
    private MineruProperties toMineruProperties(CommonMineruDocumentProperties source) {
        MineruProperties target = new MineruProperties();
        // Connection and timing settings.
        target.setBaseUrl(source.getBaseUrl());
        target.setConnectTimeoutMs(source.getConnectTimeoutMs());
        target.setReadTimeoutMs(source.getReadTimeoutMs());
        target.setWriteTimeoutMs(source.getWriteTimeoutMs());
        target.setPollIntervalMs(source.getPollIntervalMs());
        target.setResultTimeoutMs(source.getResultTimeoutMs());
        // Parsing defaults.
        target.setDefaultBackend(source.getDefaultBackend());
        target.setDefaultParseMethod(source.getDefaultParseMethod());
        target.setDefaultLangList(source.getDefaultLangList());
        target.setDefaultFormulaEnable(source.getDefaultFormulaEnable());
        target.setDefaultTableEnable(source.getDefaultTableEnable());
        return target;
    }
}

View File

@@ -0,0 +1,32 @@
package com.easyagents.spring.boot.document.xlsx;
import org.springframework.boot.context.properties.ConfigurationProperties;
/**
 * XLSX document settings, bound from the {@code easy-agents.document.xlsx}
 * configuration prefix.
 *
 * @author Codex
 * @since 2026-04-16
 */
@ConfigurationProperties(prefix = "easy-agents.document.xlsx")
public class XlsxDocumentProperties {

    /** Whether XLSX parsing is enabled; off by default. */
    private Boolean enabled = Boolean.FALSE;

    /** Requested number of threads for asynchronous XLSX work; defaults to 2. */
    private Integer asyncThreads = 2;

    public Boolean getEnabled() {
        return this.enabled;
    }

    public void setEnabled(Boolean enabled) {
        this.enabled = enabled;
    }

    public Integer getAsyncThreads() {
        return this.asyncThreads;
    }

    public void setAsyncThreads(Integer asyncThreads) {
        this.asyncThreads = asyncThreads;
    }
}

View File

@@ -9,3 +9,5 @@ com.easyagents.spring.boot.store.elasticsearch.ElasticSearchAutoConfiguration
com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration
com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration
com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration
com.easyagents.spring.boot.document.pptx.MineruPptxAutoConfiguration
com.easyagents.spring.boot.document.xlsx.MineruXlsxAutoConfiguration

View File

@@ -2,8 +2,12 @@ package com.easyagents.spring.boot.autoconfigure;
import com.easyagents.document.core.DocumentParseService;
import com.easyagents.document.pdf.PdfDocumentParseService;
import com.easyagents.document.pptx.PptxDocumentParseService;
import com.easyagents.document.xlsx.XlsxDocumentParseService;
import com.easyagents.llm.ollama.OllamaChatModel;
import com.easyagents.spring.boot.document.pptx.MineruPptxAutoConfiguration;
import com.easyagents.spring.boot.document.pdf.mineru.MineruPdfAutoConfiguration;
import com.easyagents.spring.boot.document.xlsx.MineruXlsxAutoConfiguration;
import com.easyagents.spring.boot.llm.ollama.OllamaAutoConfiguration;
import com.easyagents.spring.boot.rag.ingestion.RagIngestionAutoConfiguration;
import com.easyagents.spring.boot.store.opensearch.OpenSearchAutoConfiguration;
@@ -18,7 +22,9 @@ public class StarterConditionalAutoConfigurationTest {
RagIngestionAutoConfiguration.class,
OllamaAutoConfiguration.class,
OpenSearchAutoConfiguration.class,
MineruPdfAutoConfiguration.class
MineruPdfAutoConfiguration.class,
MineruPptxAutoConfiguration.class,
MineruXlsxAutoConfiguration.class
);
@Test
@@ -51,4 +57,19 @@ public class StarterConditionalAutoConfigurationTest {
Assert.assertNotNull(context.getBean(DocumentParseService.class));
});
}
/**
 * With a MinerU base URL configured and both the PPTX and XLSX switches on, the context
 * must expose a {@link PptxDocumentParseService} and an {@link XlsxDocumentParseService},
 * and must not contain a bean named {@code documentParseService}.
 */
@Test
public void shouldCreatePptxAndXlsxBeansWhenEnabled() {
    String[] officeProperties = {
            "easy-agents.document.mineru.base-url=https://hub.wust.edu.cn/modelServer/mineru-api",
            "easy-agents.document.pptx.enabled=true",
            "easy-agents.document.xlsx.enabled=true"
    };
    contextRunner.withPropertyValues(officeProperties).run(context -> {
        Assert.assertNotNull(context.getBean(PptxDocumentParseService.class));
        Assert.assertNotNull(context.getBean(XlsxDocumentParseService.class));
        Assert.assertFalse(context.containsBean("documentParseService"));
    });
}
}