From e7a52a228ce8aeae9da9b3cc4ca22fe0afb749bb Mon Sep 17 00:00:00 2001 From: PIG AI Date: Wed, 23 Jul 2025 18:28:00 +0800 Subject: [PATCH 1/2] =?UTF-8?q?fix(parser):=20=20=E5=9C=A8UnstructuredPars?= =?UTF-8?q?er=E4=B8=AD=E4=BD=BF=E7=94=A8isWordExtension=E6=9B=BF=E4=BB=A3?= =?UTF-8?q?=E5=8E=9F=E6=9C=89=E6=A3=80=E6=B5=8B=E9=80=BB=E8=BE=91=EF=BC=8C?= =?UTF-8?q?=E5=8E=9F=E6=9C=89=E6=A3=80=E6=B5=8B=E9=80=BB=E8=BE=91=20filena?= =?UTF-8?q?me=20filepath=20=E4=BD=BF=E7=94=A8=E6=B7=B7=E4=B9=B1=EF=BC=8C?= =?UTF-8?q?=E4=BC=A0=E9=80=92stream=20=E7=9A=84=E6=96=B9=E5=BC=8F=E5=A7=8B?= =?UTF-8?q?=E7=BB=88=E7=B1=BB=E5=9E=8B=E4=B8=8D=E5=AF=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../unstructured/UnstructuredParser.java | 2 +- .../unstructured/util/UnstructuredUtils.java | 14 +++++++++++-- .../infra/unstructured/WordParserTest.java | 21 +++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java b/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java index e4cf1cd..9fa2a7a 100644 --- a/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java +++ b/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java @@ -160,7 +160,7 @@ public class UnstructuredParser { */ public static DocumentResult toStructuredResult(InputStream inputStream, String fileName) { // 根据文件名检测格式 - if (isWordDocument(fileName)) { + if (isWordExtension(fileName)) { return UnstructuredWord.toStructuredResult(inputStream, fileName); } // TODO: 添加其他格式的支持 diff --git a/src/main/java/com/torchv/infra/unstructured/util/UnstructuredUtils.java b/src/main/java/com/torchv/infra/unstructured/util/UnstructuredUtils.java index 722cd68..ef269d5 100644 --- a/src/main/java/com/torchv/infra/unstructured/util/UnstructuredUtils.java +++ b/src/main/java/com/torchv/infra/unstructured/util/UnstructuredUtils.java @@ -84,8 +84,18 @@ public class UnstructuredUtils { return false; } - String fileName = file.getName().toLowerCase(); - return fileName.endsWith(".docx") || fileName.endsWith(".doc"); + return isWordExtension(file.getName()); + } + + /** + * 判断文件是否为Word文档名扩展名 + * + * @param fileName 文件名 + * @return 如果是docx / doc 返回true,否则返回false + */ + public static boolean isWordExtension(String fileName) { + String extension = getFileExtension(fileName.toLowerCase()); + return "docx".equals(extension) || "doc".equals(extension); } /** diff --git a/src/test/java/com/torchv/infra/unstructured/WordParserTest.java b/src/test/java/com/torchv/infra/unstructured/WordParserTest.java index b585027..61c643c 100644 --- a/src/test/java/com/torchv/infra/unstructured/WordParserTest.java +++ b/src/test/java/com/torchv/infra/unstructured/WordParserTest.java @@ -17,12 +17,17 @@ package com.torchv.infra.unstructured; +import cn.hutool.core.io.FileUtil; import com.torchv.infra.unstructured.core.DocumentResult; import lombok.extern.slf4j.Slf4j; import org.junit.Test; +import java.io.BufferedInputStream; +import java.io.File; import java.util.List; +import static org.junit.Assert.*; + /** * @author xiaoymin@foxmail.com * 2025/7/19 23:11 @@ -86,4 +91,20 @@ public class WordParserTest { // 获取结构化结果,提供更多控制 } + + /** + * 测试通过输入流转换为结构化结果 + * + * @throws Exception 测试过程中可能抛出的异常 + */ + @Test + public void test_structured_result_by_stream(){ + String filePath = "src/test/resources/docs/test.docx"; + BufferedInputStream inputStream = FileUtil.getInputStream(new File(filePath)); + String name = FileUtil.getName(filePath); + DocumentResult structuredResult = UnstructuredParser.toStructuredResult(inputStream, name); + log.info(structuredResult.getContent()); + assertNotNull(structuredResult); + } + } -- Gitee From 12a66ecfe780daf778d98f86650baf041897a8e4 Mon Sep 17 00:00:00 2001 From: PIG AI Date: Wed, 23 Jul 2025 18:33:53 +0800 Subject: [PATCH 2/2] =?UTF-8?q?feat(parser):=20=E6=96=B0=E5=A2=9E=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E9=80=9A=E8=BF=87InputStream=E5=A4=84=E7=90=86Word2MD?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../infra/unstructured/UnstructuredParser.java | 16 ++++++++++++++++ .../infra/unstructured/WordParserTest.java | 12 ++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java b/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java index 9fa2a7a..53d453c 100644 --- a/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java +++ b/src/main/java/com/torchv/infra/unstructured/UnstructuredParser.java @@ -96,6 +96,22 @@ public class UnstructuredParser { } return toMarkdown(file.getAbsolutePath()); } + + /** + * 将输入流转换为Markdown格式的字符串 + * + * @param inputStream 输入流 + * @param fileName 文件名,用于检测格式 + * @return 转换后的Markdown字符串 + * @throws UnsupportedOperationException 当文件格式不支持时抛出异常 + */ + public static String toMarkdown(InputStream inputStream, String fileName) { + // 根据文件名检测格式 + if (!isWordExtension(fileName)) { + throw new UnsupportedOperationException("暂不支持的文件格式: " + fileName); + } + return UnstructuredWord.toMarkdown(inputStream, fileName); + } /** * 解析文档为Markdown格式(保留表格的HTML结构) diff --git a/src/test/java/com/torchv/infra/unstructured/WordParserTest.java b/src/test/java/com/torchv/infra/unstructured/WordParserTest.java index 61c643c..6f9348a 100644 --- a/src/test/java/com/torchv/infra/unstructured/WordParserTest.java +++ b/src/test/java/com/torchv/infra/unstructured/WordParserTest.java @@ -92,6 +92,18 @@ public class WordParserTest { } + /** + * 测试解析doc文件为markdown格式 + */ + @Test + public void test_parse_4() { + String filePath = "src/test/resources/docs/test.doc"; + String name = FileUtil.getName(filePath); + BufferedInputStream inputStream = FileUtil.getInputStream(new File(filePath)); + String content = UnstructuredParser.toMarkdown(inputStream,name); + log.info(content); + } + /** * 测试通过输入流转换为结构化结果 * -- Gitee