infiniflow · huang-aoqin · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -46,6 +46,7 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
     if [ "$NEED_MIRROR" == "1" ]; then \
         sed -i 's|http://ports.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \
         sed -i 's|http://archive.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \
+        sed -i 's|http://security.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \
     fi; \
     rm -f /etc/apt/apt.conf.d/docker-clean && \
     echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache && \
@@ -60,7 +61,8 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
     apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
     apt install -y libjemalloc-dev && \
     apt install -y python3-pip pipx nginx unzip curl wget git vim less && \
-    apt install -y ghostscript
+    apt install -y ghostscript && \
+    apt install -y libreoffice
 
 RUN if [ "$NEED_MIRROR" == "1" ]; then \
         pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple && \

diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py
@@ -211,24 +211,46 @@ def thumbnail_img(filename, blob):
         return buffered.getvalue()
 
     elif re.match(r".*\.(ppt|pptx)$", filename):
-        import aspose.pydrawing as drawing
-        import aspose.slides as slides
-
         try:
-            with slides.Presentation(BytesIO(blob)) as presentation:
-                buffered = BytesIO()
-                scale = 0.03
-                img = None
-                for _ in range(10):
-                    # https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
-                    presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
-                    img = buffered.getvalue()
-                    if len(img) >= 64000:
-                        scale = scale / 2.0
-                        buffered = BytesIO()
-                    else:
-                        break
-                return img
+            # Put LibreOffice's program directory on LD_LIBRARY_PATH for the child
+            # process, adding it once instead of prepending again on every call.
+            lo_dir = "/usr/lib/libreoffice/program"
+            ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+            if lo_dir not in ld_path.split(":"):
+                os.environ["LD_LIBRARY_PATH"] = lo_dir + (":" + ld_path if ld_path else "")
+            # Keep the real extension (.ppt or .pptx) on the temporary input file.
+            suffix = os.path.splitext(filename)[1]
+            with tempfile.NamedTemporaryFile(suffix=suffix) as tmp_ppt, \
+                    tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_ppt.write(blob)
+                # Flush so LibreOffice reads the whole file, not a partial buffer.
+                tmp_ppt.flush()
+                cmd = [
+                    "libreoffice",
+                    "--headless",
+                    "--convert-to", "pdf",
+                    "--outdir", tmp_dir,
+                    tmp_ppt.name,
+                ]
+                subprocess.run(cmd, check=True, capture_output=True)
+                # The PDF is named after the input's base name; writing it into
+                # tmp_dir means it is deleted together with the directory.
+                pdf_name = os.path.splitext(os.path.basename(tmp_ppt.name))[0] + ".pdf"
+                pdf_path = os.path.join(tmp_dir, pdf_name)
+                with pdfplumber.open(pdf_path) as pdf:
+                    if pdf.pages:
+                        # Halve the resolution until the PNG is under the 64000-byte
+                        # cap that the previous implementation enforced.
+                        resolution = 72
+                        img = None
+                        for _ in range(10):
+                            buffered = BytesIO()
+                            pdf.pages[0].to_image(resolution=resolution).save(buffered)
+                            img = buffered.getvalue()
+                            if len(img) < 64000 or resolution <= 2:
+                                break
+                            resolution //= 2
+                        return img
         except Exception:
             pass
     return None

diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py
@@ -17,7 +17,9 @@
 import logging
 from io import BytesIO
 from pptx import Presentation
-
+import tempfile
+import subprocess
+import os
 
 class RAGFlowPptParser:
     def __init__(self):
@@ -75,9 +77,44 @@ def __extract(self, shape):
             return ""
 
     def __call__(self, fnm, from_page, to_page, callback=None):
-        ppt = Presentation(fnm) if isinstance(
-            fnm, str) else Presentation(
-            BytesIO(fnm))
+        # Put LibreOffice's program directory on LD_LIBRARY_PATH for child
+        # processes, adding it once instead of prepending again on every call.
+        lo_dir = "/usr/lib/libreoffice/program"
+        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
+        if lo_dir not in ld_path.split(":"):
+            os.environ["LD_LIBRARY_PATH"] = lo_dir + (":" + ld_path if ld_path else "")
+        # Legacy .ppt input (detected by extension for paths, by the OLE2 magic
+        # number for bytes) is converted to .pptx with headless LibreOffice
+        # before being handed to python-pptx.
+        is_legacy = (
+            (isinstance(fnm, str) and fnm.lower().endswith('.ppt'))
+            or (isinstance(fnm, bytes) and fnm[:8] == b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1')
+        )
+        if is_legacy:
+            with tempfile.NamedTemporaryFile(suffix='.ppt') as tmp_ppt, \
+                    tempfile.TemporaryDirectory() as tmp_dir:
+                if isinstance(fnm, bytes):
+                    tmp_ppt.write(fnm)
+                    tmp_ppt.flush()
+                    ppt_path = tmp_ppt.name
+                else:
+                    ppt_path = fnm
+                cmd = [
+                    "libreoffice",
+                    "--headless",
+                    "--convert-to", "pptx",
+                    "--outdir", tmp_dir,
+                    ppt_path,
+                ]
+                subprocess.run(cmd, check=True, capture_output=True)
+                # The output is named after the file actually converted, which is
+                # the caller's path (not the temp file) when fnm is a str.
+                pptx_name = os.path.splitext(os.path.basename(ppt_path))[0] + '.pptx'
+                ppt = Presentation(os.path.join(tmp_dir, pptx_name))
+        else:
+            ppt = Presentation(fnm) if isinstance(fnm, str) else Presentation(
+                BytesIO(fnm))
+
         txts = []
         self.total_page = len(ppt.slides)
         for i, slide in enumerate(ppt.slides):

diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,6 @@ dependencies = [
     "azure-storage-file-datalake==12.16.0",
     "anthropic==0.34.1",
     "arxiv==2.1.3",
-    "aspose-slides>=24.9.0,<25.0.0; platform_machine == 'x86_64' or (sys_platform == 'darwin' and platform_machine == 'arm64')",
     "beartype>=0.18.5,<0.19.0",
     "bio==1.7.1",
     "blinker==1.7.0",

diff --git a/rag/app/presentation.py b/rag/app/presentation.py
@@ -18,36 +18,53 @@
 import re
 from io import BytesIO
 
-from PIL import Image
-
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from deepdoc.parser.pdf_parser import VisionParser
 from rag.nlp import tokenize, is_english
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read
-
+import tempfile
+import subprocess
+import os
+import pdfplumber
 
 class Ppt(PptParser):
     def __call__(self, fnm, from_page, to_page, callback=None):
         txts = super().__call__(fnm, from_page, to_page)
 
         callback(0.5, "Text extraction finished.")
-        import aspose.slides as slides
-        import aspose.pydrawing as drawing
-        imgs = []
-        with slides.Presentation(BytesIO(fnm)) as presentation:
-            for i, slide in enumerate(presentation.slides[from_page: to_page]):
-                try:
-                    with BytesIO() as buffered:
-                        slide.get_thumbnail(
-                            0.1, 0.1).save(
-                            buffered, drawing.imaging.ImageFormat.jpeg)
-                        buffered.seek(0)
-                        imgs.append(Image.open(buffered).copy())
-                except RuntimeError as e:
-                    raise RuntimeError(f'ppt parse error at page {i+1}, original error: {str(e)}') from e
+        # Render the slides by converting the deck to PDF with headless
+        # LibreOffice and rasterising the requested pages with pdfplumber.
+        # LD_LIBRARY_PATH for LibreOffice is exported by the parent __call__ above.
+        if isinstance(fnm, bytes):
+            data = fnm
+        else:
+            with open(fnm, 'rb') as src:
+                data = src.read()
+        with tempfile.NamedTemporaryFile(suffix='.pptx') as tmp_input, \
+                tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_input.write(data)
+            # Flush so LibreOffice reads the whole file, not a partial buffer.
+            tmp_input.flush()
+            input_path = tmp_input.name
+            cmd = [
+                "libreoffice",
+                "--headless",
+                "--convert-to", "pdf",
+                "--outdir", tmp_dir,
+                input_path,
+            ]
+            try:
+                subprocess.run(cmd, check=True, capture_output=True)
+            except subprocess.CalledProcessError as e:
+                detail = e.stderr.decode(errors="replace") if e.stderr else str(e)
+                raise RuntimeError(f'ppt to pdf conversion failed, original error: {detail}') from e
+            pdf_name = os.path.splitext(os.path.basename(input_path))[0] + '.pdf'
+            pdf_path = os.path.join(tmp_dir, pdf_name)
+            with pdfplumber.open(pdf_path) as pdf:
+                imgs = [p.to_image(resolution=72).annotated for p in pdf.pages[from_page:to_page]]
         assert len(imgs) == len(
             txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback(0.9, "Image extraction finished")