Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
if [ "$NEED_MIRROR" == "1" ]; then \
sed -i 's|http://ports.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \
sed -i 's|http://archive.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \
sed -i 's|http://security.ubuntu.com|http://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \
fi; \
rm -f /etc/apt/apt.conf.d/docker-clean && \
echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache && \
Expand All @@ -60,7 +61,8 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
apt install -y libjemalloc-dev && \
apt install -y python3-pip pipx nginx unzip curl wget git vim less && \
apt install -y ghostscript
apt install -y ghostscript && \
apt install -y libreoffice

RUN if [ "$NEED_MIRROR" == "1" ]; then \
pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple && \
Expand Down
42 changes: 25 additions & 17 deletions api/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,24 +211,32 @@ def thumbnail_img(filename, blob):
return buffered.getvalue()

elif re.match(r".*\.(ppt|pptx)$", filename):
import aspose.pydrawing as drawing
import aspose.slides as slides

try:
with slides.Presentation(BytesIO(blob)) as presentation:
buffered = BytesIO()
scale = 0.03
img = None
for _ in range(10):
# https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
img = buffered.getvalue()
if len(img) >= 64000:
scale = scale / 2.0
buffered = BytesIO()
else:
break
return img
os.environ[
"LD_LIBRARY_PATH"] = "/usr/lib/libreoffice/program:" + os.environ.get(
"LD_LIBRARY_PATH", "")
with tempfile.NamedTemporaryFile(suffix='.ppt') as tmp_ppt:
tmp_ppt.write(blob)
tmp_ppt_path = tmp_ppt.name

with tempfile.TemporaryDirectory() as tmp_dir:
cmd = [
"libreoffice",
"--headless",
"--convert-to", "pdf",
"--outdir", os.path.dirname(tmp_dir),
tmp_ppt_path
]
subprocess.run(cmd, check=True, capture_output=True,
text=True)
pdf_name = os.path.splitext(tmp_ppt_path)[0] + '.pdf'
pdf_path = os.path.join(tmp_dir, pdf_name)
with pdfplumber.open(pdf_path) as pdf:
if pdf.pages:
buffered = BytesIO()
pdf.pages[0].to_image(resolution=int(72)).save(buffered)
img = buffered.getvalue()
return img
except Exception:
pass
return None
Expand Down
34 changes: 30 additions & 4 deletions deepdoc/parser/ppt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
import logging
from io import BytesIO
from pptx import Presentation

import tempfile
import subprocess
import os

class RAGFlowPptParser:
def __init__(self):
Expand Down Expand Up @@ -75,9 +77,33 @@ def __extract(self, shape):
return ""

def __call__(self, fnm, from_page, to_page, callback=None):
ppt = Presentation(fnm) if isinstance(
fnm, str) else Presentation(
BytesIO(fnm))
os.environ[
"LD_LIBRARY_PATH"] = "/usr/lib/libreoffice/program:" + os.environ.get(
"LD_LIBRARY_PATH", "")
if (isinstance(fnm, str) and fnm.lower().endswith('.ppt')) or (isinstance(fnm, bytes) and fnm[:8] == b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):
with tempfile.NamedTemporaryFile(suffix='.ppt') as tmp_ppt:
if isinstance(fnm, bytes):
tmp_ppt.write(fnm)
tmp_ppt.flush()
ppt_path = tmp_ppt.name
else:
ppt_path = fnm
with tempfile.TemporaryDirectory() as tmp_dir:
cmd = [
"libreoffice",
"--headless",
"--convert-to", "pptx",
"--outdir", tmp_dir,
ppt_path
]
subprocess.run(cmd, check=True, capture_output=True)
pptx_name = os.path.splitext(os.path.basename(tmp_ppt.name))[0] + '.pptx'
pptx_path = os.path.join(tmp_dir, pptx_name)
ppt = Presentation(pptx_path)
else:
ppt = Presentation(fnm) if isinstance(fnm, str) else Presentation(
BytesIO(fnm))

txts = []
self.total_page = len(ppt.slides)
for i, slide in enumerate(ppt.slides):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies = [
"azure-storage-file-datalake==12.16.0",
"anthropic==0.34.1",
"arxiv==2.1.3",
"aspose-slides>=24.9.0,<25.0.0; platform_machine == 'x86_64' or (sys_platform == 'darwin' and platform_machine == 'arm64')",
"libreoffce>=7.3.7.0",
"beartype>=0.18.5,<0.19.0",
"bio==1.7.1",
"blinker==1.7.0",
Expand Down
38 changes: 21 additions & 17 deletions rag/app/presentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,36 +18,40 @@
import re
from io import BytesIO

from PIL import Image

from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.parser.pdf_parser import VisionParser
from rag.nlp import tokenize, is_english
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read

import tempfile
import subprocess
import os
import pdfplumber

class Ppt(PptParser):
def __call__(self, fnm, from_page, to_page, callback=None):
txts = super().__call__(fnm, from_page, to_page)

callback(0.5, "Text extraction finished.")
import aspose.slides as slides
import aspose.pydrawing as drawing
imgs = []
with slides.Presentation(BytesIO(fnm)) as presentation:
for i, slide in enumerate(presentation.slides[from_page: to_page]):
try:
with BytesIO() as buffered:
slide.get_thumbnail(
0.1, 0.1).save(
buffered, drawing.imaging.ImageFormat.jpeg)
buffered.seek(0)
imgs.append(Image.open(buffered).copy())
except RuntimeError as e:
raise RuntimeError(f'ppt parse error at page {i+1}, original error: {str(e)}') from e
with tempfile.NamedTemporaryFile(suffix='.pptx') as tmp_input:
tmp_input.write(
fnm if isinstance(fnm, bytes) else open(fnm, 'rb').read())
input_path = tmp_input.name
with tempfile.TemporaryDirectory() as tmp_dir:
cmd = [
"libreoffice",
"--headless",
"--convert-to", "pdf",
"--outdir", tmp_dir,
input_path
]
subprocess.run(cmd, check=True, capture_output=True)
pdf_name = os.path.splitext(os.path.basename(input_path))[0] + '.pdf'
pdf_path = os.path.join(tmp_dir, pdf_name)
with pdfplumber.open(pdf_path) as pdf:
imgs = [p.to_image(resolution=72).annotated for i, p in enumerate(pdf.pages[from_page:to_page])]
assert len(imgs) == len(
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
callback(0.9, "Image extraction finished")
Expand Down