Skip to content

Commit e5fdd26

Browse files
authored
Merge pull request #9 from cpetersen/pptx
Working on pptx parsing
2 parents 2922769 + f220d18 commit e5fdd26

File tree

5 files changed

+154
-14
lines changed

5 files changed

+154
-14
lines changed

README.md

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -121,21 +121,12 @@ excel_text = parser.parse_xlsx(excel_data)
121121
| PDF | .pdf | `parse_pdf` | Text extraction via MuPDF |
122122
| Word | .docx | `parse_docx` | Office Open XML format |
123123
| Excel | .xlsx, .xls | `parse_xlsx` | Both modern and legacy formats |
124-
| PowerPoint | .pptx | - | **Not yet supported** - see [implementation plan](docs/PPTX_PLAN.md) |
124+
| PowerPoint | .pptx | `parse_pptx` | Text extraction from slides and notes |
125125
| Images | .png, .jpg, .jpeg, .tiff, .bmp | `ocr_image` | OCR via bundled Tesseract |
126126
| JSON | .json | `parse_json` | Pretty-printed output |
127127
| XML/HTML | .xml, .html | `parse_xml` | Extracts text content |
128128
| Text | .txt, .csv, .md | `parse_text` | With encoding detection |
129129

130-
### Note on PowerPoint Support
131-
132-
While PPTX files are listed in our features, they are not yet fully implemented. Currently, PPTX files will return binary data instead of extracted text. We have a detailed [implementation plan](docs/PPTX_PLAN.md) for adding proper PPTX support in a future release. This will involve:
133-
- Adding ZIP archive handling capabilities
134-
- Implementing XML extraction from PowerPoint slide files
135-
- Following the same Office Open XML approach used for DOCX files
136-
137-
For now, if you need to extract text from PowerPoint files, we recommend converting them to PDF first.
138-
139130
## Performance
140131

141132
ParseKit is built with performance in mind:

ext/parsekit/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ tesseract-rs = "0.1" # Tesseract with optional bundling
2020
image = "0.25" # Image processing library (match rusty-tesseract's version)
2121
calamine = "0.30" # Excel parsing
2222
docx-rs = "0.4" # Word document parsing
23+
zip = "2.1" # ZIP archive handling for PPTX
2324
quick-xml = "0.36" # XML parsing
2425
serde_json = "1.0" # JSON parsing
2526
regex = "1.10" # Text parsing

ext/parsekit/src/parser.rs

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ impl Parser {
7878
match file_type.as_str() {
7979
"pdf" => self.parse_pdf(data),
8080
"docx" => self.parse_docx(data),
81+
"pptx" => self.parse_pptx(data),
8182
"xlsx" | "xls" => self.parse_xlsx(data),
8283
"json" => self.parse_json(data),
8384
"xml" | "html" => self.parse_xml(data),
@@ -328,6 +329,143 @@ impl Parser {
328329
}
329330
}
330331

332+
/// Parse PPTX (PowerPoint) files - exposed to Ruby
333+
fn parse_pptx(&self, data: Vec<u8>) -> Result<String, Error> {
334+
use std::io::{Cursor, Read};
335+
use zip::ZipArchive;
336+
337+
let cursor = Cursor::new(data);
338+
let mut archive = match ZipArchive::new(cursor) {
339+
Ok(archive) => archive,
340+
Err(e) => {
341+
return Err(Error::new(
342+
magnus::exception::runtime_error(),
343+
format!("Failed to open PPTX as ZIP: {}", e),
344+
))
345+
}
346+
};
347+
348+
let mut all_text = Vec::new();
349+
let mut slide_numbers = Vec::new();
350+
351+
// First, collect slide numbers and sort them
352+
for i in 0..archive.len() {
353+
let file = match archive.by_index(i) {
354+
Ok(file) => file,
355+
Err(_) => continue,
356+
};
357+
358+
let name = file.name();
359+
// Match slide XML files (e.g., ppt/slides/slide1.xml)
360+
if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") && !name.contains("_rels") {
361+
// Extract slide number from filename
362+
if let Some(num_str) = name
363+
.strip_prefix("ppt/slides/slide")
364+
.and_then(|s| s.strip_suffix(".xml"))
365+
{
366+
if let Ok(num) = num_str.parse::<usize>() {
367+
slide_numbers.push((num, i));
368+
}
369+
}
370+
}
371+
}
372+
373+
// Sort by slide number to maintain order
374+
slide_numbers.sort_by_key(|&(num, _)| num);
375+
376+
// Now process slides in order
377+
for (_, index) in slide_numbers {
378+
let mut file = match archive.by_index(index) {
379+
Ok(file) => file,
380+
Err(_) => continue,
381+
};
382+
383+
let mut contents = String::new();
384+
if file.read_to_string(&mut contents).is_ok() {
385+
// Extract text from slide XML
386+
let text = self.extract_text_from_slide_xml(&contents);
387+
if !text.is_empty() {
388+
all_text.push(text);
389+
}
390+
}
391+
}
392+
393+
// Also extract notes if present
394+
for i in 0..archive.len() {
395+
let mut file = match archive.by_index(i) {
396+
Ok(file) => file,
397+
Err(_) => continue,
398+
};
399+
400+
let name = file.name();
401+
// Match notes slide XML files
402+
if name.starts_with("ppt/notesSlides/notesSlide") && name.ends_with(".xml") && !name.contains("_rels") {
403+
let mut contents = String::new();
404+
if file.read_to_string(&mut contents).is_ok() {
405+
let text = self.extract_text_from_slide_xml(&contents);
406+
if !text.is_empty() {
407+
all_text.push(format!("[Notes: {}]", text));
408+
}
409+
}
410+
}
411+
}
412+
413+
if all_text.is_empty() {
414+
Ok("".to_string())
415+
} else {
416+
Ok(all_text.join("\n\n"))
417+
}
418+
}
419+
420+
/// Helper method to extract text from slide XML
421+
fn extract_text_from_slide_xml(&self, xml_content: &str) -> String {
422+
use quick_xml::events::Event;
423+
use quick_xml::Reader;
424+
425+
let mut reader = Reader::from_str(xml_content);
426+
427+
let mut text_parts = Vec::new();
428+
let mut buf = Vec::new();
429+
let mut in_text_element = false;
430+
431+
loop {
432+
match reader.read_event_into(&mut buf) {
433+
Ok(Event::Start(ref e)) => {
434+
// Look for text elements (a:t or t)
435+
let name = e.name();
436+
let local_name_bytes = name.local_name();
437+
let local_name = std::str::from_utf8(local_name_bytes.as_ref()).unwrap_or("");
438+
if local_name == "t" {
439+
in_text_element = true;
440+
}
441+
}
442+
Ok(Event::Text(e)) => {
443+
if in_text_element {
444+
if let Ok(text) = e.unescape() {
445+
let text_str = text.trim();
446+
if !text_str.is_empty() {
447+
text_parts.push(text_str.to_string());
448+
}
449+
}
450+
}
451+
}
452+
Ok(Event::End(ref e)) => {
453+
let name = e.name();
454+
let local_name_bytes = name.local_name();
455+
let local_name = std::str::from_utf8(local_name_bytes.as_ref()).unwrap_or("");
456+
if local_name == "t" {
457+
in_text_element = false;
458+
}
459+
}
460+
Ok(Event::Eof) => break,
461+
_ => {}
462+
}
463+
buf.clear();
464+
}
465+
466+
text_parts.join(" ")
467+
}
468+
331469
/// Parse Excel files - exposed to Ruby
332470
fn parse_xlsx(&self, data: Vec<u8>) -> Result<String, Error> {
333471
use calamine::{Reader, Xlsx};
@@ -486,6 +624,7 @@ impl Parser {
486624
"htm".to_string(), // HTML files (alternative extension)
487625
"md".to_string(), // Markdown files
488626
"docx".to_string(),
627+
"pptx".to_string(),
489628
"xlsx".to_string(),
490629
"xls".to_string(),
491630
"csv".to_string(),
@@ -543,6 +682,7 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
543682
// Individual parser methods exposed to Ruby
544683
class.define_method("parse_pdf", method!(Parser::parse_pdf, 1))?;
545684
class.define_method("parse_docx", method!(Parser::parse_docx, 1))?;
685+
class.define_method("parse_pptx", method!(Parser::parse_pptx, 1))?;
546686
class.define_method("parse_xlsx", method!(Parser::parse_xlsx, 1))?;
547687
class.define_method("parse_json", method!(Parser::parse_json, 1))?;
548688
class.define_method("parse_xml", method!(Parser::parse_xml, 1))?;

lib/parsekit/parser.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ def detect_format(path)
8989

9090
case ext.downcase
9191
when 'docx' then :docx
92+
when 'pptx' then :pptx
9293
when 'xlsx', 'xls' then :xlsx
9394
when 'pdf' then :pdf
9495
when 'json' then :json

spec/parsekit/integration_spec.rb

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,17 @@
102102

103103
result = parser.parse_file(pptx_file)
104104
expect(result).to be_a(String)
105-
# PPTX parsing appears to be broken - returns binary data
106-
# This needs to be fixed in the parser implementation
107-
# For now, we just check it returns a string
108-
# TODO: Fix PPTX parsing and add proper content assertions
105+
expect(result).not_to be_empty
106+
107+
# Check for content we know is in the sample PPTX
108+
expect(result).to include("Microsoft Powerpoint document")
109+
expect(result).to include("Bullet points")
110+
expect(result).to include("Bold text")
111+
expect(result).to include("Italic text")
112+
expect(result).to include("Unicode")
113+
expect(result).to include("Table example")
114+
expect(result).to include("Column 1")
115+
expect(result).to include("Data A")
109116
end
110117
end
111118
end

0 commit comments

Comments
 (0)