Merge pull request #9 from cpetersen/pptx

cpetersen · web-flow · commit e5fdd265212e · 2025-09-05T18:17:31.000-07:00
Working on pptx parsing
diff --git a/README.md b/README.md
@@ -121,21 +121,12 @@ excel_text = parser.parse_xlsx(excel_data)
 | PDF | .pdf | `parse_pdf` | Text extraction via MuPDF |
 | Word | .docx | `parse_docx` | Office Open XML format |
 | Excel | .xlsx, .xls | `parse_xlsx` | Both modern and legacy formats |
-| PowerPoint | .pptx | - | **Not yet supported** - see [implementation plan](docs/PPTX_PLAN.md) |
+| PowerPoint | .pptx | `parse_pptx` | Text extraction from slides and notes |
 | Images | .png, .jpg, .jpeg, .tiff, .bmp | `ocr_image` | OCR via bundled Tesseract |
 | JSON | .json | `parse_json` | Pretty-printed output |
 | XML/HTML | .xml, .html | `parse_xml` | Extracts text content |
 | Text | .txt, .csv, .md | `parse_text` | With encoding detection |
 
-### Note on PowerPoint Support
-
-While PPTX files are listed in our features, they are not yet fully implemented. Currently, PPTX files will return binary data instead of extracted text. We have a detailed [implementation plan](docs/PPTX_PLAN.md) for adding proper PPTX support in a future release. This will involve:
-- Adding ZIP archive handling capabilities
-- Implementing XML extraction from PowerPoint slide files
-- Following the same Office Open XML approach used for DOCX files
-
-For now, if you need to extract text from PowerPoint files, we recommend converting them to PDF first.
-
 ## Performance
 
 ParseKit is built with performance in mind:
diff --git a/ext/parsekit/Cargo.toml b/ext/parsekit/Cargo.toml
@@ -20,6 +20,7 @@ tesseract-rs = "0.1"  # Tesseract with optional bundling
 image = "0.25"  # Image processing library (match rusty-tesseract's version)
 calamine = "0.30"  # Excel parsing
 docx-rs = "0.4"  # Word document parsing
+zip = "2.1"  # ZIP archive handling for PPTX
 quick-xml = "0.36"  # XML parsing
 serde_json = "1.0"  # JSON parsing
 regex = "1.10"  # Text parsing
diff --git a/ext/parsekit/src/parser.rs b/ext/parsekit/src/parser.rs
@@ -78,6 +78,7 @@ impl Parser {
         match file_type.as_str() {
             "pdf" => self.parse_pdf(data),
             "docx" => self.parse_docx(data),
+            "pptx" => self.parse_pptx(data),
             "xlsx" | "xls" => self.parse_xlsx(data),
             "json" => self.parse_json(data),
             "xml" | "html" => self.parse_xml(data),
@@ -328,6 +329,143 @@ impl Parser {
         }
     }
 
+    /// Parse PPTX (PowerPoint) files - exposed to Ruby
+    ///
+    /// Treats `data` as a ZIP archive and extracts the text of every slide,
+    /// followed by the text of every notes slide.
+    ///
+    /// * Slides are entries named `ppt/slides/slide<N>.xml` and are emitted in
+    ///   ascending `N`.
+    /// * Notes are entries named `ppt/notesSlides/notesSlide*.xml`. They are
+    ///   emitted after all slides, each wrapped as `[Notes: ...]`, in ascending
+    ///   number; entries whose suffix is not a number come last, in archive order.
+    /// * Entries that cannot be opened or are not valid UTF-8 are skipped
+    ///   (best-effort extraction), as are entries that yield no text.
+    ///
+    /// Sections are joined with a blank line. The result is an empty string when
+    /// no text is found.
+    ///
+    /// # Errors
+    ///
+    /// Returns a Ruby `RuntimeError` when `data` cannot be opened as a ZIP archive.
+    fn parse_pptx(&self, data: Vec<u8>) -> Result<String, Error> {
+        use std::io::{Cursor, Read};
+        use zip::ZipArchive;
+
+        // Lists archive entries named `<prefix><stem>.xml` (ignoring anything whose
+        // path contains `_rels`) as `(stem parsed as a number if possible, archive index)`.
+        // Entries that cannot be opened are skipped.
+        fn matching_entries(
+            archive: &mut ZipArchive<Cursor<Vec<u8>>>,
+            prefix: &str,
+        ) -> Vec<(Option<usize>, usize)> {
+            (0..archive.len())
+                .filter_map(|index| {
+                    let file = archive.by_index(index).ok()?;
+                    let name = file.name();
+                    if name.contains("_rels") {
+                        return None;
+                    }
+                    let stem = name.strip_prefix(prefix)?.strip_suffix(".xml")?;
+                    Some((stem.parse::<usize>().ok(), index))
+                })
+                .collect()
+        }
+
+        let mut archive = ZipArchive::new(Cursor::new(data)).map_err(|e| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                format!("Failed to open PPTX as ZIP: {}", e),
+            )
+        })?;
+
+        // Slides must carry a numeric suffix; sort numerically so slide10 follows slide9.
+        let mut slides: Vec<(usize, usize)> = matching_entries(&mut archive, "ppt/slides/slide")
+            .into_iter()
+            .filter_map(|(number, index)| number.map(|n| (n, index)))
+            .collect();
+        slides.sort_by_key(|&(number, _)| number);
+
+        // Notes get the same numeric ordering as slides. The sort is stable, so
+        // notes without a numeric suffix keep their archive order at the end.
+        let mut notes = matching_entries(&mut archive, "ppt/notesSlides/notesSlide");
+        notes.sort_by_key(|&(number, _)| number.unwrap_or(usize::MAX));
+
+        // All slides first, then all notes; the flag marks notes entries.
+        let ordered = slides
+            .into_iter()
+            .map(|(_, index)| (index, false))
+            .chain(notes.into_iter().map(|(_, index)| (index, true)));
+
+        let mut all_text = Vec::new();
+        for (index, is_note) in ordered {
+            let mut contents = String::new();
+            // Best effort: an entry that cannot be opened or decoded is skipped.
+            let readable = match archive.by_index(index) {
+                Ok(mut file) => file.read_to_string(&mut contents).is_ok(),
+                Err(_) => false,
+            };
+            if !readable {
+                continue;
+            }
+
+            let text = self.extract_text_from_slide_xml(&contents);
+            if text.is_empty() {
+                continue;
+            }
+            all_text.push(if is_note {
+                format!("[Notes: {}]", text)
+            } else {
+                text
+            });
+        }
+
+        // Joining an empty list already yields an empty string.
+        Ok(all_text.join("\n\n"))
+    }
+    
+    /// Helper method to extract text from slide XML
+    ///
+    /// Walks `xml_content` event by event and collects the character data found
+    /// inside elements whose local name is `t` (for example `a:t`). Each piece is
+    /// XML-unescaped and trimmed; pieces that are empty after trimming, or that
+    /// fail to unescape, are dropped. The remaining pieces are joined with a
+    /// single space.
+    ///
+    /// Parsing stops at end of input or at the first XML error; in the error
+    /// case the text collected up to that point is returned.
+    fn extract_text_from_slide_xml(&self, xml_content: &str) -> String {
+        use quick_xml::events::Event;
+        use quick_xml::Reader;
+
+        // True when an element's namespace-stripped name is exactly `t`.
+        fn is_text_tag(local_name: &[u8]) -> bool {
+            local_name == b"t"
+        }
+
+        let mut reader = Reader::from_str(xml_content);
+
+        let mut text_parts: Vec<String> = Vec::new();
+        let mut buf = Vec::new();
+        let mut in_text_element = false;
+
+        loop {
+            match reader.read_event_into(&mut buf) {
+                Ok(Event::Start(ref e)) if is_text_tag(e.name().local_name().as_ref()) => {
+                    in_text_element = true;
+                }
+                Ok(Event::End(ref e)) if is_text_tag(e.name().local_name().as_ref()) => {
+                    in_text_element = false;
+                }
+                Ok(Event::Text(e)) if in_text_element => {
+                    if let Ok(text) = e.unescape() {
+                        let text = text.trim();
+                        if !text.is_empty() {
+                            text_parts.push(text.to_string());
+                        }
+                    }
+                }
+                // Stop on malformed XML instead of continuing to poll a reader
+                // that has already reported an error.
+                Ok(Event::Eof) | Err(_) => break,
+                _ => {}
+            }
+            buf.clear();
+        }
+
+        text_parts.join(" ")
+    }
+
     /// Parse Excel files - exposed to Ruby
     fn parse_xlsx(&self, data: Vec<u8>) -> Result<String, Error> {
         use calamine::{Reader, Xlsx};
@@ -486,6 +624,7 @@ impl Parser {
             "htm".to_string(), // HTML files (alternative extension)
             "md".to_string(),  // Markdown files
             "docx".to_string(),
+            "pptx".to_string(),
             "xlsx".to_string(),
             "xls".to_string(),
             "csv".to_string(),
@@ -543,6 +682,7 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
     // Individual parser methods exposed to Ruby
     class.define_method("parse_pdf", method!(Parser::parse_pdf, 1))?;
     class.define_method("parse_docx", method!(Parser::parse_docx, 1))?;
+    class.define_method("parse_pptx", method!(Parser::parse_pptx, 1))?;
     class.define_method("parse_xlsx", method!(Parser::parse_xlsx, 1))?;
     class.define_method("parse_json", method!(Parser::parse_json, 1))?;
     class.define_method("parse_xml", method!(Parser::parse_xml, 1))?;
diff --git a/lib/parsekit/parser.rb b/lib/parsekit/parser.rb
@@ -89,6 +89,7 @@ def detect_format(path)
       
       case ext.downcase
       when 'docx' then :docx
+      when 'pptx' then :pptx
       when 'xlsx', 'xls' then :xlsx
       when 'pdf' then :pdf
       when 'json' then :json
diff --git a/spec/parsekit/integration_spec.rb b/spec/parsekit/integration_spec.rb
@@ -102,10 +102,17 @@
 
         result = parser.parse_file(pptx_file)
         expect(result).to be_a(String)
-        # PPTX parsing appears to be broken - returns binary data
-        # This needs to be fixed in the parser implementation
-        # For now, we just check it returns a string
-        # TODO: Fix PPTX parsing and add proper content assertions
+        expect(result).not_to be_empty
+        
+        # Check for content we know is in the sample PPTX
+        expect(result).to include("Microsoft Powerpoint document")
+        expect(result).to include("Bullet points")
+        expect(result).to include("Bold text")
+        expect(result).to include("Italic text")
+        expect(result).to include("Unicode")
+        expect(result).to include("Table example")
+        expect(result).to include("Column 1")
+        expect(result).to include("Data A")
       end
     end
   end