@@ -78,6 +78,7 @@ impl Parser {
7878 match file_type. as_str ( ) {
7979 "pdf" => self . parse_pdf ( data) ,
8080 "docx" => self . parse_docx ( data) ,
81+ "pptx" => self . parse_pptx ( data) ,
8182 "xlsx" | "xls" => self . parse_xlsx ( data) ,
8283 "json" => self . parse_json ( data) ,
8384 "xml" | "html" => self . parse_xml ( data) ,
@@ -328,6 +329,143 @@ impl Parser {
328329 }
329330 }
330331
332+ /// Parse PPTX (PowerPoint) files - exposed to Ruby
333+ fn parse_pptx ( & self , data : Vec < u8 > ) -> Result < String , Error > {
334+ use std:: io:: { Cursor , Read } ;
335+ use zip:: ZipArchive ;
336+
337+ let cursor = Cursor :: new ( data) ;
338+ let mut archive = match ZipArchive :: new ( cursor) {
339+ Ok ( archive) => archive,
340+ Err ( e) => {
341+ return Err ( Error :: new (
342+ magnus:: exception:: runtime_error ( ) ,
343+ format ! ( "Failed to open PPTX as ZIP: {}" , e) ,
344+ ) )
345+ }
346+ } ;
347+
348+ let mut all_text = Vec :: new ( ) ;
349+ let mut slide_numbers = Vec :: new ( ) ;
350+
351+ // First, collect slide numbers and sort them
352+ for i in 0 ..archive. len ( ) {
353+ let file = match archive. by_index ( i) {
354+ Ok ( file) => file,
355+ Err ( _) => continue ,
356+ } ;
357+
358+ let name = file. name ( ) ;
359+ // Match slide XML files (e.g., ppt/slides/slide1.xml)
360+ if name. starts_with ( "ppt/slides/slide" ) && name. ends_with ( ".xml" ) && !name. contains ( "_rels" ) {
361+ // Extract slide number from filename
362+ if let Some ( num_str) = name
363+ . strip_prefix ( "ppt/slides/slide" )
364+ . and_then ( |s| s. strip_suffix ( ".xml" ) )
365+ {
366+ if let Ok ( num) = num_str. parse :: < usize > ( ) {
367+ slide_numbers. push ( ( num, i) ) ;
368+ }
369+ }
370+ }
371+ }
372+
373+ // Sort by slide number to maintain order
374+ slide_numbers. sort_by_key ( |& ( num, _) | num) ;
375+
376+ // Now process slides in order
377+ for ( _, index) in slide_numbers {
378+ let mut file = match archive. by_index ( index) {
379+ Ok ( file) => file,
380+ Err ( _) => continue ,
381+ } ;
382+
383+ let mut contents = String :: new ( ) ;
384+ if file. read_to_string ( & mut contents) . is_ok ( ) {
385+ // Extract text from slide XML
386+ let text = self . extract_text_from_slide_xml ( & contents) ;
387+ if !text. is_empty ( ) {
388+ all_text. push ( text) ;
389+ }
390+ }
391+ }
392+
393+ // Also extract notes if present
394+ for i in 0 ..archive. len ( ) {
395+ let mut file = match archive. by_index ( i) {
396+ Ok ( file) => file,
397+ Err ( _) => continue ,
398+ } ;
399+
400+ let name = file. name ( ) ;
401+ // Match notes slide XML files
402+ if name. starts_with ( "ppt/notesSlides/notesSlide" ) && name. ends_with ( ".xml" ) && !name. contains ( "_rels" ) {
403+ let mut contents = String :: new ( ) ;
404+ if file. read_to_string ( & mut contents) . is_ok ( ) {
405+ let text = self . extract_text_from_slide_xml ( & contents) ;
406+ if !text. is_empty ( ) {
407+ all_text. push ( format ! ( "[Notes: {}]" , text) ) ;
408+ }
409+ }
410+ }
411+ }
412+
413+ if all_text. is_empty ( ) {
414+ Ok ( "" . to_string ( ) )
415+ } else {
416+ Ok ( all_text. join ( "\n \n " ) )
417+ }
418+ }
419+
420+ /// Helper method to extract text from slide XML
421+ fn extract_text_from_slide_xml ( & self , xml_content : & str ) -> String {
422+ use quick_xml:: events:: Event ;
423+ use quick_xml:: Reader ;
424+
425+ let mut reader = Reader :: from_str ( xml_content) ;
426+
427+ let mut text_parts = Vec :: new ( ) ;
428+ let mut buf = Vec :: new ( ) ;
429+ let mut in_text_element = false ;
430+
431+ loop {
432+ match reader. read_event_into ( & mut buf) {
433+ Ok ( Event :: Start ( ref e) ) => {
434+ // Look for text elements (a:t or t)
435+ let name = e. name ( ) ;
436+ let local_name_bytes = name. local_name ( ) ;
437+ let local_name = std:: str:: from_utf8 ( local_name_bytes. as_ref ( ) ) . unwrap_or ( "" ) ;
438+ if local_name == "t" {
439+ in_text_element = true ;
440+ }
441+ }
442+ Ok ( Event :: Text ( e) ) => {
443+ if in_text_element {
444+ if let Ok ( text) = e. unescape ( ) {
445+ let text_str = text. trim ( ) ;
446+ if !text_str. is_empty ( ) {
447+ text_parts. push ( text_str. to_string ( ) ) ;
448+ }
449+ }
450+ }
451+ }
452+ Ok ( Event :: End ( ref e) ) => {
453+ let name = e. name ( ) ;
454+ let local_name_bytes = name. local_name ( ) ;
455+ let local_name = std:: str:: from_utf8 ( local_name_bytes. as_ref ( ) ) . unwrap_or ( "" ) ;
456+ if local_name == "t" {
457+ in_text_element = false ;
458+ }
459+ }
460+ Ok ( Event :: Eof ) => break ,
461+ _ => { }
462+ }
463+ buf. clear ( ) ;
464+ }
465+
466+ text_parts. join ( " " )
467+ }
468+
331469 /// Parse Excel files - exposed to Ruby
332470 fn parse_xlsx ( & self , data : Vec < u8 > ) -> Result < String , Error > {
333471 use calamine:: { Reader , Xlsx } ;
@@ -486,6 +624,7 @@ impl Parser {
486624 "htm" . to_string( ) , // HTML files (alternative extension)
487625 "md" . to_string( ) , // Markdown files
488626 "docx" . to_string( ) ,
627+ "pptx" . to_string( ) ,
489628 "xlsx" . to_string( ) ,
490629 "xls" . to_string( ) ,
491630 "csv" . to_string( ) ,
@@ -543,6 +682,7 @@ pub fn init(_ruby: &Ruby, module: RModule) -> Result<(), Error> {
543682 // Individual parser methods exposed to Ruby
544683 class. define_method ( "parse_pdf" , method ! ( Parser :: parse_pdf, 1 ) ) ?;
545684 class. define_method ( "parse_docx" , method ! ( Parser :: parse_docx, 1 ) ) ?;
685+ class. define_method ( "parse_pptx" , method ! ( Parser :: parse_pptx, 1 ) ) ?;
546686 class. define_method ( "parse_xlsx" , method ! ( Parser :: parse_xlsx, 1 ) ) ?;
547687 class. define_method ( "parse_json" , method ! ( Parser :: parse_json, 1 ) ) ?;
548688 class. define_method ( "parse_xml" , method ! ( Parser :: parse_xml, 1 ) ) ?;
0 commit comments