diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..484fd26c8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,22 @@ +# Alef-generated binding and e2e files — collapsed in GitHub PR diffs +packages/csharp/** linguist-generated=true +packages/dart/** linguist-generated=true +packages/elixir/** linguist-generated=true +packages/gleam/** linguist-generated=true +packages/go/** linguist-generated=true +packages/java/** linguist-generated=true +packages/kotlin/** linguist-generated=true +packages/php/** linguist-generated=true +packages/python/** linguist-generated=true +packages/r/** linguist-generated=true +packages/ruby/** linguist-generated=true +packages/swift/** linguist-generated=true +packages/typescript/** linguist-generated=true +packages/wasm/** linguist-generated=true +packages/zig/** linguist-generated=true +crates/kreuzberg-ffi/** linguist-generated=true +crates/kreuzberg-node/** linguist-generated=true +crates/kreuzberg-php/** linguist-generated=true +crates/kreuzberg-py/** linguist-generated=true +crates/kreuzberg-wasm/** linguist-generated=true +e2e/** linguist-generated=true diff --git a/Cargo.lock b/Cargo.lock index f8b54ce0e..ce7418007 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -224,6 +224,9 @@ name = "arbitrary" version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] [[package]] name = "arg_enum_proc_macro" @@ -780,6 +783,15 @@ dependencies = [ "serde", ] +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + [[package]] name = "bzip2" version = "0.6.1" @@ -789,6 +801,16 @@ dependencies = [ "libbz2-rs-sys", ] +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "calamine" version = "0.34.0" @@ -931,6 +953,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "cfb" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a4f8e55be323b378facfcf1f06aa97f6ec17cf4ac84fb17325093aaf62da41" +dependencies = [ + "byteorder", + "fnv", + "uuid", +] + [[package]] name = "cfb" version = "0.14.0" @@ -1734,6 +1767,17 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "derive_builder" version = "0.20.2" @@ -3632,6 +3676,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "tree-sitter-language-pack", + "unhwp", "unicode-normalization", "ureq 3.3.0", "urlencoding", @@ -3998,6 +4043,16 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "lzma-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +dependencies = [ + "byteorder", + "crc", +] + [[package]] name = "lzma-rust2" version = "0.15.7" @@ -4014,6 +4069,17 @@ version = "0.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47bb1e988e6fb779cf720ad431242d3f03167c1b3f2b1aae7f1a94b2495b36ae" +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "lzw" version = "0.10.0" @@ -5133,6 +5199,17 @@ dependencies = [ "unicase", ] +[[package]] +name = "pulldown-cmark" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f86ba2052aebccc42cbbb3ed234b8b13ce76f75c3551a303cb2bcffcff12bb14" +dependencies = [ + "bitflags 2.11.1", + "memchr", + "unicase", +] + [[package]] name = "pulldown-cmark" version = "0.13.3" @@ -5152,6 +5229,15 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae" +[[package]] +name = "pulldown-cmark-to-cmark" +version = "18.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e02b63adcb49f2eb675b1694b413b3e9fedbf549dfe2cc98727ad97a0c30650" +dependencies = [ + "pulldown-cmark 0.12.2", +] + [[package]] name = "pxfm" version = "0.1.29" @@ -6163,7 +6249,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29225600349ef74beda5a9fffb36ac660a24613c0bde9315d0c49be1d51e9c24" dependencies = [ "aes 0.8.4", - "bzip2", + "bzip2 0.6.1", "cbc 0.1.2", "crc32fast", "getrandom 0.4.2", @@ -7279,6 +7365,26 @@ version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" +[[package]] +name = "unhwp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7e7d0d14b2b326473082e9c3c8ea8c291b0b998b49ba395de374db9dcaa1b8a" +dependencies = [ + "cfb 0.10.0", + "flate2", + "pulldown-cmark 0.12.2", + "pulldown-cmark-to-cmark", + "quick-xml 0.37.5", + "rayon", + "regex", + "serde", + "serde_json", + "thiserror", + "unicode-normalization", + "zip 2.4.2", +] + [[package]] name = "unicase" version = "2.9.0" @@ -8308,6 +8414,15 @@ dependencies = [ "rustix", ] +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "y4m" version = "0.8.0" @@ -8433,6 +8548,36 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "zip" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +dependencies = [ + "aes 0.8.4", + "arbitrary", + "bzip2 0.5.2", + "constant_time_eq 0.3.1", + "crc32fast", + "crossbeam-utils", + "deflate64", + "displaydoc", + "flate2", + "getrandom 0.3.4", + "hmac", + "indexmap", + "lzma-rs", + "memchr", + "pbkdf2", + "sha1", + "thiserror", + "time", + "xz2", + "zeroize", + "zopfli", + "zstd", +] + [[package]] name = "zip" version = "7.2.0" @@ -8440,7 +8585,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0" dependencies = [ "aes 0.8.4", - "bzip2", + "bzip2 0.6.1", "constant_time_eq 0.3.1", "crc32fast", "deflate64", diff --git a/alef.toml b/alef.toml index addc8dcb0..cb4b3b6fb 100644 --- a/alef.toml +++ b/alef.toml @@ -560,6 +560,7 @@ types = [ "GzipExtractor", "HtmlExtractor", "HwpExtractor", + "HwpxExtractor", "ImageExtractor", "JatsExtractor", "JupyterExtractor", diff --git a/crates/kreuzberg/Cargo.toml b/crates/kreuzberg/Cargo.toml index 68c99642c..89dae2252 100644 --- a/crates/kreuzberg/Cargo.toml +++ b/crates/kreuzberg/Cargo.toml @@ -67,6 +67,7 @@ office = [ "html", ] hwp = ["dep:cfb", "dep:flate2"] +hwpx = ["dep:unhwp"] iwork = ["dep:zip", "dep:snap"] email = ["dep:mail-parser", "dep:cfb", "dep:outlook-pst", "dep:tempfile", "dep:chrono"] html = ["dep:html-to-markdown-rs", "dep:v_htmlescape"] @@ -208,6 +209,7 @@ formats = [ "excel", "office", "hwp", + "hwpx", "iwork", "email", "html", @@ -369,6 +371,7 @@ tower-http = { version = "0.6", features = [ ], optional = true } tracing = { workspace = true } tracing-opentelemetry = { version = "0.32", optional = true } +unhwp = { version = "0.2.4", default-features = false, features = ["hwpx"], optional = true } unicode-normalization = { version = "0.1.25", optional = true } urlencoding = "2" utoipa = { version = "5.4", features = ["axum_extras"], optional = true } diff --git a/crates/kreuzberg/src/core/mime.rs b/crates/kreuzberg/src/core/mime.rs index 069a9c647..9d27a8bab 100644 --- a/crates/kreuzberg/src/core/mime.rs +++ b/crates/kreuzberg/src/core/mime.rs @@ -41,6 +41,7 @@ pub(crate) const SOURCE_CODE_MIME_TYPE: &str = "text/x-source-code"; pub(crate) const EXCEL_MIME_TYPE: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; +pub(crate) const HWPX_MIME_TYPE: &str = "application/haansofthwpx"; pub(crate) const IWORK_PAGES_MIME_TYPE: &str = "application/x-iwork-pages-sffpages"; pub(crate) const IWORK_NUMBERS_MIME_TYPE: &str = "application/x-iwork-numbers-sffnumbers"; pub(crate) const IWORK_KEYNOTE_MIME_TYPE: &str = "application/x-iwork-keynote-sffkey"; @@ -783,6 +784,12 @@ fn detect_office_format_from_zip(content: &[u8]) -> Option<&'static str> { const NUMBERS_MARKER: &[u8] = b"Index/CalculationEngine.iwa"; const KEYNOTE_MARKER: &[u8] = b"Index/Presentation.iwa"; + // HWPX: ZIP of OWPML XML, contains Contents/content.hpf manifest + const HWPX_MARKER: &[u8] = b"Contents/content.hpf"; + if contains_subsequence(content, HWPX_MARKER) { + return Some(HWPX_MIME_TYPE); + } + // Check iWork first (before generic Office) since iWork ZIPs also contain XML if contains_subsequence(content, PAGES_MARKER) { return Some(IWORK_PAGES_MIME_TYPE); diff --git a/crates/kreuzberg/src/extractors/hwp.rs b/crates/kreuzberg/src/extractors/hwp.rs index 0a4558b08..37f43b9c2 100644 --- a/crates/kreuzberg/src/extractors/hwp.rs +++ b/crates/kreuzberg/src/extractors/hwp.rs @@ -88,7 +88,7 @@ impl DocumentExtractor for HwpExtractor { } fn supported_mime_types(&self) -> &[&str] { - &["application/x-hwp", "application/haansofthwpx"] + &["application/x-hwp"] } fn priority(&self) -> i32 { @@ -106,10 +106,7 @@ mod tests { assert_eq!(extractor.name(), "hwp-extractor"); assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION")); assert_eq!(extractor.priority(), 50); - assert_eq!( - extractor.supported_mime_types(), - &["application/x-hwp", "application/haansofthwpx"] - ); + assert_eq!(extractor.supported_mime_types(), &["application/x-hwp"]); } #[test] @@ -118,4 +115,20 @@ mod tests { assert!(extractor.initialize().is_ok()); assert!(extractor.shutdown().is_ok()); } + + #[test] + fn test_hwpx_mime_not_routed_to_hwp_extractor() { + use crate::KreuzbergError; + use crate::plugins::registry::DocumentExtractorRegistry; + use std::sync::Arc; + + let mut registry = DocumentExtractorRegistry::new(); + registry.register(Arc::new(HwpExtractor::new())).unwrap(); + + let result = registry.get("application/haansofthwpx"); + assert!( + matches!(result, Err(KreuzbergError::UnsupportedFormat(_))), + "application/haansofthwpx must not be routed to HwpExtractor" + ); + } } diff --git a/crates/kreuzberg/src/extractors/hwpx.rs b/crates/kreuzberg/src/extractors/hwpx.rs new file mode 100644 index 000000000..45a8449a9 --- /dev/null +++ b/crates/kreuzberg/src/extractors/hwpx.rs @@ -0,0 +1,192 @@ +//! Hangul Word Processor XML (.hwpx) extractor. +//! +//! Extracts text, headings, tables, and images from HWPX documents using the `unhwp` crate. + +use std::borrow::Cow; + +use async_trait::async_trait; +use bytes::Bytes; + +use crate::Result; +use crate::core::config::ExtractionConfig; +use crate::plugins::{DocumentExtractor, Plugin}; +use crate::types::ExtractedImage; +use crate::types::internal::InternalDocument; +use crate::types::internal_builder::InternalDocumentBuilder; + +/// Extractor for Hangul Word Processor XML (.hwpx) files. +/// +/// Supports HWPX (Open HWPML), the ZIP-based XML successor to the binary HWP 5.0 format. +pub struct HwpxExtractor; + +impl HwpxExtractor { + pub(crate) fn new() -> Self { + Self + } +} + +impl Default for HwpxExtractor { + fn default() -> Self { + Self::new() + } +} + +impl Plugin for HwpxExtractor { + fn name(&self) -> &str { + "hwpx-extractor" + } + + fn version(&self) -> String { + env!("CARGO_PKG_VERSION").to_string() + } + + fn initialize(&self) -> Result<()> { + Ok(()) + } + + fn shutdown(&self) -> Result<()> { + Ok(()) + } + + fn description(&self) -> &str { + "Hangul Word Processor XML (.hwpx) text extraction" + } + + fn author(&self) -> &str { + "Kreuzberg Team" + } +} + +fn mime_to_format(mime: &str) -> Cow<'static, str> { + match mime { + "image/png" => Cow::Borrowed("png"), + "image/jpeg" | "image/jpg" => Cow::Borrowed("jpeg"), + "image/gif" => Cow::Borrowed("gif"), + "image/bmp" => Cow::Borrowed("bmp"), + "image/webp" => Cow::Borrowed("webp"), + _ => Cow::Borrowed("bin"), + } +} + +fn build_hwpx_internal_document(doc: unhwp::model::Document, mime_type: &str) -> InternalDocument { + let mut builder = InternalDocumentBuilder::new("hwpx"); + builder.set_mime_type(Cow::Owned(mime_type.to_string())); + let mut image_index: usize = 0; + + for section in &doc.sections { + for block in §ion.content { + match block { + unhwp::model::Block::Paragraph(p) => { + if p.style.is_heading() && p.has_text_content() { + let text = p.plain_text(); + let trimmed = text.trim(); + if !trimmed.is_empty() { + builder.push_heading(p.style.heading_level, trimmed, None, None); + } + } else if p.has_text_content() { + let text = p.plain_text(); + let trimmed = text.trim(); + if !trimmed.is_empty() { + builder.push_paragraph(trimmed, vec![], None, None); + } + } + + for inline in &p.content { + if let unhwp::model::InlineContent::Image(img_ref) = inline + && let Some(resource) = doc.resources.get(&img_ref.id) + { + let image = ExtractedImage { + data: Bytes::from(resource.data.clone()), + format: mime_to_format(resource.mime_type.as_deref().unwrap_or("")), + image_index, + page_number: None, + width: img_ref.width, + height: img_ref.height, + colorspace: None, + bits_per_component: None, + is_mask: false, + description: img_ref.alt_text.clone(), + ocr_result: None, + bounding_box: None, + source_path: None, + image_kind: None, + kind_confidence: None, + cluster_id: None, + }; + builder.push_image(img_ref.alt_text.as_deref(), image, None, None); + image_index += 1; + } + } + } + unhwp::model::Block::Table(t) => { + if !t.rows.is_empty() { + let cells: Vec> = t + .rows + .iter() + .map(|row| row.cells.iter().map(|cell| cell.plain_text()).collect()) + .collect(); + builder.push_table_from_cells(&cells, None, None); + } + } + } + } + } + + builder.build() +} + +#[cfg_attr(not(target_arch = "wasm32"), async_trait)] +#[cfg_attr(target_arch = "wasm32", async_trait(?Send))] +impl DocumentExtractor for HwpxExtractor { + async fn extract_bytes( + &self, + content: &[u8], + mime_type: &str, + _config: &ExtractionConfig, + ) -> Result { + let doc = unhwp::parse_bytes(content) + .map_err(|e| crate::KreuzbergError::parsing(format!("Failed to parse HWPX: {e}")))?; + Ok(build_hwpx_internal_document(doc, mime_type)) + } + + fn supported_mime_types(&self) -> &[&str] { + &["application/haansofthwpx"] + } + + fn priority(&self) -> i32 { + 50 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hwpx_extractor_plugin_interface() { + let extractor = HwpxExtractor::new(); + assert_eq!(extractor.name(), "hwpx-extractor"); + assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION")); + assert_eq!(extractor.priority(), 50); + assert_eq!(extractor.supported_mime_types(), &["application/haansofthwpx"]); + } + + #[test] + fn test_hwpx_extractor_initialize_shutdown() { + let extractor = HwpxExtractor::new(); + assert!(extractor.initialize().is_ok()); + assert!(extractor.shutdown().is_ok()); + } + + #[cfg(feature = "hwp")] + #[test] + fn test_hwpx_not_routed_to_hwp_extractor() { + use crate::extractors::hwp::HwpExtractor; + + let hwp = HwpExtractor::new(); + assert!( + !hwp.supported_mime_types().contains(&"application/haansofthwpx"), + "HwpExtractor must not claim application/haansofthwpx" + ); + } +} diff --git a/crates/kreuzberg/src/extractors/mod.rs b/crates/kreuzberg/src/extractors/mod.rs index f2da8aa22..e6eb15ca7 100644 --- a/crates/kreuzberg/src/extractors/mod.rs +++ b/crates/kreuzberg/src/extractors/mod.rs @@ -97,6 +97,9 @@ pub mod excel; #[cfg(feature = "hwp")] pub mod hwp; +#[cfg(feature = "hwpx")] +pub mod hwpx; + #[cfg(feature = "iwork")] pub mod iwork; @@ -197,6 +200,9 @@ pub use excel::ExcelExtractor; #[cfg(feature = "hwp")] pub use hwp::HwpExtractor; +#[cfg(feature = "hwpx")] +pub use hwpx::HwpxExtractor; + #[cfg(feature = "iwork")] pub use iwork::{keynote::KeynoteExtractor, numbers::NumbersExtractor, pages::PagesExtractor}; @@ -368,6 +374,11 @@ pub(crate) fn register_default_extractors() -> Result<()> { registry.register(Arc::new(HwpExtractor::new()))?; } + #[cfg(feature = "hwpx")] + { + registry.register(Arc::new(HwpxExtractor::new()))?; + } + #[cfg(feature = "iwork")] { registry.register(Arc::new(PagesExtractor::new()))?; diff --git a/fixtures/format_specific/format_hwpx_standalone.json b/fixtures/format_specific/format_hwpx_standalone.json new file mode 100644 index 000000000..97d1b7274 --- /dev/null +++ b/fixtures/format_specific/format_hwpx_standalone.json @@ -0,0 +1,26 @@ +{ + "id": "format_hwpx_standalone", + "category": "format_specific", + "description": "Standalone HWPX extraction using extract_bytes_sync", + "tags": ["format_specific", "hwpx", "text_extraction"], + "call": "extract_bytes_sync", + "input": { + "data": "hwpx/simple.hwpx", + "mime_type": "application/haansofthwpx" + }, + "assertions": [ + { + "type": "not_error" + }, + { + "type": "min_length", + "field": "content", + "value": 20 + }, + { + "type": "contains", + "field": "content", + "value": "Hello from HWPX" + } + ] +} diff --git a/test_documents/hwpx/simple.hwpx b/test_documents/hwpx/simple.hwpx new file mode 100644 index 000000000..1e8963a34 Binary files /dev/null and b/test_documents/hwpx/simple.hwpx differ