feat(datafusion): Support insert_into in IcebergTableProvider #1511

Draft · wants to merge 36 commits into base: main

Commits (36)
a5593b4
Support Datafusion insert_into
CTTY Jun 26, 2025
558b402
cleanup
CTTY Jul 15, 2025
847a2bb
minor
CTTY Jul 15, 2025
b067656
minor
CTTY Jul 15, 2025
f52a698
clippy ftw
CTTY Jul 15, 2025
d367a7c
minor
CTTY Jul 16, 2025
99af430
minor
CTTY Jul 16, 2025
2f9efa8
i luv cleaning up
CTTY Jul 16, 2025
9d7c1c3
fmt not working?
CTTY Jul 16, 2025
41a75bd
Merge branch 'main' into ctty/df-insert
CTTY Jul 16, 2025
e25f888
do not expose serde
CTTY Jul 16, 2025
b554701
cut it down
CTTY Jul 16, 2025
77b349b
Use stricter wrapper data file wrapper
CTTY Jul 16, 2025
88afe82
fix partitioning, and fmt ofc
CTTY Jul 16, 2025
295e9b6
minor
CTTY Jul 17, 2025
92588f5
partitioned shall not pass
CTTY Jul 17, 2025
7db9432
implement children and with_new_children for write node, fix fmt
CTTY Jul 17, 2025
6bd624c
Merge branch 'main' into ctty/df-insert
CTTY Jul 17, 2025
8c78046
get row counts from data files directly
CTTY Jul 17, 2025
724ec7d
Update crates/integrations/datafusion/src/physical_plan/write.rs
CTTY Jul 21, 2025
2f56169
Update crates/integrations/datafusion/src/physical_plan/commit.rs
CTTY Jul 21, 2025
273a164
Merge branch 'main' into ctty/df-insert
CTTY Jul 21, 2025
53b8b82
fix fmt, input boundedness
CTTY Jul 21, 2025
d2168f2
make data_files constant
CTTY Jul 21, 2025
59a3428
use format version when serde datafiles
CTTY Jul 21, 2025
3b4dc9d
use try_new instead
CTTY Jul 21, 2025
2b1c3df
minor
CTTY Jul 21, 2025
db20df1
coalesce partitions
CTTY Jul 21, 2025
e56ab4e
minor
CTTY Jul 21, 2025
04a44b3
fmt
CTTY Jul 21, 2025
c5b1c38
rolling
CTTY Jul 22, 2025
0f9bce0
rolling in the deep
CTTY Jul 22, 2025
d8f05cf
rolls the unit tests
CTTY Jul 22, 2025
1ea4a0f
could have it all for tests
CTTY Jul 22, 2025
5001e07
Merge branch 'main' into ctty/df-insert
CTTY Jul 22, 2025
078f458
Merge branch 'main' into ctty/df-insert
CTTY Jul 23, 2025
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

3 changes: 3 additions & 0 deletions crates/iceberg/src/arrow/nan_val_cnt_visitor.rs
@@ -159,6 +159,9 @@ impl NanValueCountVisitor {
let arrow_arr_partner_accessor = ArrowArrayAccessor {};

let struct_arr = Arc::new(StructArray::from(batch)) as ArrayRef;
// todo remove these log lines
println!("----StructArray from record stream: {:?}", struct_arr);
println!("----Schema.as_struct from table: {:?}", schema.as_struct());
Comment on lines +163 to +164

Contributor: We should use `log` here.

Contributor Author: This is for testing only; I'm planning to remove these log lines.
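A minimal sketch of the reviewer's suggestion, assuming the `log` facade is (or becomes) available as a dependency here; the messages then only show up when the embedding application installs a logger with the debug level enabled:

```rust
// Hypothetical replacement for the temporary println! calls above; same
// format arguments, but routed through the `log` facade at debug level.
log::debug!("StructArray from record stream: {:?}", struct_arr);
log::debug!("Schema.as_struct from table: {:?}", schema.as_struct());
```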

visit_struct_with_partner(
schema.as_struct(),
&struct_arr,
14 changes: 12 additions & 2 deletions crates/iceberg/src/arrow/value.rs
@@ -440,10 +440,12 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
Ok(schema_partner)
}

// TODO: generate field_pos in DataFusion instead of passing it in here
Contributor Author: I found it tricky to handle this case: the input from DataFusion won't have field IDs, and we will need to assign them manually. Maybe there is a way to do name mapping here?

Contributor: Could you help me understand why we need to change this?

Contributor Author: This is a temporary hack for an issue that I don't know exactly how to fix yet: the RecordBatch from DataFusion won't have PARQUET_FIELD_ID_META_KEY in its schema's metadata, which causes the schema visiting to fail here.

I'm thinking maybe we can bind the schema in DataFusion via name mapping, but I haven't had the chance to explore that further.

Contributor: Why do we need to convert the RecordBatch's schema to an Iceberg schema?

Contributor: The method you mentioned is typically used to convert a Parquet file's schema to an Iceberg schema.

Contributor Author: This method is used when the ParquetWriter writes a RecordBatch. When it counts NaN values, it needs to walk through both the RecordBatch's schema and the Iceberg schema in a partner fashion:

.compute(self.schema.clone(), batch_c)?;

Basically the call stack is NanValueCountVisitor::compute -> visit_struct_with_partner -> ArrowArrayAccessor::field_partner -> get_field_id

Contributor: Thanks for the explanation, that makes sense to me. We need a separate issue to solve this.
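A hedged sketch of the name-mapping idea discussed above: before handing a batch to the writer, re-attach PARQUET_FIELD_ID_META_KEY metadata to the Arrow schema by matching top-level field names against the Iceberg schema, so the existing id-based lookup can succeed. The helper name and the `field_by_name` accessor are illustrative assumptions, and nested fields would need a recursive variant:

```rust
use arrow_schema::{Field, Schema as ArrowSchema};
use iceberg::spec::Schema as IcebergSchema;
use parquet::arrow::PARQUET_FIELD_ID_META_KEY;

/// Hypothetical helper: tag each top-level Arrow field with the id of the
/// Iceberg field of the same name, so id-based schema visiting works even
/// when DataFusion supplies a schema without field-id metadata.
fn attach_field_ids_by_name(arrow: &ArrowSchema, table_schema: &IcebergSchema) -> ArrowSchema {
    let fields: Vec<Field> = arrow
        .fields()
        .iter()
        .map(|f| {
            let mut metadata = f.metadata().clone();
            if let Some(iceberg_field) = table_schema.field_by_name(f.name()) {
                // The same metadata key the Arrow <-> Iceberg schema code uses
                // to carry field ids.
                metadata.insert(
                    PARQUET_FIELD_ID_META_KEY.to_string(),
                    iceberg_field.id.to_string(),
                );
            }
            f.as_ref().clone().with_metadata(metadata)
        })
        .collect();
    ArrowSchema::new(fields)
}
```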

fn field_partner<'a>(
&self,
struct_partner: &'a ArrayRef,
field: &NestedField,
field_pos: Option<usize>,
) -> Result<&'a ArrayRef> {
let struct_array = struct_partner
.as_any()
@@ -455,6 +457,14 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
)
})?;

// todo remove unneeded log lines
println!(
"!!!Accessor struct array from struct partner: {:?}",
struct_array
);

println!("!!!field: {:?}", field);

let field_pos = struct_array
.fields()
.iter()
@@ -463,12 +473,12 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
.map(|id| id == field.id)
.unwrap_or(false)
})
- .ok_or_else(|| {
+ .unwrap_or(field_pos.ok_or_else(|| {
Error::new(
ErrorKind::DataInvalid,
format!("Field id {} not found in struct array", field.id),
)
- })?;
+ })?);

Ok(struct_array.column(field_pos))
}
14 changes: 9 additions & 5 deletions crates/iceberg/src/spec/manifest/_serde.rs
@@ -21,7 +21,7 @@ use serde_derive::{Deserialize, Serialize};
use serde_with::serde_as;

use super::{Datum, ManifestEntry, Schema, Struct};
- use crate::spec::{Literal, RawLiteral, StructType, Type};
+ use crate::spec::{FormatVersion, Literal, RawLiteral, StructType, Type};
use crate::{Error, ErrorKind};

#[derive(Serialize, Deserialize)]
@@ -40,7 +40,7 @@ impl ManifestEntryV2 {
snapshot_id: value.snapshot_id,
sequence_number: value.sequence_number,
file_sequence_number: value.file_sequence_number,
- data_file: DataFileSerde::try_from(value.data_file, partition_type, false)?,
+ data_file: DataFileSerde::try_from(value.data_file, partition_type, FormatVersion::V2)?,
})
}

@@ -74,7 +74,7 @@ impl ManifestEntryV1 {
Ok(Self {
status: value.status as i32,
snapshot_id: value.snapshot_id.unwrap_or_default(),
- data_file: DataFileSerde::try_from(value.data_file, partition_type, true)?,
+ data_file: DataFileSerde::try_from(value.data_file, partition_type, FormatVersion::V1)?,
})
}

@@ -129,9 +129,13 @@ impl DataFileSerde {
pub fn try_from(
value: super::DataFile,
partition_type: &StructType,
- is_version_1: bool,
+ format_version: FormatVersion,
) -> Result<Self, Error> {
- let block_size_in_bytes = if is_version_1 { Some(0) } else { None };
+ let block_size_in_bytes = if format_version == FormatVersion::V1 {
+ Some(0)
+ } else {
+ None
+ };
Ok(Self {
content: value.content as i32,
file_path: value.file_path,
8 changes: 6 additions & 2 deletions crates/iceberg/src/spec/manifest/data_file.rs
@@ -297,8 +297,12 @@ pub fn write_data_files_to_avro<W: Write>(
let mut writer = AvroWriter::new(&avro_schema, writer);

for data_file in data_files {
- let value = to_value(DataFileSerde::try_from(data_file, partition_type, true)?)?
- .resolve(&avro_schema)?;
+ let value = to_value(DataFileSerde::try_from(
+ data_file,
+ partition_type,
+ FormatVersion::V1,
+ )?)?
+ .resolve(&avro_schema)?;
writer.append(value)?;
}

151 changes: 150 additions & 1 deletion crates/iceberg/src/spec/manifest/mod.rs
@@ -33,7 +33,7 @@ use super::{
Datum, FormatVersion, ManifestContentType, PartitionSpec, PrimitiveType, Schema, Struct,
UNASSIGNED_SEQUENCE_NUMBER,
};
- use crate::error::Result;
+ use crate::error::{Error, ErrorKind, Result};

/// A manifest contains metadata and a list of entries.
#[derive(Debug, PartialEq, Eq, Clone)]
@@ -119,12 +119,45 @@ impl Manifest {
}
}

/// Serialize a DataFile to a JSON string.
pub fn serialize_data_file_to_json(
data_file: DataFile,
partition_type: &super::StructType,
format_version: FormatVersion,
) -> Result<String> {
let serde = _serde::DataFileSerde::try_from(data_file, partition_type, format_version)?;
serde_json::to_string(&serde).map_err(|e| {
Error::new(
ErrorKind::DataInvalid,
format!("Failed to serialize DataFile to JSON: {}", e),
)
})
}

/// Deserialize a DataFile from a JSON string.
pub fn deserialize_data_file_from_json(
json: &str,
partition_spec_id: i32,
partition_type: &super::StructType,
schema: &Schema,
) -> Result<DataFile> {
let serde = serde_json::from_str::<_serde::DataFileSerde>(json).map_err(|e| {
Error::new(
ErrorKind::DataInvalid,
format!("Failed to deserialize JSON to DataFile: {}", e),
)
})?;

serde.try_into(partition_spec_id, partition_type, schema)
}

#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::fs;
use std::sync::Arc;

use arrow_array::StringArray;
use tempfile::TempDir;

use super::*;
@@ -1056,4 +1089,120 @@ mod tests {
assert!(!partitions[2].clone().contains_null);
assert_eq!(partitions[2].clone().contains_nan, Some(false));
}

#[test]
fn test_data_file_serialization() {
// Create a simple schema
let schema = Schema::builder()
.with_schema_id(1)
.with_identifier_field_ids(vec![1])
.with_fields(vec![
crate::spec::NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))
.into(),
crate::spec::NestedField::required(
2,
"name",
Type::Primitive(PrimitiveType::String),
)
.into(),
])
.build()
.unwrap();

// Create a partition spec
let partition_spec = PartitionSpec::builder(schema.clone())
.with_spec_id(1)
.add_partition_field("id", "id_partition", crate::spec::Transform::Identity)
.unwrap()
.build()
.unwrap();

// Get partition type from the partition spec
let partition_type = partition_spec.partition_type(&schema).unwrap();

// Create a vector of DataFile objects
let data_files = vec![
DataFileBuilder::default()
.content(crate::spec::DataContentType::Data)
.file_format(DataFileFormat::Parquet)
.file_path("path/to/file1.parquet".to_string())
.file_size_in_bytes(1024)
.record_count(100)
.partition_spec_id(1)
.partition(Struct::empty())
.column_sizes(HashMap::from([(1, 512), (2, 512)]))
.value_counts(HashMap::from([(1, 100), (2, 100)]))
.null_value_counts(HashMap::from([(1, 0), (2, 0)]))
.build()
.unwrap(),
DataFileBuilder::default()
.content(crate::spec::DataContentType::Data)
.file_format(DataFileFormat::Parquet)
.file_path("path/to/file2.parquet".to_string())
.file_size_in_bytes(2048)
.record_count(200)
.partition_spec_id(1)
.partition(Struct::empty())
.column_sizes(HashMap::from([(1, 1024), (2, 1024)]))
.value_counts(HashMap::from([(1, 200), (2, 200)]))
.null_value_counts(HashMap::from([(1, 10), (2, 5)]))
.build()
.unwrap(),
];

// Serialize the DataFile objects
let serialized_files = data_files
.into_iter()
.map(|f| {
let json =
serialize_data_file_to_json(f, &partition_type, FormatVersion::V2).unwrap();
println!("Test serialized data file: {}", json);
json
})
.collect::<Vec<String>>();

// Verify we have the expected number of serialized files
assert_eq!(serialized_files.len(), 2);

// Verify each serialized file contains expected data
for json in &serialized_files {
assert!(json.contains("path/to/file"));
Contributor: nit: Why not assert the JSON output? We could use a snapshot test to make it easier; see https://docs.rs/expect-test/latest/expect_test/

Contributor Author: I think a snapshot test makes more sense.
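A minimal sketch of the suggested snapshot assertion, assuming expect-test is added as a dev-dependency; the empty expected literal is a placeholder that the tooling fills in:

```rust
use expect_test::expect;

// Sketch only: with expect-test the expected JSON lives inline in the test.
// Running the test once with UPDATE_EXPECT=1 rewrites the empty literal below
// with the actual serialized output, which then serves as the snapshot.
let expected = expect![[r#""#]];
expected.assert_eq(&serialized_files[0]);
```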

assert!(json.contains("parquet"));
assert!(json.contains("record_count"));
assert!(json.contains("file_size_in_bytes"));
}

// Convert Vec<String> to StringArray and print it
let string_array = StringArray::from(serialized_files.clone());
println!("StringArray: {:?}", string_array);

// Now deserialize the JSON strings back into DataFile objects
println!("\nDeserializing back to DataFile objects:");
let deserialized_files: Vec<DataFile> = serialized_files
.into_iter()
.map(|json| {
let data_file = deserialize_data_file_from_json(
&json,
partition_spec.spec_id(),
&partition_type,
&schema,
)
.unwrap();

println!("Deserialized DataFile: {:?}", data_file);
data_file
})
.collect();

// Verify we have the expected number of deserialized files
assert_eq!(deserialized_files.len(), 2);

// Verify the deserialized files have the expected properties
for file in &deserialized_files {
assert_eq!(file.content_type(), crate::spec::DataContentType::Data);
assert_eq!(file.file_format(), DataFileFormat::Parquet);
assert!(file.file_path().contains("path/to/file"));
assert!(file.record_count() == 100 || file.record_count() == 200);
}
}
}
11 changes: 8 additions & 3 deletions crates/iceberg/src/spec/schema/visitor.rs
@@ -192,7 +192,12 @@ pub trait PartnerAccessor<P> {
/// Get the struct partner from schema partner.
fn struct_parner<'a>(&self, schema_partner: &'a P) -> Result<&'a P>;
/// Get the field partner from struct partner.
- fn field_partner<'a>(&self, struct_partner: &'a P, field: &NestedField) -> Result<&'a P>;
+ fn field_partner<'a>(
+ &self,
+ struct_partner: &'a P,
+ field: &NestedField,
+ field_pos: Option<usize>,
+ ) -> Result<&'a P>;
/// Get the list element partner from list partner.
fn list_element_partner<'a>(&self, list_partner: &'a P) -> Result<&'a P>;
/// Get the map key partner from map partner.
@@ -253,8 +258,8 @@ pub fn visit_struct_with_partner<P, V: SchemaWithPartnerVisitor<P>, A: PartnerAc
accessor: &A,
) -> Result<V::T> {
let mut results = Vec::with_capacity(s.fields().len());
- for field in s.fields() {
- let field_partner = accessor.field_partner(partner, field)?;
+ for (pos, field) in s.fields().iter().enumerate() {
+ let field_partner = accessor.field_partner(partner, field, Some(pos))?;
visitor.before_struct_field(field, field_partner)?;
let result = visit_type_with_partner(&field.field_type, field_partner, visitor, accessor)?;
visitor.after_struct_field(field, field_partner)?;
7 changes: 7 additions & 0 deletions crates/iceberg/src/spec/table_metadata.rs
@@ -119,6 +119,13 @@ pub const PROPERTY_COMMIT_TOTAL_RETRY_TIME_MS: &str = "commit.retry.total-timeou
/// Default value for total maximum retry time (ms).
pub const PROPERTY_COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT: u64 = 30 * 60 * 1000; // 30 minutes

/// Default file format for data files
pub const PROPERTY_DEFAULT_FILE_FORMAT: &str = "write.format.default";
/// Default file format for delete files
pub const PROPERTY_DELETE_DEFAULT_FILE_FORMAT: &str = "write.delete.format.default";
/// Default value for data file format
pub const PROPERTY_DEFAULT_FILE_FORMAT_DEFAULT: &str = "parquet";

/// Reference to [`TableMetadata`].
pub type TableMetadataRef = Arc<TableMetadata>;

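The new properties above feed the write path's choice of data file format; a minimal sketch of that lookup, assuming the table's properties are exposed as a plain string map:

```rust
use std::collections::HashMap;

// Sketch only: resolve the configured data file format name, falling back to
// the documented default ("parquet") when write.format.default is unset.
// The keys are the constants introduced in this file.
fn resolve_write_format(properties: &HashMap<String, String>) -> &str {
    properties
        .get(PROPERTY_DEFAULT_FILE_FORMAT)
        .map(String::as_str)
        .unwrap_or(PROPERTY_DEFAULT_FILE_FORMAT_DEFAULT)
}
```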
2 changes: 2 additions & 0 deletions crates/iceberg/src/writer/base_writer/mod.rs
@@ -19,3 +19,5 @@

pub mod data_file_writer;
pub mod equality_delete_writer;
/// Module providing writers that can automatically roll over to new files based on size thresholds.
pub mod rolling_writer;
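Since the rolling_writer module is only declared here, a self-contained conceptual sketch of the rolling pattern may help; this is not the crate's actual API, just the bookkeeping it implies: track bytes written and signal when the current data file should be closed and a new one started.

```rust
/// Conceptual sketch of a size-based rolling policy (hypothetical type, not
/// the crate's rolling writer API).
struct RollingPolicy {
    target_size_bytes: u64,
    written_in_current_file: u64,
}

impl RollingPolicy {
    fn new(target_size_bytes: u64) -> Self {
        Self { target_size_bytes, written_in_current_file: 0 }
    }

    /// Record a write of `bytes`; returns true when the caller should close
    /// the current data file and start a new one before the next write.
    fn record_write(&mut self, bytes: u64) -> bool {
        self.written_in_current_file += bytes;
        if self.written_in_current_file >= self.target_size_bytes {
            self.written_in_current_file = 0;
            true
        } else {
            false
        }
    }
}
```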