Merged
34 commits
4198db9
feat: wrong version which did weird stuff
discord9 Dec 19, 2025
3bf503a
feat: gather filter support alias
discord9 Dec 25, 2025
f4f9b19
feat: add support for detecting unknown columns in filter pushdown & …
discord9 Dec 25, 2025
df19fc2
feat: update projection alias handling and enhance PhysicalColumnRewr…
discord9 Dec 25, 2025
85a3c12
feat: update deeply nested expression helper function and enhance tes…
discord9 Dec 25, 2025
c60bf05
chore: clippy
discord9 Dec 25, 2025
5c51005
typo
discord9 Dec 25, 2025
31df516
feat: update test assertions for filter pushdown to reflect expected …
discord9 Dec 25, 2025
ae6a63c
c
discord9 Dec 25, 2025
085e2ba
c
discord9 Dec 25, 2025
e7e3a7a
clippy
discord9 Dec 26, 2025
8b3990f
test: update sqllogic test result
discord9 Dec 26, 2025
1dbd0a4
test: more complex dyn filter
discord9 Dec 26, 2025
392ecb9
c
discord9 Dec 26, 2025
49945bf
refactor: rename function have_unknown_columns to has_unknown_columns…
discord9 Jan 4, 2026
f42c2a3
test: topk with projection
discord9 Jan 5, 2026
f6489f1
test: slt test for projection dyn filter
discord9 Jan 5, 2026
4680cfa
chore
discord9 Jan 5, 2026
747f2b9
test: ignore time
discord9 Jan 5, 2026
98afab7
chore: fmt
discord9 Jan 5, 2026
fa2ac40
test: more slt test
discord9 Jan 5, 2026
5175fd0
test: fix
discord9 Jan 7, 2026
17adf3f
test: more ignore
discord9 Jan 7, 2026
64adaae
test: more ignore&proper sql
discord9 Jan 7, 2026
94b762b
feat: unmap column not pushdown
discord9 Jan 9, 2026
3fb15e4
clippy
discord9 Jan 9, 2026
2ed6fe0
chore
discord9 Jan 9, 2026
5a43014
test: add pushdown assert
discord9 Jan 9, 2026
1839cd1
refactor: ref column map
discord9 Jan 9, 2026
d3a9259
refactor: per review
discord9 Jan 20, 2026
cbb0ab5
clippy
discord9 Jan 20, 2026
a2cebc3
what
discord9 Jan 20, 2026
6e8588b
test: slt update
discord9 Jan 20, 2026
1cb3fbb
refactor: rename per review
discord9 Jan 22, 2026
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions datafusion/core/Cargo.toml
@@ -175,6 +175,7 @@ env_logger = { workspace = true }
glob = { workspace = true }
insta = { workspace = true }
paste = { workspace = true }
pretty_assertions = "1.0"
Contributor

This appears to already be used elsewhere (this is not a net new dependency), so I think it is ok to add.

rand = { workspace = true, features = ["small_rng"] }
rand_distr = "0.5"
recursive = { workspace = true }
229 changes: 229 additions & 0 deletions datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs
@@ -59,6 +59,7 @@ use datafusion_physical_plan::{
coalesce_partitions::CoalescePartitionsExec,
collect,
filter::{FilterExec, FilterExecBuilder},
projection::ProjectionExec,
repartition::RepartitionExec,
sorts::sort::SortExec,
};
@@ -1826,6 +1827,234 @@ fn schema() -> SchemaRef {
Arc::clone(&TEST_SCHEMA)
}

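/// Inputs for one projection + TopK dynamic-filter scenario: the scan schema and
/// batches, the projection expressions (with output names), the sort expression
/// driving the TopK, and the plans expected before execution and after each polled batch.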
struct ProjectionDynFilterTestCase {
schema: SchemaRef,
batches: Vec<RecordBatch>,
projection: Vec<(Arc<dyn PhysicalExpr>, String)>,
sort_expr: PhysicalSortExpr,
expected_plans: Vec<String>,
}

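/// Builds scan -> projection -> TopK(fetch=2), applies the post-optimization
/// `FilterPushdown` pass, and asserts the optimized plan matches `expected_plans[0]`.
/// It then executes the plan with a small batch size and re-asserts the formatted
/// plan against each remaining entry in `expected_plans`, so the test can observe
/// how the TopK's dynamic filter is rewritten through the projection as rows arrive.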
async fn run_projection_dyn_filter_case(case: ProjectionDynFilterTestCase) {
let ProjectionDynFilterTestCase {
schema,
batches,
projection,
sort_expr,
expected_plans,
} = case;

let scan = TestScanBuilder::new(Arc::clone(&schema))
.with_support(true)
.with_batches(batches)
.build();

let projection_exec = Arc::new(ProjectionExec::try_new(projection, scan).unwrap());

let sort = Arc::new(
SortExec::new(LexOrdering::new(vec![sort_expr]).unwrap(), projection_exec)
.with_fetch(Some(2)),
) as Arc<dyn ExecutionPlan>;

let mut config = ConfigOptions::default();
config.execution.parquet.pushdown_filters = true;
config.optimizer.enable_dynamic_filter_pushdown = true;

let optimized_plan = FilterPushdown::new_post_optimization()
.optimize(Arc::clone(&sort), &config)
.unwrap();

pretty_assertions::assert_eq!(
format_plan_for_test(&optimized_plan).trim(),
expected_plans[0].trim()
);

let config = SessionConfig::new().with_batch_size(2);
let session_ctx = SessionContext::new_with_config(config);
session_ctx.register_object_store(
ObjectStoreUrl::parse("test://").unwrap().as_ref(),
Arc::new(InMemory::new()),
);
let state = session_ctx.state();
let task_ctx = state.task_ctx();
let mut stream = optimized_plan.execute(0, Arc::clone(&task_ctx)).unwrap();
for (idx, expected_plan) in expected_plans.iter().enumerate().skip(1) {
stream.next().await.unwrap().unwrap();
let formatted_plan = format_plan_for_test(&optimized_plan);
pretty_assertions::assert_eq!(
formatted_plan.trim(),
expected_plan.trim(),
"Mismatch at iteration {}",
idx
);
}
}

#[tokio::test]
async fn test_topk_with_projection_transformation_on_dyn_filter() {
let schema = Arc::new(Schema::new(vec![
Field::new("a", DataType::Int32, false),
Field::new("b", DataType::Utf8, false),
Field::new("c", DataType::Float64, false),
]));
let simple_abc = vec![
record_batch!(
("a", Int32, [1, 2, 3]),
("b", Utf8, ["x", "y", "z"]),
("c", Float64, [1.0, 2.0, 3.0])
)
.unwrap(),
];

// Case 1: Reordering [b, a]
run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
schema: Arc::clone(&schema),
batches: simple_abc.clone(),
projection: vec![
(col("b", &schema).unwrap(), "b".to_string()),
(col("a", &schema).unwrap(), "a".to_string()),
],
sort_expr: PhysicalSortExpr::new(
Arc::new(Column::new("a", 1)),
SortOptions::default(),
),
expected_plans: vec![
r#" - SortExec: TopK(fetch=2), expr=[a@1 ASC], preserve_partitioning=[false]
- ProjectionExec: expr=[b@1 as b, a@0 as a]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
r#" - SortExec: TopK(fetch=2), expr=[a@1 ASC], preserve_partitioning=[false], filter=[a@1 IS NULL OR a@1 < 2]
- ProjectionExec: expr=[b@1 as b, a@0 as a]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 IS NULL OR a@0 < 2 ]"#.to_string()]
})
.await;

// Case 2: Pruning [a]
run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
schema: Arc::clone(&schema),
batches: simple_abc.clone(),
projection: vec![(col("a", &schema).unwrap(), "a".to_string())],
sort_expr: PhysicalSortExpr::new(
Arc::new(Column::new("a", 0)),
SortOptions::default(),
),
expected_plans: vec![
r#" - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false]
- ProjectionExec: expr=[a@0 as a]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
r#" - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false], filter=[a@0 IS NULL OR a@0 < 2]
- ProjectionExec: expr=[a@0 as a]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 IS NULL OR a@0 < 2 ]"#.to_string(),
],
})
.await;

// Case 3: Identity [a, b]
run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
schema: Arc::clone(&schema),
batches: simple_abc.clone(),
projection: vec![
(col("a", &schema).unwrap(), "a".to_string()),
(col("b", &schema).unwrap(), "b".to_string()),
],
sort_expr: PhysicalSortExpr::new(
Arc::new(Column::new("a", 0)),
SortOptions::default(),
),
expected_plans: vec![
r#" - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false]
- ProjectionExec: expr=[a@0 as a, b@1 as b]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
r#" - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false], filter=[a@0 IS NULL OR a@0 < 2]
- ProjectionExec: expr=[a@0 as a, b@1 as b]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 IS NULL OR a@0 < 2 ]"#.to_string(),
],
})
.await;

// Case 4: Expressions [a + 1, b]
run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
schema: Arc::clone(&schema),
batches: simple_abc.clone(),
projection: vec![
(
Arc::new(BinaryExpr::new(
col("a", &schema).unwrap(),
Operator::Plus,
Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
)),
"a_plus_1".to_string(),
),
(col("b", &schema).unwrap(), "b".to_string()),
],
sort_expr: PhysicalSortExpr::new(
Arc::new(Column::new("a_plus_1", 0)),
SortOptions::default(),
),
expected_plans: vec![
r#" - SortExec: TopK(fetch=2), expr=[a_plus_1@0 ASC], preserve_partitioning=[false]
- ProjectionExec: expr=[a@0 + 1 as a_plus_1, b@1 as b]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
r#" - SortExec: TopK(fetch=2), expr=[a_plus_1@0 ASC], preserve_partitioning=[false], filter=[a_plus_1@0 IS NULL OR a_plus_1@0 < 3]
- ProjectionExec: expr=[a@0 + 1 as a_plus_1, b@1 as b]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 + 1 IS NULL OR a@0 + 1 < 3 ]"#.to_string(),
],
})
.await;

// Case 5: [a as b, b as a] (swapped columns)
run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
schema: Arc::clone(&schema),
batches: simple_abc.clone(),
projection: vec![
(col("a", &schema).unwrap(), "b".to_string()),
(col("b", &schema).unwrap(), "a".to_string()),
],
sort_expr: PhysicalSortExpr::new(
Arc::new(Column::new("b", 0)),
SortOptions::default(),
),
expected_plans: vec![
r#" - SortExec: TopK(fetch=2), expr=[b@0 ASC], preserve_partitioning=[false]
- ProjectionExec: expr=[a@0 as b, b@1 as a]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
r#" - SortExec: TopK(fetch=2), expr=[b@0 ASC], preserve_partitioning=[false], filter=[b@0 IS NULL OR b@0 < 2]
- ProjectionExec: expr=[a@0 as b, b@1 as a]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 IS NULL OR a@0 < 2 ]"#.to_string(),
],
})
.await;

// Case 6: Confusing expr [a + 1 as a, b]
run_projection_dyn_filter_case(ProjectionDynFilterTestCase {
schema: Arc::clone(&schema),
batches: simple_abc.clone(),
projection: vec![
(
Arc::new(BinaryExpr::new(
col("a", &schema).unwrap(),
Operator::Plus,
Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
)),
"a".to_string(),
),
(col("b", &schema).unwrap(), "b".to_string()),
],
sort_expr: PhysicalSortExpr::new(
Arc::new(Column::new("a", 0)),
SortOptions::default(),
),
expected_plans: vec![
r#" - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false]
- ProjectionExec: expr=[a@0 + 1 as a, b@1 as b]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ empty ]"#.to_string(),
r#" - SortExec: TopK(fetch=2), expr=[a@0 ASC], preserve_partitioning=[false], filter=[a@0 IS NULL OR a@0 < 3]
- ProjectionExec: expr=[a@0 + 1 as a, b@1 as b]
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 + 1 IS NULL OR a@0 + 1 < 3 ]"#.to_string(),
],
})
.await;
}

/// Returns a predicate that is a binary expression col = lit
fn col_lit_predicate(
column_name: &str,
76 changes: 59 additions & 17 deletions datafusion/physical-plan/src/filter_pushdown.rs
@@ -37,8 +37,12 @@
use std::collections::HashSet;
use std::sync::Arc;

-use datafusion_common::Result;
-use datafusion_physical_expr::utils::{collect_columns, reassign_expr_columns};
+use arrow_schema::Schema;
+use datafusion_common::{
+    Result,
+    tree_node::{TreeNode, TreeNodeRecursion},
+};
+use datafusion_physical_expr::{expressions::Column, utils::reassign_expr_columns};
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
use itertools::Itertools;

@@ -306,6 +310,56 @@ pub struct ChildFilterDescription {
pub(crate) self_filters: Vec<Arc<dyn PhysicalExpr>>,
}

/// A utility for checking whether a filter expression can be pushed down
/// to a child node based on column availability.
///
/// This checker validates that all columns referenced in a filter expression
/// exist in the target schema. If any column in the filter is not present
/// in the schema, the filter cannot be pushed down to that child.
pub struct FilterColumnChecker<'a> {
column_names: HashSet<&'a str>,
}

impl<'a> FilterColumnChecker<'a> {
/// Creates a new [`FilterColumnChecker`] from the given schema.
///
/// Extracts all column names from the schema's fields to build
/// a lookup set for efficient column existence checks.
pub(crate) fn new(input_schema: &'a Schema) -> Self {
let column_names: HashSet<&str> = input_schema
.fields()
.iter()
.map(|f| f.name().as_str())
.collect();
Self { column_names }
}

/// Checks whether a filter expression can be pushed down to the child
/// whose schema was used to create this checker.
///
/// Returns `true` if all [`Column`] references in the filter expression
/// exist in the target schema, `false` otherwise.
///
/// This method traverses the entire expression tree, checking each
/// column reference against the available column names.
pub(crate) fn can_pushdown(&self, filter: &Arc<dyn PhysicalExpr>) -> bool {
let mut can_apply = true;
filter
.apply(|expr| {
if let Some(column) = expr.as_any().downcast_ref::<Column>()
&& !self.column_names.contains(column.name())
{
can_apply = false;
return Ok(TreeNodeRecursion::Stop);
}

Ok(TreeNodeRecursion::Continue)
})
.expect("infallible traversal");
can_apply
}
}

impl ChildFilterDescription {
/// Build a child filter description by analyzing which parent filters can be pushed to a specific child.
///
@@ -320,26 +374,14 @@
) -> Result<Self> {
let child_schema = child.schema();

-        // Get column names from child schema for quick lookup
-        let child_column_names: HashSet<&str> = child_schema
-            .fields()
-            .iter()
-            .map(|f| f.name().as_str())
-            .collect();
+        // Build a set of column names in the child schema for quick lookup
+        let checker = FilterColumnChecker::new(&child_schema);

// Analyze each parent filter
let mut child_parent_filters = Vec::with_capacity(parent_filters.len());

for filter in parent_filters {
-            // Check which columns the filter references
-            let referenced_columns = collect_columns(filter);
-
-            // Check if all referenced columns exist in the child schema
-            let all_columns_exist = referenced_columns
-                .iter()
-                .all(|col| child_column_names.contains(col.name()));
-
-            if all_columns_exist {
+            if checker.can_pushdown(filter) {
// All columns exist in child - we can push down
// Need to reassign column indices to match child schema
let reassigned_filter =
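For reference, here is a minimal standalone sketch of the column-availability check that the new FilterColumnChecker / can_pushdown pair performs. The checker itself is crate-private, so the free function, the main driver, and the example schemas below are illustrative assumptions rather than code from this PR: a filter is eligible for pushdown to a child only if every Column it references exists in that child's schema, and traversal stops at the first unknown column.

use std::collections::HashSet;
use std::sync::Arc;

use arrow_schema::{DataType, Field, Schema};
use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
use datafusion_physical_expr::expressions::{col, Column};
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;

/// Returns true only if every `Column` referenced by `filter` exists in `child_schema`.
fn all_filter_columns_exist(child_schema: &Schema, filter: &Arc<dyn PhysicalExpr>) -> bool {
    let names: HashSet<&str> = child_schema
        .fields()
        .iter()
        .map(|f| f.name().as_str())
        .collect();
    let mut ok = true;
    filter
        .apply(|expr| {
            if let Some(column) = expr.as_any().downcast_ref::<Column>() {
                if !names.contains(column.name()) {
                    // Found a column the child does not provide: stop early.
                    ok = false;
                    return Ok(TreeNodeRecursion::Stop);
                }
            }
            Ok(TreeNodeRecursion::Continue)
        })
        .expect("infallible traversal");
    ok
}

fn main() {
    // Parent schema exposes columns a and b; the child only exposes a.
    let parent = Schema::new(vec![
        Field::new("a", DataType::Int32, false),
        Field::new("b", DataType::Utf8, false),
    ]);
    let child = Schema::new(vec![Field::new("a", DataType::Int32, false)]);

    let filter_on_a = col("a", &parent).unwrap();
    let filter_on_b = col("b", &parent).unwrap();

    assert!(all_filter_columns_exist(&child, &filter_on_a)); // pushdown allowed
    assert!(!all_filter_columns_exist(&child, &filter_on_b)); // column b missing -> blocked
}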
1 change: 1 addition & 0 deletions datafusion/physical-plan/src/lib.rs
@@ -92,6 +92,7 @@ pub mod streaming;
pub mod tree_node;
pub mod union;
pub mod unnest;
pub mod util;
pub mod windows;
pub mod work_table;
pub mod udaf {