From ef2ba6ca47e272a69d3694323380bbbce46bcc03 Mon Sep 17 00:00:00 2001 From: Peter Toth Date: Tue, 30 Sep 2025 19:11:00 +0200 Subject: [PATCH 1/3] [SPARK-53762][SQL] Add date and time conversions simplifier rule to optimizer --- .../expressions/datetimeExpressions.scala | 4 ++ .../sql/catalyst/optimizer/Optimizer.scala | 1 + .../sql/catalyst/optimizer/expressions.scala | 46 +++++++++++++ .../sql/catalyst/rules/RuleIdCollection.scala | 1 + .../sql/catalyst/trees/TreePatterns.scala | 1 + .../SimplifyDateTimeConversionsSuite.scala | 69 +++++++++++++++++++ 6 files changed, 122 insertions(+) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SimplifyDateTimeConversionsSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 418431c6c78e4..3948f8bd0dd6d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1014,6 +1014,8 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): DateFormatClass = copy(left = newLeft, right = newRight) + + final override def nodePatternsInternal(): Seq[TreePattern] = Seq(DATETIME) } /** @@ -1147,6 +1149,8 @@ case class GetTimestamp( newLeft: Expression, newRight: Expression): Expression = copy(left = newLeft, right = newRight) + + final override def nodePatternsInternal(): Seq[TreePattern] = Seq(DATETIME) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index fc65c24afcb8f..a2dced57c7153 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -146,6 +146,7 @@ abstract class Optimizer(catalogManager: CatalogManager) PruneFilters, SimplifyCasts, SimplifyCaseConversionExpressions, + SimplifyDateTimeConversions, RewriteCorrelatedScalarSubquery, RewriteLateralSubquery, EliminateSerialization, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 856236750f7bf..c78b744945ad6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -1142,6 +1142,52 @@ object SimplifyCaseConversionExpressions extends Rule[LogicalPlan] { } } +/** + * Removes unnecessary round-trip date and time conversions. 
+ */ +object SimplifyDateTimeConversions extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformWithPruning( + _.containsPattern(DATETIME), ruleId) { + case q: LogicalPlan => q.transformExpressionsUpWithPruning( + _.containsPattern(DATETIME), ruleId) { + case DateFormatClass( + GetTimestamp( + e @ DateFormatClass( + _, + pattern, + timeZoneId), + pattern2, + TimestampType, + _, + timeZoneId2, + _), + pattern3, + timeZoneId3) + if pattern.semanticEquals(pattern2) && pattern.semanticEquals(pattern3) + && timeZoneId == timeZoneId2 && timeZoneId == timeZoneId3 => + e + case GetTimestamp( + DateFormatClass( + e @ GetTimestamp( + _, + pattern, + TimestampType, + _, + timeZoneId, + _), + pattern2, + timeZoneId2), + pattern3, + TimestampType, + _, + timeZoneId3, + _) + if pattern.semanticEquals(pattern2) && pattern.semanticEquals(pattern3) + && timeZoneId == timeZoneId2 && timeZoneId == timeZoneId3 => + e + } + } +} /** * Combine nested [[Concat]] expressions. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala index fd839b4c2127f..b65de2ab0a1a9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala @@ -176,6 +176,7 @@ object RuleIdCollection { "org.apache.spark.sql.catalyst.optimizer.RewriteIntersectAll" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyBinaryComparison" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyCaseConversionExpressions" :: + "org.apache.spark.sql.catalyst.optimizer.SimplifyDateTimeConversions" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyCasts" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyConditionals" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyExtractValueOps" :: diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala index c35aa7403d767..cc05a38f857be 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala @@ -42,6 +42,7 @@ object TreePattern extends Enumeration { val COUNT: Value = Value val CREATE_NAMED_STRUCT: Value = Value val CURRENT_LIKE: Value = Value + val DATETIME: Value = Value val DYNAMIC_PRUNING_EXPRESSION: Value = Value val DYNAMIC_PRUNING_SUBQUERY: Value = Value val EXISTS_SUBQUERY = Value diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SimplifyDateTimeConversionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SimplifyDateTimeConversionsSuite.scala new file mode 100644 index 0000000000000..8e7013f2df038 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SimplifyDateTimeConversionsSuite.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions.{DateFormatClass, GetTimestamp} +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.types._ + +class SimplifyDateTimeConversionsSuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = + Batch("SimplifyDateTimeConversions", FixedPoint(50), SimplifyDateTimeConversions) :: Nil + } + + val testRelation = LocalRelation($"ts".timestamp, $"s".string) + + test("SPARK-53762: Remove DateFormat - GetTimestamp groups") { + val pattern = "yyyy-MM-dd" + + val df = DateFormatClass($"ts", pattern) + val gt = GetTimestamp($"s", pattern, TimestampType) + + val originalQuery = testRelation + .select( + DateFormatClass( + GetTimestamp( + df, + pattern, + TimestampType), + pattern) as "c1", + GetTimestamp( + DateFormatClass( + gt, + pattern), + pattern, + TimestampType) as "c2") + .analyze + + val optimized = Optimize.execute(originalQuery) + + val expected = testRelation + .select( + df as "c1", + gt as "c2") + .analyze + + comparePlans(optimized, expected) + } +} From f8b46371d593fc2f4cf23a3d6b85257cd81dd383 Mon Sep 17 00:00:00 2001 From: Peter Toth Date: Wed, 1 Oct 2025 10:34:36 +0200 Subject: [PATCH 2/3] fix indent, add comments --- .../sql/catalyst/optimizer/expressions.scala | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index c78b744945ad6..71eb3e5ea2bd7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -1147,15 +1147,14 @@ object SimplifyCaseConversionExpressions extends Rule[LogicalPlan] { */ object SimplifyDateTimeConversions extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan.transformWithPruning( - _.containsPattern(DATETIME), ruleId) { - case q: LogicalPlan => q.transformExpressionsUpWithPruning( _.containsPattern(DATETIME), ruleId) { + case q: LogicalPlan => q.transformExpressionsUpWithPruning( + _.containsPattern(DATETIME), ruleId) { + // Remove a string-to-timestamp conversion followed by a timestamp-to-string conversion if + // the original string is in the same format. case DateFormatClass( GetTimestamp( - e @ DateFormatClass( - _, - pattern, - timeZoneId), + e @ DateFormatClass(_, pattern, timeZoneId), pattern2, TimestampType, _, @@ -1166,15 +1165,12 @@ object SimplifyDateTimeConversions extends Rule[LogicalPlan] { if pattern.semanticEquals(pattern2) && pattern.semanticEquals(pattern3) && timeZoneId == timeZoneId2 && timeZoneId == timeZoneId3 => e + + // Remove a timestamp-to-string conversion followed by a string-to-timestamp conversion if + // the original timestamp was built with the same format. 
case GetTimestamp( DateFormatClass( - e @ GetTimestamp( - _, - pattern, - TimestampType, - _, - timeZoneId, - _), + e @ GetTimestamp(_, pattern, TimestampType, _, timeZoneId, _), pattern2, timeZoneId2), pattern3, From 018055dbc9ac73a294e057a30d9341a5222c75c0 Mon Sep 17 00:00:00 2001 From: Peter Toth Date: Wed, 1 Oct 2025 17:19:44 +0200 Subject: [PATCH 3/3] fix rule order --- .../org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala index b65de2ab0a1a9..b9f15f3f951c2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleIdCollection.scala @@ -176,9 +176,9 @@ object RuleIdCollection { "org.apache.spark.sql.catalyst.optimizer.RewriteIntersectAll" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyBinaryComparison" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyCaseConversionExpressions" :: - "org.apache.spark.sql.catalyst.optimizer.SimplifyDateTimeConversions" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyCasts" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyConditionals" :: + "org.apache.spark.sql.catalyst.optimizer.SimplifyDateTimeConversions" :: "org.apache.spark.sql.catalyst.optimizer.SimplifyExtractValueOps" :: "org.apache.spark.sql.catalyst.optimizer.TransposeWindow" :: "org.apache.spark.sql.catalyst.optimizer.UnwrapCastInBinaryComparison" :: Nil