From e9f204ae93061a862e4da52c128eaf3512a66c7b Mon Sep 17 00:00:00 2001
From: GideonPotok <g.potok4@gmail.com>
Date: Mon, 1 Apr 2024 22:29:28 +0800
Subject: [PATCH] [SPARK-46840][SQL][TESTS] Add `CollationBenchmark`

### What changes were proposed in this pull request?

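Adds `CollationBenchmark`, a micro-benchmark of the `equalsFunction`, `comparator` and `hashFunction` of the `UTF8_BINARY_LCASE`, `UNICODE`, `UTF8_BINARY` and `UNICODE_CI` collations, together with its JDK 17 and JDK 21 result files.

https://issues.apache.org/jira/browse/SPARK-46840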

[Collation Support in Spark.docx](https://github.com/apache/spark/files/14551958/Collation.Support.in.Spark.docx)

### Why are the changes needed?

Work is underway to introduce the concept of collation into Spark. A benchmarking suite is needed so that engineers can measure and address its performance impact.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

GHA 'Run Benchmarks' ran on this, for both JDK 17 and JDK 21

In addition, both the author and dbatomic tested locally on personal computers:
`build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark"`

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #45453 from GideonPotok/spark_46840.

Authored-by: GideonPotok <g.potok4@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
---
 .../CollationBenchmark-jdk21-results.txt      |  27 ++++
 .../benchmarks/CollationBenchmark-results.txt |  27 ++++
 .../benchmark/CollationBenchmark.scala        | 129 ++++++++++++++++++
 3 files changed, 183 insertions(+)
 create mode 100644 sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
 create mode 100644 sql/core/benchmarks/CollationBenchmark-results.txt
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala

diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
new file mode 100644
index 0000000000000..e1d7a42aac618
--- /dev/null
+++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
@@ -0,0 +1,27 @@
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - equalsFunction:  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                   29904          29937          47          0.0      299036.1       1.0X
+UNICODE                                              3886           3893          10          0.0       38863.0       7.7X
+UTF8_BINARY                                          3945           3945           0          0.0       39449.6       7.6X
+UNICODE_CI                                          45321          45330          12          0.0      453210.3       0.7X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - compareFunction:  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+---------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                    29807          29818          17          0.0      298065.0       1.0X
+UNICODE                                              45704          45723          27          0.0      457036.2       0.7X
+UTF8_BINARY                                           6460           6464           7          0.0       64597.9       4.6X
+UNICODE_CI                                           45498          45508          14          0.0      454977.6       0.7X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - hashFunction:  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 23553          23595          59          0.0      235531.8       1.0X
+UNICODE                                          197303         197309           8          0.0     1973034.1       0.1X
+UTF8_BINARY                                       14389          14391           2          0.0      143891.2       1.6X
+UNICODE_CI                                       166880         166885           7          0.0     1668799.5       0.1X
+
diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt
new file mode 100644
index 0000000000000..d8ebdfa695ff4
--- /dev/null
+++ b/sql/core/benchmarks/CollationBenchmark-results.txt
@@ -0,0 +1,27 @@
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - equalsFunction:  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                   34122          34152          42          0.0      341224.2       1.0X
+UNICODE                                              4520           4522           2          0.0       45201.8       7.5X
+UTF8_BINARY                                          4524           4526           2          0.0       45243.0       7.5X
+UNICODE_CI                                          52706          52711           7          0.0      527056.1       0.6X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - compareFunction:  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+---------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                    33467          33474          10          0.0      334671.7       1.0X
+UNICODE                                              51168          51168           1          0.0      511677.4       0.7X
+UTF8_BINARY                                           5561           5593          45          0.0       55610.9       6.0X
+UNICODE_CI                                           51929          51955          36          0.0      519291.8       0.6X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - hashFunction:  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 22079          22083           5          0.0      220786.7       1.0X
+UNICODE                                          177636         177709         103          0.0     1776363.9       0.1X
+UTF8_BINARY                                       11954          11956           3          0.0      119536.7       1.8X
+UNICODE_CI                                       158014         158038          35          0.0     1580135.7       0.1X
+
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
new file mode 100644
index 0000000000000..24e61052f5612
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.benchmark
+
+import scala.concurrent.duration._
+
+import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
+import org.apache.spark.sql.catalyst.util.CollationFactory
+import org.apache.spark.unsafe.types.UTF8String
+
+/**
+ * Benchmark to measure the performance of collation-aware string operations (equality,
+ * comparison and hashing). To run this benchmark:
+ * {{{
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class>
+ *        --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
+ *   2. build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark"
+ *   3. generate result:
+ *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>"
+ *      Results will be written to "benchmarks/CollationBenchmark-results.txt".
+ * }}}
+ *
+ * Each case runs a doubly nested loop over the input and repeats the innermost call
+ * `repetitions` times, i.e. `size * size * repetitions` calls per benchmark iteration.
+ */
+object CollationBenchmark extends BenchmarkBase {
+  private val collationTypes = Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI")
+
+  /** How many times the innermost call is repeated for each pair of loop positions. */
+  private val repetitions = 10
+
+  /** Number of calls made per iteration; passed to [[Benchmark]] as its cardinality. */
+  private def callsPerIteration(utf8Strings: Seq[UTF8String]): Long =
+    utf8Strings.size.toLong * utf8Strings.size * repetitions
+
+  /**
+   * Builds `n` strings by cycling through a fixed pool of short ASCII words that includes
+   * upper-, lower- and mixed-case variants of the same word. Returns an empty Seq if `n <= 0`.
+   */
+  def generateSeqInput(n: Long): Seq[UTF8String] = {
+    // Vector gives constant-time indexed access; the default Seq is a linked List.
+    val pool = Vector("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def",
+      "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx",
+      "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", "GHI", "ghi",
+      "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", "YZ")
+      .map(UTF8String.fromString)
+    // Reduce modulo the pool size before narrowing so a large index cannot wrap negative.
+    (0L until n).map(i => pool((i % pool.size).toInt))
+  }
+
+  /** Measures each collation's `equalsFunction` over every ordered pair of `utf8Strings`. */
+  def benchmarkUTFStringEquals(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = {
+    val benchmark = new Benchmark(
+      "collation unit benchmarks - equalsFunction",
+      callsPerIteration(utf8Strings),
+      warmupTime = 4.seconds,
+      output = output)
+    collationTypes.foreach { collationType =>
+      val collation = CollationFactory.fetchCollation(collationType)
+      benchmark.addCase(collationType) { _ =>
+        for (s1 <- utf8Strings; s <- utf8Strings; _ <- 0 until repetitions) {
+          collation.equalsFunction(s, s1).booleanValue()
+        }
+      }
+    }
+    benchmark.run()
+  }
+
+  /** Measures each collation's `comparator` over every ordered pair of `utf8Strings`. */
+  def benchmarkUTFStringCompare(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = {
+    val benchmark = new Benchmark(
+      "collation unit benchmarks - compareFunction",
+      callsPerIteration(utf8Strings),
+      warmupTime = 4.seconds,
+      output = output)
+    collationTypes.foreach { collationType =>
+      val collation = CollationFactory.fetchCollation(collationType)
+      benchmark.addCase(collationType) { _ =>
+        for (s1 <- utf8Strings; s <- utf8Strings; _ <- 0 until repetitions) {
+          collation.comparator.compare(s, s1)
+        }
+      }
+    }
+    benchmark.run()
+  }
+
+  /** Measures each collation's `hashFunction`; the outer loop only repeats the inner pass. */
+  def benchmarkUTFStringHashFunction(
+      collationTypes: Seq[String],
+      utf8Strings: Seq[UTF8String]): Unit = {
+    val benchmark = new Benchmark(
+      "collation unit benchmarks - hashFunction",
+      callsPerIteration(utf8Strings),
+      warmupTime = 4.seconds,
+      output = output)
+    collationTypes.foreach { collationType =>
+      val collation = CollationFactory.fetchCollation(collationType)
+      benchmark.addCase(collationType) { _ =>
+        for (_ <- utf8Strings; s <- utf8Strings; _ <- 0 until repetitions) {
+          collation.hashFunction.applyAsLong(s)
+        }
+      }
+    }
+    benchmark.run()
+  }
+
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    // The benchmarks only read the input, so one generated sequence is shared by all three.
+    val input = generateSeqInput(10000L)
+    benchmarkUTFStringEquals(collationTypes, input)
+    benchmarkUTFStringCompare(collationTypes, input)
+    benchmarkUTFStringHashFunction(collationTypes, input)
+  }
+}