From e9f204ae93061a862e4da52c128eaf3512a66c7b Mon Sep 17 00:00:00 2001 From: GideonPotok Date: Mon, 1 Apr 2024 22:29:28 +0800 Subject: [PATCH] [SPARK-46840][SQL][TESTS] Add `CollationBenchmark` ### What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-46840 [Collation Support in Spark.docx](https://github.com/apache/spark/files/14551958/Collation.Support.in.Spark.docx) ### Why are the changes needed? Work is underway to introduce collation concept into Spark. There is a need to build out a benchmarking suite to allow engineers to address performance impact. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? GHA 'Run Benchmarks' ran on this, for both JDK 17 and JDK 21 In addition, both the author and dbatomic tested locally on personal computers: `build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark"` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45453 from GideonPotok/spark_46840. 
Authored-by: GideonPotok Signed-off-by: Wenchen Fan --- .../CollationBenchmark-jdk21-results.txt | 27 ++++ .../benchmarks/CollationBenchmark-results.txt | 27 ++++ .../benchmark/CollationBenchmark.scala | 129 ++++++++++++++++++ 3 files changed, 183 insertions(+) create mode 100644 sql/core/benchmarks/CollationBenchmark-jdk21-results.txt create mode 100644 sql/core/benchmarks/CollationBenchmark-results.txt create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt new file mode 100644 index 0000000000000..e1d7a42aac618 --- /dev/null +++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt @@ -0,0 +1,27 @@ +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 29904 29937 47 0.0 299036.1 1.0X +UNICODE 3886 3893 10 0.0 38863.0 7.7X +UTF8_BINARY 3945 3945 0 0.0 39449.6 7.6X +UNICODE_CI 45321 45330 12 0.0 453210.3 0.7X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 29807 29818 17 0.0 298065.0 1.0X +UNICODE 45704 45723 27 0.0 457036.2 0.7X +UTF8_BINARY 6460 6464 7 0.0 64597.9 4.6X +UNICODE_CI 45498 45508 14 0.0 454977.6 0.7X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - hashFunction: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 23553 23595 59 0.0 235531.8 1.0X +UNICODE 197303 197309 8 0.0 1973034.1 0.1X +UTF8_BINARY 14389 14391 2 0.0 143891.2 1.6X +UNICODE_CI 166880 166885 7 0.0 1668799.5 0.1X + diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt new file mode 100644 index 0000000000000..d8ebdfa695ff4 --- /dev/null +++ b/sql/core/benchmarks/CollationBenchmark-results.txt @@ -0,0 +1,27 @@ +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 34122 34152 42 0.0 341224.2 1.0X +UNICODE 4520 4522 2 0.0 45201.8 7.5X +UTF8_BINARY 4524 4526 2 0.0 45243.0 7.5X +UNICODE_CI 52706 52711 7 0.0 527056.1 0.6X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 33467 33474 10 0.0 334671.7 1.0X +UNICODE 51168 51168 1 0.0 511677.4 0.7X +UTF8_BINARY 5561 5593 45 0.0 55610.9 6.0X +UNICODE_CI 51929 51955 36 0.0 519291.8 0.6X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ 
+UTF8_BINARY_LCASE 22079 22083 5 0.0 220786.7 1.0X +UNICODE 177636 177709 103 0.0 1776363.9 0.1X +UTF8_BINARY 11954 11956 3 0.0 119536.7 1.8X +UNICODE_CI 158014 158038 35 0.0 1580135.7 0.1X + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala new file mode 100644 index 0000000000000..24e61052f5612 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.benchmark + +import scala.concurrent.duration._ + +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.unsafe.types.UTF8String + +/** + * Benchmark to measure performance for comparisons between collated strings. To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class + * --jars , + * 2. build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark" + * 3. 
generate result:
 *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt \
 *        "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark"
 *      Results will be written to "benchmarks/CollationBenchmark-results.txt".
 * }}}
 */
object CollationBenchmark extends BenchmarkBase {
  private val collationTypes = Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI")

  // How many times each collation call is repeated for every input (pair) inside one case.
  private val repetitions = 10

  /**
   * Builds `n` benchmark inputs by cycling, in order, through a fixed sample of short strings
   * that differ mostly in letter case.
   *
   * @param n number of strings to generate; zero or a negative value yields an empty sequence
   * @return the sample strings repeated until `n` elements have been produced
   */
  def generateSeqInput(n: Long): Seq[UTF8String] = {
    // A Vector keeps the indexed lookup below cheap (the original Seq literal is a List).
    val samples = Vector("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def",
      "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx",
      "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", "GHI", "ghi",
      "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", "YZ")
      .map(UTF8String.fromString)
    val sampleCount = samples.size
    // Take the modulo while the index is still a Long; converting to Int first could
    // truncate a large index into a negative one.
    (0L until n).map(i => samples((i % sampleCount).toInt))
  }

  /**
   * Creates the [[Benchmark]] shared by the cases of one suite.
   *
   * `valuesPerIteration` is set to the number of collation calls a single case performs:
   * every input is visited once per input (`inputSize * inputSize`), and each visit repeats
   * the call `repetitions` times. The product is computed as a `Long` to avoid `Int` overflow.
   *
   * @param name      title printed above the result table
   * @param inputSize number of strings handed to the benchmark
   */
  private def newBenchmark(name: String, inputSize: Int): Benchmark =
    new Benchmark(
      name,
      inputSize.toLong * inputSize * repetitions,
      warmupTime = 4.seconds,
      output = output)

  /**
   * Benchmarks `equalsFunction` of every given collation over all ordered pairs of inputs.
   *
   * @param collationTypes collation names passed to `CollationFactory.fetchCollation`
   * @param utf8Strings    strings that are compared against each other
   */
  def benchmarkUTFStringEquals(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = {
    val benchmark = newBenchmark("collation unit benchmarks - equalsFunction", utf8Strings.size)
    collationTypes.foreach { collationType =>
      // Resolved outside the case closure so the lookup is not part of the measured work.
      val collation = CollationFactory.fetchCollation(collationType)
      benchmark.addCase(collationType) { _ =>
        utf8Strings.foreach { right =>
          utf8Strings.foreach { left =>
            // `until` (exclusive) runs exactly `repetitions` times.
            (0 until repetitions).foreach { _ =>
              collation.equalsFunction(left, right).booleanValue()
            }
          }
        }
      }
    }
    benchmark.run()
  }

  /**
   * Benchmarks `comparator.compare` of every given collation over all ordered pairs of inputs.
   *
   * @param collationTypes collation names passed to `CollationFactory.fetchCollation`
   * @param utf8Strings    strings that are compared against each other
   */
  def benchmarkUTFStringCompare(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = {
    val benchmark = newBenchmark("collation unit benchmarks - compareFunction", utf8Strings.size)
    collationTypes.foreach { collationType =>
      // Resolved outside the case closure so the lookup is not part of the measured work.
      val collation = CollationFactory.fetchCollation(collationType)
      benchmark.addCase(collationType) { _ =>
        utf8Strings.foreach { right =>
          utf8Strings.foreach { left =>
            // `until` (exclusive) runs exactly `repetitions` times.
            (0 until repetitions).foreach { _ =>
              collation.comparator.compare(left, right)
            }
          }
        }
      }
    }
    benchmark.run()
  }

  /**
   * Benchmarks `hashFunction` of every given collation.
   *
   * The whole input is hashed once per input element, so a case performs the same number of
   * calls as the pairwise equals/compare benchmarks.
   *
   * @param collationTypes collation names passed to `CollationFactory.fetchCollation`
   * @param utf8Strings    strings to hash
   */
  def benchmarkUTFStringHashFunction(
      collationTypes: Seq[String],
      utf8Strings: Seq[UTF8String]): Unit = {
    val benchmark = newBenchmark("collation unit benchmarks - hashFunction", utf8Strings.size)
    collationTypes.foreach { collationType =>
      // Resolved outside the case closure so the lookup is not part of the measured work.
      val collation = CollationFactory.fetchCollation(collationType)
      benchmark.addCase(collationType) { _ =>
        // The outer pass only repeats the inner one; its element is intentionally unused.
        utf8Strings.foreach { _ =>
          utf8Strings.foreach { s =>
            // `until` (exclusive) runs exactly `repetitions` times.
            (0 until repetitions).foreach { _ =>
              collation.hashFunction.applyAsLong(s)
            }
          }
        }
      }
    }
    benchmark.run()
  }

  /**
   * Runs the equals, compare and hash benchmarks for every collation in `collationTypes`.
   *
   * @param mainArgs command-line arguments; not used by this suite
   */
  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    // The generated input is never mutated, so one instance serves all three suites.
    val inputs = generateSeqInput(10000L)
    benchmarkUTFStringEquals(collationTypes, inputs)
    benchmarkUTFStringCompare(collationTypes, inputs)
    benchmarkUTFStringHashFunction(collationTypes, inputs)
  }
}