-
Notifications
You must be signed in to change notification settings - Fork 28.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-46840][SQL][TESTS] Add
CollationBenchmark
### What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-46840 [Collation Support in Spark.docx](https://github.com/apache/spark/files/14551958/Collation.Support.in.Spark.docx) ### Why are the changes needed? Work is underway to introduce the collation concept into Spark. There is a need to build out a benchmarking suite to allow engineers to address the performance impact. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? GHA 'Run Benchmarks' ran on this, for both JDK 17 and JDK 21. In addition, both the author and dbatomic tested locally on personal computers: `build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark"` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45453 from GideonPotok/spark_46840. Authored-by: GideonPotok <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
- Loading branch information
1 parent
72c619e
commit e9f204a
Showing
3 changed files
with
183 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure | ||
AMD EPYC 7763 64-Core Processor | ||
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative | ||
-------------------------------------------------------------------------------------------------------------------------- | ||
UTF8_BINARY_LCASE 29904 29937 47 0.0 299036.1 1.0X | ||
UNICODE 3886 3893 10 0.0 38863.0 7.7X | ||
UTF8_BINARY 3945 3945 0 0.0 39449.6 7.6X | ||
UNICODE_CI 45321 45330 12 0.0 453210.3 0.7X | ||
|
||
OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure | ||
AMD EPYC 7763 64-Core Processor | ||
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative | ||
--------------------------------------------------------------------------------------------------------------------------- | ||
UTF8_BINARY_LCASE 29807 29818 17 0.0 298065.0 1.0X | ||
UNICODE 45704 45723 27 0.0 457036.2 0.7X | ||
UTF8_BINARY 6460 6464 7 0.0 64597.9 4.6X | ||
UNICODE_CI 45498 45508 14 0.0 454977.6 0.7X | ||
|
||
OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure | ||
AMD EPYC 7763 64-Core Processor | ||
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative | ||
------------------------------------------------------------------------------------------------------------------------ | ||
UTF8_BINARY_LCASE 23553 23595 59 0.0 235531.8 1.0X | ||
UNICODE 197303 197309 8 0.0 1973034.1 0.1X | ||
UTF8_BINARY 14389 14391 2 0.0 143891.2 1.6X | ||
UNICODE_CI 166880 166885 7 0.0 1668799.5 0.1X | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure | ||
AMD EPYC 7763 64-Core Processor | ||
collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative | ||
-------------------------------------------------------------------------------------------------------------------------- | ||
UTF8_BINARY_LCASE 34122 34152 42 0.0 341224.2 1.0X | ||
UNICODE 4520 4522 2 0.0 45201.8 7.5X | ||
UTF8_BINARY 4524 4526 2 0.0 45243.0 7.5X | ||
UNICODE_CI 52706 52711 7 0.0 527056.1 0.6X | ||
|
||
OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure | ||
AMD EPYC 7763 64-Core Processor | ||
collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative | ||
--------------------------------------------------------------------------------------------------------------------------- | ||
UTF8_BINARY_LCASE 33467 33474 10 0.0 334671.7 1.0X | ||
UNICODE 51168 51168 1 0.0 511677.4 0.7X | ||
UTF8_BINARY 5561 5593 45 0.0 55610.9 6.0X | ||
UNICODE_CI 51929 51955 36 0.0 519291.8 0.6X | ||
|
||
OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure | ||
AMD EPYC 7763 64-Core Processor | ||
collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative | ||
------------------------------------------------------------------------------------------------------------------------ | ||
UTF8_BINARY_LCASE 22079 22083 5 0.0 220786.7 1.0X | ||
UNICODE 177636 177709 103 0.0 1776363.9 0.1X | ||
UTF8_BINARY 11954 11956 3 0.0 119536.7 1.8X | ||
UNICODE_CI 158014 158038 35 0.0 1580135.7 0.1X | ||
|
129 changes: 129 additions & 0 deletions
129
sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.spark.sql.execution.benchmark | ||
|
||
import scala.concurrent.duration._ | ||
|
||
import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} | ||
import org.apache.spark.sql.catalyst.util.CollationFactory | ||
import org.apache.spark.unsafe.types.UTF8String | ||
|
||
/**
 * Benchmark to measure the performance of collation-aware string operations (equality,
 * comparison and hashing) for the collations listed in `collationTypes`. To run this benchmark:
 * {{{
 *   1. without sbt:
 *      bin/spark-submit --class <this class>
 *        --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
 *   2. build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark"
 *   3. generate result:
 *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>"
 *      Results will be written to "benchmarks/CollationBenchmark-results.txt".
 * }}}
 */
object CollationBenchmark extends BenchmarkBase {
  private val collationTypes = Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI")

  /** How many times every collation operation is repeated for a single input (pair). */
  private final val repetitions = 10

  /**
   * Builds a sequence of `n` strings by cycling through a fixed set of short ASCII samples
   * that differ mostly in letter case.
   *
   * @param n number of strings to generate
   * @return `n` UTF8Strings; element `i` is sample number `i % <number of samples>`
   */
  def generateSeqInput(n: Long): Seq[UTF8String] = {
    // A Vector gives constant-time indexed access for the repeated lookups below.
    val input = Vector("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def",
      "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx",
      "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", "GHI", "ghi",
      "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", "YZ")
      .map(UTF8String.fromString)
    // Reduce modulo the sample count before narrowing, so the index is always in range.
    (0L until n).map(i => input((i % input.size).toInt))
  }

  /**
   * Creates the [[Benchmark]] shared by all cases of one collation operation.
   *
   * Every case visits all ordered pairs of `utf8Strings` and repeats the measured operation
   * `repetitions` times per pair, so that product is what is reported as the number of values
   * per iteration. The `Rate` and `Per Row` columns of the result table are expected to be
   * derived from this count, so it has to match the work actually done.
   *
   * @param name        title printed above the result table
   * @param utf8Strings input strings the cases will iterate over
   */
  private def newBenchmark(name: String, utf8Strings: Seq[UTF8String]): Benchmark = {
    val numStrings = utf8Strings.size.toLong
    new Benchmark(
      name,
      numStrings * numStrings * repetitions,
      warmupTime = 4.seconds,
      output = output)
  }

  /**
   * Measures `equalsFunction` of every given collation over all ordered pairs of the input.
   *
   * @param collationTypes names of the collations to benchmark, one case per name
   * @param utf8Strings    strings that are compared against each other
   */
  def benchmarkUTFStringEquals(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = {
    val benchmark = newBenchmark("collation unit benchmarks - equalsFunction", utf8Strings)
    collationTypes.foreach { collationType =>
      val collation = CollationFactory.fetchCollation(collationType)
      // The loops are kept inline (not routed through a shared lambda) so that each
      // benchmark owns its call sites.
      benchmark.addCase(collationType) { _ =>
        utf8Strings.foreach { s1 =>
          utf8Strings.foreach { s =>
            (0 until repetitions).foreach { _ =>
              collation.equalsFunction(s, s1).booleanValue()
            }
          }
        }
      }
    }
    benchmark.run()
  }

  /**
   * Measures `comparator.compare` of every given collation over all ordered pairs of the input.
   *
   * @param collationTypes names of the collations to benchmark, one case per name
   * @param utf8Strings    strings that are compared against each other
   */
  def benchmarkUTFStringCompare(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = {
    val benchmark = newBenchmark("collation unit benchmarks - compareFunction", utf8Strings)
    collationTypes.foreach { collationType =>
      val collation = CollationFactory.fetchCollation(collationType)
      benchmark.addCase(collationType) { _ =>
        utf8Strings.foreach { s1 =>
          utf8Strings.foreach { s =>
            // The comparison result is deliberately discarded; only the call is timed.
            (0 until repetitions).foreach { _ =>
              collation.comparator.compare(s, s1)
            }
          }
        }
      }
    }
    benchmark.run()
  }

  /**
   * Measures `hashFunction` of every given collation.
   *
   * @param collationTypes names of the collations to benchmark, one case per name
   * @param utf8Strings    strings to hash
   */
  def benchmarkUTFStringHashFunction(
      collationTypes: Seq[String],
      utf8Strings: Seq[UTF8String]): Unit = {
    val benchmark = newBenchmark("collation unit benchmarks - hashFunction", utf8Strings)
    collationTypes.foreach { collationType =>
      val collation = CollationFactory.fetchCollation(collationType)
      benchmark.addCase(collationType) { _ =>
        // The outer pass ignores its element: it only repeats the inner pass once per input
        // string, so the amount of work matches the pairwise equals/compare benchmarks.
        utf8Strings.foreach { _ =>
          utf8Strings.foreach { s =>
            (0 until repetitions).foreach { _ =>
              collation.hashFunction.applyAsLong(s)
            }
          }
        }
      }
    }
    benchmark.run()
  }

  /** Runs the equals, compare and hash benchmarks over the same generated input. */
  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    // The generated sequence is immutable and deterministic, so one copy serves all three.
    val input = generateSeqInput(10000L)
    benchmarkUTFStringEquals(collationTypes, input)
    benchmarkUTFStringCompare(collationTypes, input)
    benchmarkUTFStringHashFunction(collationTypes, input)
  }
}