Skip to content

Commit

Permalink
[SPARK-49611][SQL] Introduce TVF collations() & remove the `SHOW CO…
Browse files Browse the repository at this point in the history
…LLATIONS` command

### What changes were proposed in this pull request?
This PR aims to:
- introduce `TVF` `collations()`.
- remove the `SHOW COLLATIONS` command.

### Why are the changes needed?
Based on cloud-fan's suggestion: #47364 (comment)
Exposing collations as a table-valued function lets users apply ordinary SQL to the result, such as filtering or querying by `LANGUAGE` or `COUNTRY`. For example:
```sql
SELECT * FROM collations() WHERE LANGUAGE like '%Chinese%';
```

### Does this PR introduce _any_ user-facing change?
Yes, it provides a new TVF `collations()` for end users.

### How was this patch tested?
- Add new UT.
- Pass GA.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #48087 from panbingkun/SPARK-49611.

Lead-authored-by: panbingkun <[email protected]>
Co-authored-by: panbingkun <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
2 people authored and cloud-fan committed Sep 16, 2024
1 parent 738db07 commit 2113f10
Show file tree
Hide file tree
Showing 13 changed files with 101 additions and 122 deletions.
1 change: 0 additions & 1 deletion docs/sql-ref-ansi-compliance.md
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,6 @@ Below is a list of all the keywords in Spark SQL.
|CODEGEN|non-reserved|non-reserved|non-reserved|
|COLLATE|reserved|non-reserved|reserved|
|COLLATION|reserved|non-reserved|reserved|
|COLLATIONS|reserved|non-reserved|reserved|
|COLLECTION|non-reserved|non-reserved|non-reserved|
|COLUMN|reserved|non-reserved|reserved|
|COLUMNS|non-reserved|non-reserved|non-reserved|
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ CLUSTERED: 'CLUSTERED';
CODEGEN: 'CODEGEN';
COLLATE: 'COLLATE';
COLLATION: 'COLLATION';
COLLATIONS: 'COLLATIONS';
COLLECTION: 'COLLECTION';
COLUMN: 'COLUMN';
COLUMNS: 'COLUMNS';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,6 @@ statement
| SHOW PARTITIONS identifierReference partitionSpec? #showPartitions
| SHOW identifier? FUNCTIONS ((FROM | IN) ns=identifierReference)?
(LIKE? (legacy=multipartIdentifier | pattern=stringLit))? #showFunctions
| SHOW COLLATIONS (LIKE? pattern=stringLit)? #showCollations
| SHOW CREATE TABLE identifierReference (AS SERDE)? #showCreateTable
| SHOW CURRENT namespace #showCurrentNamespace
| SHOW CATALOGS (LIKE? pattern=stringLit)? #showCatalogs
Expand Down Expand Up @@ -1868,7 +1867,6 @@ nonReserved
| CODEGEN
| COLLATE
| COLLATION
| COLLATIONS
| COLLECTION
| COLUMN
| COLUMNS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1158,6 +1158,7 @@ object TableFunctionRegistry {
generator[PosExplode]("posexplode"),
generator[PosExplode]("posexplode_outer", outer = true),
generator[Stack]("stack"),
generator[Collations]("collations"),
generator[SQLKeywords]("sql_keywords"),
generator[VariantExplode]("variant_explode"),
generator[VariantExplode]("variant_explode_outer", outer = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import java.util.concurrent.TimeUnit
import javax.annotation.concurrent.GuardedBy

import scala.collection.mutable
import scala.jdk.CollectionConverters.CollectionHasAsScala
import scala.util.{Failure, Success, Try}

import com.google.common.cache.{Cache, CacheBuilder}
Expand All @@ -40,8 +39,7 @@ import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Expression, Expre
import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, SubqueryAlias, View}
import org.apache.spark.sql.catalyst.trees.CurrentOrigin
import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, CollationFactory, StringUtils}
import org.apache.spark.sql.catalyst.util.CollationFactory.CollationMeta
import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, StringUtils}
import org.apache.spark.sql.connector.catalog.CatalogManager
import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
Expand Down Expand Up @@ -1901,17 +1899,6 @@ class SessionCatalog(
.filter(isTemporaryFunction)
}

/**
 * Lists the built-in collations whose names match the given pattern.
 * When no pattern is supplied, "*" is used as the pattern.
 */
def listCollations(pattern: Option[String]): Seq[CollationMeta] = {
  val identifiers = CollationFactory.listCollations().asScala.toSeq
  val namePattern = pattern.getOrElse("*")
  // Resolve the set of matching names first, then keep identifiers in their original order.
  val matchingNames = StringUtils.filterPattern(identifiers.map(_.getName), namePattern).toSet
  for {
    identifier <- identifiers
    if matchingNames.contains(identifier.getName)
  } yield CollationFactory.loadCollationMeta(identifier)
}

// -----------------
// | Other methods |
// -----------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.apache.spark.sql.catalyst.expressions

import scala.collection.mutable
import scala.jdk.CollectionConverters.CollectionHasAsScala

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
Expand All @@ -28,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.catalyst.plans.logical.{FunctionSignature, InputParameter}
import org.apache.spark.sql.catalyst.trees.TreePattern.{GENERATOR, TreePattern}
import org.apache.spark.sql.catalyst.util.{ArrayData, MapData}
import org.apache.spark.sql.catalyst.util.{ArrayData, CollationFactory, MapData}
import org.apache.spark.sql.catalyst.util.SQLKeywordUtils._
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
import org.apache.spark.sql.internal.SQLConf
Expand Down Expand Up @@ -618,3 +619,44 @@ case class SQLKeywords() extends LeafExpression with Generator with CodegenFallb

override def prettyName: String = "sql_keywords"
}

@ExpressionDescription(
usage = """_FUNC_() - Get all of the Spark SQL string collations""",
examples = """
Examples:
> SELECT * FROM _FUNC_() WHERE NAME = 'UTF8_BINARY';
SYSTEM BUILTIN UTF8_BINARY NULL NULL ACCENT_SENSITIVE CASE_SENSITIVE NO_PAD NULL
""",
since = "4.0.0",
group = "generator_funcs")
// Generator behind the `collations` table function: produces one output row for every
// collation identifier returned by CollationFactory.listCollations(), using the
// CollationMeta loaded for that identifier.
case class Collations() extends LeafExpression with Generator with CodegenFallback {
  // Output columns, in order, each paired with its nullability. All columns are strings;
  // only LANGUAGE, COUNTRY and ICU_VERSION are declared nullable.
  override def elementSchema: StructType =
    Seq(
      "CATALOG" -> false,
      "SCHEMA" -> false,
      "NAME" -> false,
      "LANGUAGE" -> true,
      "COUNTRY" -> true,
      "ACCENT_SENSITIVITY" -> false,
      "CASE_SENSITIVITY" -> false,
      "PAD_ATTRIBUTE" -> false,
      "ICU_VERSION" -> true
    ).foldLeft(new StructType()) { case (schema, (column, isNullable)) =>
      schema.add(column, StringType, nullable = isNullable)
    }

  // The input row is ignored; the output depends only on the collations listed by the factory.
  override def eval(input: InternalRow): IterableOnce[InternalRow] = {
    def text(value: String): UTF8String = UTF8String.fromString(value)

    CollationFactory.listCollations().asScala.map { identifier =>
      val meta = CollationFactory.loadCollationMeta(identifier)
      // The boolean sensitivity flags are rendered as fixed labels.
      val accentLabel =
        if (meta.accentSensitivity) "ACCENT_SENSITIVE" else "ACCENT_INSENSITIVE"
      val caseLabel =
        if (meta.caseSensitivity) "CASE_SENSITIVE" else "CASE_INSENSITIVE"
      InternalRow(
        text(meta.catalog),
        text(meta.schema),
        text(meta.collationName),
        text(meta.language),
        text(meta.country),
        text(accentLabel),
        text(caseLabel),
        text(meta.padAttribute),
        text(meta.icuVersion))
    }
  }

  override def prettyName: String = "collations"
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ CLOSE
COALESCE
COLLATE
COLLATION
COLLATIONS
COLLECT
COLUMN
COMMIT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1096,16 +1096,4 @@ class SparkSqlAstBuilder extends AstBuilder {
withIdentClause(ctx.identifierReference(), UnresolvedNamespace(_)),
cleanedProperties)
}

/**
 * Builds a [[ShowCollationsCommand]] from a parsed statement of the form:
 * {{{
 *   SHOW COLLATIONS (LIKE? pattern=stringLit)?;
 * }}}
 * The pattern is optional and is passed on as None when the statement has none.
 */
override def visitShowCollations(ctx: ShowCollationsContext): LogicalPlan = withOrigin(ctx) {
  ShowCollationsCommand(Option(ctx.pattern).map(literal => string(visitStringLit(literal))))
}
}

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ CLUSTERED false
CODEGEN false
COLLATE true
COLLATION true
COLLATIONS true
COLLECTION false
COLUMN true
COLUMNS false
Expand Down Expand Up @@ -384,7 +383,6 @@ CAST
CHECK
COLLATE
COLLATION
COLLATIONS
COLUMN
CONSTRAINT
CREATE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ CLUSTERED false
CODEGEN false
COLLATE false
COLLATION false
COLLATIONS false
COLLECTION false
COLUMN false
COLUMNS false
Expand Down
79 changes: 55 additions & 24 deletions sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1625,38 +1625,38 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
}
}

test("show collations") {
assert(sql("SHOW COLLATIONS").collect().length >= 562)
test("TVF collations()") {
assert(sql("SELECT * FROM collations()").collect().length >= 562)

// verify that the output ordering is as expected (UTF8_BINARY, UTF8_LCASE, etc.)
val df = sql("SHOW COLLATIONS").limit(10)
val df = sql("SELECT * FROM collations() limit 10")
checkAnswer(df,
Seq(Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null),
Row("SYSTEM", "BUILTIN", "UTF8_LCASE", null, null,
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", null),
Row("SYSTEM", "BUILTIN", "UNICODE", "", "",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_AI", "", "",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_CI", "", "",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_CI_AI", "", "",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af", "Afrikaans", "",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_AI", "Afrikaans", "",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_CI", "Afrikaans", "",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_CI_AI", "Afrikaans", "",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SHOW COLLATIONS LIKE '*UTF8_BINARY*'"),
Row("SYSTEM", "BUILTIN", "UTF8_LCASE", null, null,
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", null),
Row("SYSTEM", "BUILTIN", "UNICODE", "", "",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_AI", "", "",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_CI", "", "",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "UNICODE_CI_AI", "", "",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af", "Afrikaans", "",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_AI", "Afrikaans", "",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_CI", "Afrikaans", "",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "af_CI_AI", "Afrikaans", "",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE '%UTF8_BINARY%'"),
Row("SYSTEM", "BUILTIN", "UTF8_BINARY", null, null,
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", null))

checkAnswer(sql("SHOW COLLATIONS '*zh_Hant_HKG*'"),
checkAnswer(sql("SELECT * FROM collations() WHERE NAME LIKE '%zh_Hant_HKG%'"),
Seq(Row("SYSTEM", "BUILTIN", "zh_Hant_HKG", "Chinese", "Hong Kong SAR China",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_AI", "Chinese", "Hong Kong SAR China",
Expand All @@ -1665,5 +1665,36 @@ class CollationSuite extends DatasourceV2SQLBase with AdaptiveSparkPlanHelper {
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hant_HKG_CI_AI", "Chinese", "Hong Kong SAR China",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SELECT * FROM collations() WHERE COUNTRY = 'Singapore'"),
Seq(Row("SYSTEM", "BUILTIN", "zh_Hans_SGP", "Chinese", "Singapore",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_AI", "Chinese", "Singapore",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI", "Chinese", "Singapore",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "zh_Hans_SGP_CI_AI", "Chinese", "Singapore",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SELECT * FROM collations() WHERE LANGUAGE = 'English' " +
"and COUNTRY = 'United States'"),
Seq(Row("SYSTEM", "BUILTIN", "en_USA", "English", "United States",
"ACCENT_SENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "en_USA_AI", "English", "United States",
"ACCENT_SENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "en_USA_CI", "English", "United States",
"ACCENT_INSENSITIVE", "CASE_SENSITIVE", "NO_PAD", "75.1.0.0"),
Row("SYSTEM", "BUILTIN", "en_USA_CI_AI", "English", "United States",
"ACCENT_INSENSITIVE", "CASE_INSENSITIVE", "NO_PAD", "75.1.0.0")))

checkAnswer(sql("SELECT NAME, LANGUAGE, ACCENT_SENSITIVITY, CASE_SENSITIVITY " +
"FROM collations() WHERE COUNTRY = 'United States'"),
Seq(Row("en_USA", "English", "ACCENT_SENSITIVE", "CASE_SENSITIVE"),
Row("en_USA_AI", "English", "ACCENT_SENSITIVE", "CASE_INSENSITIVE"),
Row("en_USA_CI", "English", "ACCENT_INSENSITIVE", "CASE_SENSITIVE"),
Row("en_USA_CI_AI", "English", "ACCENT_INSENSITIVE", "CASE_INSENSITIVE")))

checkAnswer(sql("SELECT NAME FROM collations() WHERE ICU_VERSION is null"),
Seq(Row("UTF8_BINARY"), Row("UTF8_LCASE")))
}
}
Loading

0 comments on commit 2113f10

Please sign in to comment.