[KYUUBI #6691] A new Spark SQL command to merge small files #6695

Status: Closed. Wants to merge 35 commits.

Commits (35):
532369d
1. involve a compact table command to merge small files
gabry-lab Sep 11, 2024
d2cbcc2
parser tests pass
gabry-lab Sep 11, 2024
1a92be0
reformat all codes
gabry-lab Sep 11, 2024
906b47e
SparkPlan resolved successfully
gabry-lab Sep 11, 2024
4e9d8ca
reformat
gabry-lab Sep 11, 2024
350326e
adding unit test to recover command
gabry-lab Sep 12, 2024
0d30887
compact table execution tests pass
gabry-lab Sep 12, 2024
a52d501
remove unnecessary comments
gabry-lab Sep 12, 2024
7b10692
recover compact table command tests pass
gabry-lab Sep 13, 2024
0ef55ff
more unit tests
gabry-lab Sep 13, 2024
f04170b
fix scala style issue
gabry-lab Sep 13, 2024
02f6303
reduce message count
gabry-lab Sep 13, 2024
cc0ecce
involve createToScalaConverter
gabry-lab Sep 13, 2024
32276b5
remove unused import
gabry-lab Sep 13, 2024
c48b167
remove unused import
gabry-lab Sep 13, 2024
b4fd2ad
remove unnecessary comment & reformat
gabry-lab Sep 14, 2024
79e4e94
involve SPECULATION_ENABLED_SYNONYM
gabry-lab Sep 14, 2024
257dfd6
involve createRandomTable
gabry-lab Sep 15, 2024
66faca4
try to catch unknown Row
gabry-lab Sep 15, 2024
fd5a3c4
use Seq instead of WrappedArray
gabry-lab Sep 15, 2024
3775c95
compile on scala-2.13 successfully
gabry-lab Sep 15, 2024
dccc23a
remove unused import
gabry-lab Sep 15, 2024
62c1044
ByteUnit.MiB.toBytes to fix default target size
gabry-lab Sep 15, 2024
42d1fc0
rename compact-table.md to docs
gabry-lab Sep 16, 2024
232407a
remove unused comments
gabry-lab Sep 16, 2024
5a34b97
support orc
gabry-lab Sep 16, 2024
c042dfa
involve toJavaList to compile on scala 2.13
gabry-lab Sep 16, 2024
d35f7d4
add bzip2 unit tests
gabry-lab Sep 16, 2024
8b637de
spotless:apply
gabry-lab Sep 16, 2024
7fbc805
fix getCodecFromFilePath for orc
gabry-lab Sep 16, 2024
59f0b99
reformat
gabry-lab Sep 16, 2024
ab9b674
support more codec
gabry-lab Sep 18, 2024
22f0b79
remove unused util class, close opened stream in finally block
gabry-lab Sep 18, 2024
8cc2390
rollback regardless of the success or failure of the command
gabry-lab Sep 18, 2024
fd39f66
reformat
gabry-lab Sep 18, 2024
57 changes: 57 additions & 0 deletions docs/extensions/engines/spark/compact-table.md
@@ -0,0 +1,57 @@
<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-->

# Compact Table Command Support

This is a new Spark SQL command that compacts the small files in a table into larger files of a target size, such as 128MB.
After the compaction finishes, it creates a temporary view for querying the details of the compacted files.

Instead of reading and rewriting all the data in a table, it merges the files at the binary level,
which makes it more efficient.

## Syntax

### Compact table

```sparksql
compact table table_name [INTO ${targetFileSize} ${targetFileSizeUnit}] [ cleanup | retain | list ]
-- targetFileSizeUnit can be 'b', 'k', 'm', 'g', 't', 'p'
-- cleanup (default): delete the compact staging folders, which contain the original small files
-- retain: keep the compact staging folders (useful for testing); the table can be recovered from the staging data
-- list: only show the merge result without actually running the compaction
```

### Recover table

```sparksql
recover compact table table_name
-- recover the compacted table: restore the small files from the staging folder to their original location
```

## Example

The following commands compact the small files in the table `default.small_files_table` into 128MB files and create a
temporary view `v_merged_files` for querying the details of the compacted files.

```sparksql
set spark.sql.shuffle.partitions=32;

compact table default.small_files_table;

select * from v_merged_files;
```
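
The same statements can also be issued programmatically through a `SparkSession`. The sketch below is illustrative: it assumes the Kyuubi extension jar is on the classpath and that the extension is registered via `spark.sql.extensions`; the session settings shown are examples, not requirements.

```scala
import org.apache.spark.sql.SparkSession

// Sketch: run the compact command from Scala and inspect the merged-file details.
// Assumes the Kyuubi Spark SQL extension jar is available to the application.
val spark = SparkSession.builder()
  .appName("compact-small-files")
  .config("spark.sql.extensions", "org.apache.kyuubi.sql.KyuubiSparkSQLExtension")
  .getOrCreate()

spark.sql("set spark.sql.shuffle.partitions=32")
spark.sql("compact table default.small_files_table")
spark.sql("select * from v_merged_files").show(truncate = false)
```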

7 changes: 7 additions & 0 deletions extensions/spark/kyuubi-extension-spark-3-5/pom.xml
@@ -99,6 +99,13 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-avro_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-runtime</artifactId>
@@ -51,6 +51,10 @@ singleStatement

statement
: OPTIMIZE multipartIdentifier whereClause? zorderClause #optimizeZorder
| COMPACT TABLE multipartIdentifier
(INTO targetFileSize=INTEGER_VALUE FILE_SIZE_UNIT_LITERAL)?
(action=compactAction)? #compactTable
| RECOVER COMPACT TABLE multipartIdentifier #recoverCompactTable
| .*? #passThrough
;

@@ -62,6 +66,9 @@
: ZORDER BY order+=multipartIdentifier (',' order+=multipartIdentifier)*
;

compactAction
: CLEANUP | RETAIN | LIST
;
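
// For illustration, inputs matched by the new rules:
//   COMPACT TABLE db.tbl                      (default action: cleanup)
//   COMPACT TABLE db.tbl INTO 128 M RETAIN    (explicit target size, keep the staging folder)
//   RECOVER COMPACT TABLE db.tbl              (restore the original small files)
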
// We don't have an expression rule in our grammar here, so we just grab the tokens and defer
// parsing them to later.
predicateToken
@@ -101,6 +108,12 @@ nonReserved
| ZORDER
;

COMPACT: 'COMPACT';
INTO: 'INTO';
RECOVER: 'RECOVER';
CLEANUP: 'CLEANUP';
RETAIN: 'RETAIN';
LIST: 'LIST';
AND: 'AND';
BY: 'BY';
FALSE: 'FALSE';
@@ -115,7 +128,9 @@ WHERE: 'WHERE';
ZORDER: 'ZORDER';

MINUS: '-';

FILE_SIZE_UNIT_LITERAL:
'M' | 'MB'
;
BIGINT_LITERAL
: DIGIT+ 'L'
;
@@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.parser.ParserUtils.withOrigin
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project, Sort}

import org.apache.kyuubi.sql.KyuubiSparkSQLParser._
import org.apache.kyuubi.sql.compact.{CompactTableOptions, CompactTableStatement, RecoverCompactTableStatement}
import org.apache.kyuubi.sql.zorder.{OptimizeZorderStatement, Zorder}

class KyuubiSparkSQLAstBuilder extends KyuubiSparkSQLBaseVisitor[AnyRef] with SQLConfHelper {
@@ -127,6 +128,20 @@ class KyuubiSparkSQLAstBuilder extends KyuubiSparkSQLBaseVisitor[AnyRef] with SQLConfHelper {
UnparsedPredicateOptimize(tableIdent, predicate, orderExpr)
}

override def visitCompactTable(ctx: CompactTableContext): CompactTableStatement =
withOrigin(ctx) {
val tableParts = visitMultipartIdentifier(ctx.multipartIdentifier())
val targetFileSize = Option(ctx.targetFileSize).map(_.getText.toLong)
val action = Option(ctx.action).map(_.getText)
CompactTableStatement(tableParts, targetFileSize, CompactTableOptions(action))
}

override def visitRecoverCompactTable(ctx: RecoverCompactTableContext)
: RecoverCompactTableStatement = withOrigin(ctx) {
val tableParts = visitMultipartIdentifier(ctx.multipartIdentifier())
RecoverCompactTableStatement(tableParts)
}
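
// For illustration: parsing
//   COMPACT TABLE db.tbl INTO 128 M RETAIN
// is expected to produce
//   CompactTableStatement(Seq("db", "tbl"), Some(128L), CompactTableOptions.RetainStagingFolder)
// while RECOVER COMPACT TABLE db.tbl produces RecoverCompactTableStatement(Seq("db", "tbl")).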

override def visitPassThrough(ctx: PassThroughContext): LogicalPlan = null

override def visitMultipartIdentifier(ctx: MultipartIdentifierContext): Seq[String] =
@@ -19,6 +19,7 @@ package org.apache.kyuubi.sql

import org.apache.spark.sql.SparkSessionExtensions

import org.apache.kyuubi.sql.compact.CompactTableResolver
import org.apache.kyuubi.sql.zorder.{InsertZorderBeforeWritingDatasource, InsertZorderBeforeWritingHive, ResolveZorder}

class KyuubiSparkSQLCommonExtension extends (SparkSessionExtensions => Unit) {
@@ -32,6 +33,7 @@ object KyuubiSparkSQLCommonExtension {
// inject zorder parser and related rules
extensions.injectParser { case (_, parser) => new SparkKyuubiSparkSQLParser(parser) }
extensions.injectResolutionRule(ResolveZorder)
extensions.injectResolutionRule(CompactTableResolver)

// Note that:
// InsertZorderBeforeWritingDatasource and InsertZorderBeforeWritingHive
@@ -19,9 +19,11 @@ package org.apache.kyuubi.sql

import org.apache.spark.sql.{FinalStageResourceManager, InjectCustomResourceProfile, SparkSessionExtensions}

import org.apache.kyuubi.sql.compact.CompactTableSparkStrategy
import org.apache.kyuubi.sql.watchdog.{ForcedMaxOutputRowsRule, KyuubiUnsupportedOperationsCheck, MaxScanStrategy}

// scalastyle:off line.size.limit

/**
* Depend on Spark SQL Extension framework, we can use this extension follow steps
* 1. move this jar into $SPARK_HOME/jars
@@ -40,6 +42,7 @@ class KyuubiSparkSQLExtension extends (SparkSessionExtensions => Unit) {
extensions.injectCheckRule(_ => KyuubiUnsupportedOperationsCheck)
extensions.injectOptimizerRule(ForcedMaxOutputRowsRule)
extensions.injectPlannerStrategy(MaxScanStrategy)
extensions.injectPlannerStrategy(CompactTableSparkStrategy)

extensions.injectQueryStagePrepRule(FinalStageResourceManager(_))
extensions.injectQueryStagePrepRule(InjectCustomResourceProfile)
@@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.kyuubi.sql

import java.lang.reflect.Method

import org.apache.parquet.hadoop.ParquetFileWriter
import org.apache.parquet.hadoop.metadata.{FileMetaData, GlobalMetaData}

object ParquetFileWriterWrapper {

val mergeInfoField: Method = classOf[ParquetFileWriter]
.getDeclaredMethod(
"mergeInto",
classOf[FileMetaData],
classOf[GlobalMetaData],
classOf[Boolean])

mergeInfoField.setAccessible(true)

def mergeInto(
toMerge: FileMetaData,
mergedMetadata: GlobalMetaData,
strict: Boolean): GlobalMetaData = {
mergeInfoField.invoke(
null,
toMerge.asInstanceOf[AnyRef],
mergedMetadata.asInstanceOf[AnyRef],
strict.asInstanceOf[AnyRef]).asInstanceOf[GlobalMetaData]
}
}
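
A hedged usage sketch for this reflective wrapper: folding per-file Parquet footers into one `GlobalMetaData`, the way parquet-hadoop merges footers internally. The `mergeFooters` helper and the origin of the `footers` collection are assumptions for illustration, not code from this change.

```scala
import org.apache.parquet.hadoop.metadata.{FileMetaData, GlobalMetaData}

import org.apache.kyuubi.sql.ParquetFileWriterWrapper

// Illustrative helper: accumulate the FileMetaData of each small file into a single
// GlobalMetaData. A null accumulator serves as the empty starting state, mirroring
// how parquet-hadoop's own footer merging begins.
def mergeFooters(footers: Seq[FileMetaData], strict: Boolean = true): GlobalMetaData =
  footers.foldLeft(null: GlobalMetaData) { (merged, footer) =>
    ParquetFileWriterWrapper.mergeInto(footer, merged, strict)
  }
```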
@@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.kyuubi.sql.compact

import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.{Row, SparkInternalExplorer, SparkSession}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.{DropTableCommand, LeafRunnableCommand}

case class CachePerformanceViewCommand(
tableIdentifier: Seq[String],
performancePlan: LogicalPlan,
originalFileLocations: Seq[String],
options: CompactTableOption) extends LeafRunnableCommand {

override def innerChildren: Seq[QueryPlan[_]] = Seq(performancePlan)

override def run(sparkSession: SparkSession): Seq[Row] = {
val dropViewCommand = DropTableCommand(
CompactTableUtils.getTableIdentifier(tableIdentifier),
ifExists = true,
isView = true,
purge = true)
dropViewCommand.run(sparkSession)

val speculation =
sparkSession.sparkContext.getConf.getBoolean(
SparkInternalExplorer.SPECULATION_ENABLED_SYNONYM.key,
defaultValue = false)
if (speculation) {
sparkSession.sparkContext.getConf.set(
SparkInternalExplorer.SPECULATION_ENABLED_SYNONYM.key,
"false")
log.warn("set spark.speculation to false")
}
try {
val cacheTableCommand =
SparkInternalExplorer.CacheTableAsSelectExec(tableIdentifier.head, performancePlan)

// the result of this call is always empty
cacheTableCommand.run()

if (options == CompactTableOptions.CleanupStagingFolder) {
val fileSystem = FileSystem.get(sparkSession.sparkContext.hadoopConfiguration)
originalFileLocations.foreach { originalFileLocation =>
val compactStagingDir = CompactTableUtils.getCompactStagingDir(originalFileLocation)
fileSystem.delete(compactStagingDir, true)
}

}
} finally {
if (speculation) {
sparkSession.sparkContext.getConf.set(
SparkInternalExplorer.SPECULATION_ENABLED_SYNONYM.key,
"true")
log.warn("rollback spark.speculation to true")
}
}
Seq.empty[Row]
}

}
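
The save-and-restore of `spark.speculation` above could be expressed as a small reusable pattern. A minimal sketch, assuming that toggling the configuration on the driver is sufficient for the cache step (hypothetical helper, not part of this change):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical helper mirroring the pattern above: temporarily disable speculative
// execution while the merged-files view is built, restoring the original value even
// if the body throws.
def withSpeculationDisabled[T](spark: SparkSession)(body: => T): T = {
  val conf = spark.sparkContext.getConf
  val wasEnabled = conf.getBoolean("spark.speculation", defaultValue = false)
  if (wasEnabled) conf.set("spark.speculation", "false")
  try body
  finally if (wasEnabled) conf.set("spark.speculation", "true")
}
```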
@@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.kyuubi.sql.compact

import org.apache.spark.sql.catalyst.analysis.UnresolvedUnaryNode
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.{LeafParsedStatement, LogicalPlan}
import org.apache.spark.sql.types._

object CompactTable {
private val fileLocAndSizeStructArrayType: ArrayType =
DataTypes.createArrayType(DataTypes.createStructType(Array(
DataTypes.createStructField("sub_group_id", IntegerType, false),
DataTypes.createStructField("name", StringType, false),
DataTypes.createStructField("length", LongType, false))))

val smallFileCollectOutput: StructType = DataTypes.createStructType(Array(
DataTypes.createStructField("group_id", IntegerType, false),
DataTypes.createStructField("location", StringType, false),
DataTypes.createStructField("data_source", StringType, false),
DataTypes.createStructField("codec", StringType, true),
DataTypes.createStructField("smallFiles", fileLocAndSizeStructArrayType, false)))

val smallFileCollectOutputAttribute: Seq[AttributeReference] = smallFileCollectOutput
.map(field => AttributeReference(field.name, field.dataType, field.nullable)())

val mergedFilesCachedTableName = "v_merged_files"
val mergeMetadataKey = "spark.sql.compact.parquet.metadata.merge"
}

trait CompactTableOption

object CompactTableOptions {
def apply(options: Option[String]): CompactTableOption = options.map(_.toLowerCase) match {
case Some("retain") => RetainStagingFolder
case Some("list") => DryRun
case _ => CleanupStagingFolder
}

case object CleanupStagingFolder extends CompactTableOption

case object RetainStagingFolder extends CompactTableOption

case object DryRun extends CompactTableOption
}

case class CompactTable(
child: LogicalPlan,
targetSizeInBytes: Option[Long],
options: CompactTableOption) extends UnresolvedUnaryNode {
override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = {
CompactTable(newChild, targetSizeInBytes, options)
}
}

case class CompactTableStatement(
tableParts: Seq[String],
targetSizeInMB: Option[Long],
options: CompactTableOption) extends LeafParsedStatement

case class RecoverCompactTableStatement(tableParts: Seq[String])
extends LeafParsedStatement

case class RecoverCompactTable(child: LogicalPlan) extends UnresolvedUnaryNode {
override protected def withNewChildInternal(newChild: LogicalPlan): LogicalPlan = {
RecoverCompactTable(newChild)
}
}
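
For reference, a small illustration of the keyword-to-option mapping implemented by `CompactTableOptions.apply` (the assertions are illustrative usage, not tests from this change):

```scala
import org.apache.kyuubi.sql.compact.CompactTableOptions

// The action keyword is matched case-insensitively; anything else falls back to cleanup.
assert(CompactTableOptions(None) == CompactTableOptions.CleanupStagingFolder)           // default
assert(CompactTableOptions(Some("RETAIN")) == CompactTableOptions.RetainStagingFolder)  // keep staging data
assert(CompactTableOptions(Some("list")) == CompactTableOptions.DryRun)                 // plan only
```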