[SPARK-50795][SQL] Store timestamp as long type in describe LinkedHashMap

### What changes were proposed in this pull request?

When storing table metadata in the `describe` LinkedHashMap object, we retain the timestamp as a `long` data type (instead of converting it to a formatted date `string`) so that the `describe` date format stays flexible and extensible. Formatting the date fields is delegated to the caller (e.g. describe table, describe as json, describe column, etc.).

Example date for describe table: `Mon Nov 01 12:00:00 UTC 2021`

Example date for describe as json: `2021-11-01T12:00:00Z`
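For context, here is a minimal, self-contained sketch of the two rendering paths. This is not the Spark-internal code: `DescribeTimestampSketch`, `legacyFormat`, and `jsonFormat` are hypothetical names, and `java.time.format.DateTimeFormatter` stands in for Spark's `Iso8601TimestampFormatter`.

```scala
import java.time.Instant
import java.time.ZoneOffset
import java.time.format.DateTimeFormatter
import java.util.Date

object DescribeTimestampSketch {
  // The metadata map stores the raw epoch millis; each `describe` variant
  // chooses its own rendering at output time.

  // Legacy text output: java.util.Date#toString, which uses the JVM's
  // default time zone (prints "... UTC 2021" only when that zone is UTC).
  def legacyFormat(millis: Long): String = new Date(millis).toString

  // JSON output: ISO-8601 in UTC, matching the "yyyy-MM-dd'T'HH:mm:ss'Z'" pattern.
  def jsonFormat(millis: Long): String =
    DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")
      .withZone(ZoneOffset.UTC)
      .format(Instant.ofEpochMilli(millis))

  def main(args: Array[String]): Unit = {
    val createTime = 1635768000000L   // 2021-11-01T12:00:00Z
    println(legacyFormat(createTime)) // e.g. Mon Nov 01 12:00:00 UTC 2021
    println(jsonFormat(createTime))   // 2021-11-01T12:00:00Z
  }
}
```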

### Why are the changes needed?

Improve the extensibility of `describe` and ensure backward compatibility of the existing text output.

### Does this PR introduce _any_ user-facing change?

Yes, it affects the date format of the `describe` output (see the examples above).
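For illustration, a hypothetical `spark-shell` session showing where each format surfaces (the table name `t` is an assumption, and `DESCRIBE ... AS JSON` requires a Spark build that supports it):

```scala
// Hypothetical session; table name `t` is an assumption.
spark.sql("CREATE TABLE t (id INT) USING parquet")

// Plain describe keeps the legacy java.util.Date rendering, e.g.
//   Created Time | Mon Nov 01 12:00:00 UTC 2021
spark.sql("DESCRIBE TABLE EXTENDED t").show(truncate = false)

// The JSON variant (on builds that support AS JSON) renders the same
// stored long as ISO-8601, e.g. "created_time": "2021-11-01T12:00:00Z"
spark.sql("DESCRIBE TABLE EXTENDED t AS JSON").show(truncate = false)
```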

### How was this patch tested?

Added `describe table` tests for the date format.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #49513 from asl3/asl3/describetable-dateplaintext.

Lead-authored-by: Amanda Liu <[email protected]>
Co-authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
asl3 and cloud-fan committed Jan 16, 2025
1 parent 6ebed5b commit 8bbec5d
Showing 3 changed files with 56 additions and 18 deletions.
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.catalog
 
 import java.net.URI
 import java.time.{ZoneId, ZoneOffset}
+import java.util.Date
 
 import scala.collection.mutable
 import scala.util.control.NonFatal
Expand All @@ -27,7 +28,7 @@ import com.fasterxml.jackson.annotation.JsonInclude.Include
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import com.fasterxml.jackson.module.scala.{ClassTagExtensions, DefaultScalaModule}
import org.apache.commons.lang3.StringUtils
import org.json4s.JsonAST.{JArray, JBool, JDouble, JInt, JNull, JObject, JString, JValue}
import org.json4s.JsonAST.{JArray, JBool, JDouble, JInt, JLong, JNull, JObject, JString, JValue}
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkException
@@ -63,6 +64,7 @@ trait MetadataMapSupport {
   protected def jsonToString(
       jsonMap: mutable.LinkedHashMap[String, JValue]): mutable.LinkedHashMap[String, String] = {
     val map = new mutable.LinkedHashMap[String, String]()
+    val timestampKeys = Set("Created Time", "Last Access")
     jsonMap.foreach { case (key, jValue) =>
       val stringValue = jValue match {
         case JString(value) => value
@@ -80,20 +82,18 @@
           .mkString("[", ", ", "]")
         case JInt(value) => value.toString
         case JDouble(value) => value.toString
+        case JLong(value) =>
+          if (timestampKeys.contains(key)) {
+            new Date(value).toString
+          } else {
+            value.toString
+          }
         case _ => jValue.values.toString
       }
       map.put(key, stringValue)
     }
     map
   }
-
-  val timestampFormatter = new Iso8601TimestampFormatter(
-    pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'",
-    zoneId = ZoneId.of("UTC"),
-    locale = DateFormatter.defaultLocale,
-    legacyFormat = LegacyDateFormats.LENIENT_SIMPLE_DATE_FORMAT,
-    isParsing = true
-  )
 }
@@ -191,12 +191,10 @@ case class CatalogTablePartition(
       map += ("Partition Parameters" -> paramsJson)
     }
 
-    map += ("Created Time" -> JString(
-      timestampFormatter.format(DateTimeUtils.millisToMicros(createTime))))
+    map += ("Created Time" -> JLong(createTime))
 
     val lastAccess = if (lastAccessTime <= 0) JString("UNKNOWN")
-    else JString(
-      timestampFormatter.format(DateTimeUtils.millisToMicros(createTime)))
+    else JLong(lastAccessTime)
     map += ("Last Access" -> lastAccess)
 
     stats.foreach(s => map += ("Partition Statistics" -> JString(s.simpleString)))
@@ -605,7 +603,7 @@ case class CatalogTable(
 
     val lastAccess: JValue =
       if (lastAccessTime <= 0) JString("UNKNOWN")
-      else JString(timestampFormatter.format(DateTimeUtils.millisToMicros(createTime)))
+      else JLong(lastAccessTime)
 
     val viewQueryOutputColumns: JValue =
       if (viewQueryColumnNames.nonEmpty) JArray(viewQueryColumnNames.map(JString).toList)
Expand All @@ -617,8 +615,7 @@ case class CatalogTable(
if (identifier.database.isDefined) map += "Database" -> JString(identifier.database.get)
map += "Table" -> JString(identifier.table)
if (Option(owner).exists(_.nonEmpty)) map += "Owner" -> JString(owner)
map += "Created Time" ->
JString(timestampFormatter.format(DateTimeUtils.millisToMicros(createTime)))
map += "Created Time" -> JLong(createTime)
if (lastAccess != JNull) map += "Last Access" -> lastAccess
map += "Created By" -> JString(s"Spark $createVersion")
map += "Type" -> JString(tableType.name)
Expand Down
@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.execution.command
 
+import java.time.ZoneId
+
 import scala.collection.mutable
 
 import org.json4s._
Expand All @@ -29,7 +31,13 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, Se
import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.util.quoteIfNeeded
import org.apache.spark.sql.catalyst.util.{
quoteIfNeeded,
DateFormatter,
DateTimeUtils,
Iso8601TimestampFormatter,
LegacyDateFormats
}
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
import org.apache.spark.sql.connector.catalog.V1Table
import org.apache.spark.sql.errors.QueryCompilationErrors
Expand All @@ -50,6 +58,13 @@ case class DescribeRelationJsonCommand(
nullable = false,
new MetadataBuilder().putString("comment", "JSON metadata of the table").build())()
)) extends UnaryRunnableCommand {
private lazy val timestampFormatter = new Iso8601TimestampFormatter(
pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'",
zoneId = ZoneId.of("UTC"),
locale = DateFormatter.defaultLocale,
legacyFormat = LegacyDateFormats.LENIENT_SIMPLE_DATE_FORMAT,
isParsing = true
)

override def run(sparkSession: SparkSession): Seq[Row] = {
val jsonMap = mutable.LinkedHashMap[String, JValue]()
@@ -106,11 +121,22 @@
       "outputformat" -> "output_format"
     )
 
+    val timestampKeys = Set("created_time", "last_access")
+
     val normalizedKey = key.toLowerCase().replace(" ", "_")
     val renamedKey = renames.getOrElse(normalizedKey, normalizedKey)
 
     if (!jsonMap.contains(renamedKey) && !excludedKeys.contains(renamedKey)) {
-      jsonMap += renamedKey -> value
+      val formattedValue = if (timestampKeys.contains(renamedKey)) {
+        value match {
+          case JLong(timestamp) =>
+            JString(timestampFormatter.format(DateTimeUtils.millisToMicros(timestamp)))
+          case _ => value
+        }
+      } else {
+        value
+      }
+      jsonMap += renamedKey -> formattedValue
     }
   }
@@ -657,6 +657,21 @@ class DescribeTableSuite extends DescribeTableSuiteBase with CommandSuiteBase {
         Row("Table Properties", "[bar=baz]", ""),
         Row("Location", "file:/tmp/testcat/table_name", ""),
         Row("Partition Provider", "Catalog", "")))
+
+    // example date format: Mon Nov 01 12:00:00 UTC 2021
+    val dayOfWeek = raw"[A-Z][a-z]{2}"
+    val month = raw"[A-Z][a-z]{2}"
+    val day = raw"\s?[0-9]{1,2}"
+    val time = raw"[0-9]{2}:[0-9]{2}:[0-9]{2}"
+    val timezone = raw"[A-Z]{3,4}"
+    val year = raw"[0-9]{4}"
+
+    val timeRegex = raw"""$dayOfWeek $month $day $time $timezone $year""".r
+
+    val createdTimeValue = descriptionDf.filter("col_name = 'Created Time'")
+      .collect().head.getString(1).trim
+
+    assert(timeRegex.matches(createdTimeValue))
   }
 }
