@@ -62,7 +62,7 @@ import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanRelation, FileTable}
 import org.apache.spark.sql.execution.python.EvaluatePython
 import org.apache.spark.sql.execution.stat.StatFunctions
-import org.apache.spark.sql.internal.{DataFrameWriterImpl, DataFrameWriterV2Impl, MergeIntoWriterImpl, SQLConf}
+import org.apache.spark.sql.internal.{DataFrameWriterImpl, DataFrameWriterV2Impl, MergeIntoWriterImpl, SQLConf, ToScalaUDF}
 import org.apache.spark.sql.internal.ExpressionUtils.column
 import org.apache.spark.sql.internal.TypedAggUtils.withInputType
 import org.apache.spark.sql.streaming.DataStreamWriter
@@ -865,7 +865,24 @@ class Dataset[T] private[sql](
     Filter(condition.expr, logicalPlan)
   }
 
-  /** @inheritdoc */
+  /**
+   * Groups the Dataset using the specified columns, so we can run aggregation on them. See
+   * [[RelationalGroupedDataset]] for all the available aggregate functions.
+   *
+   * {{{
+   *   // Compute the average for all numeric columns grouped by department.
+   *   ds.groupBy($"department").avg()
+   *
+   *   // Compute the max age and average salary, grouped by department and gender.
+   *   ds.groupBy($"department", $"gender").agg(Map(
+   *     "salary" -> "avg",
+   *     "age" -> "max"
+   *   ))
+   * }}}
+   *
+   * @group untypedrel
+   * @since 2.0.0
+   */
   @scala.annotation.varargs
   def groupBy(cols: Column*): RelationalGroupedDataset = {
     RelationalGroupedDataset(toDF(), cols.map(_.expr), RelationalGroupedDataset.GroupByType)
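The restored Scaladoc above carries its own examples; for completeness, here is a minimal self-contained sketch of both calls, assuming a spark-shell session (so `spark` and its implicits are in scope) and a hypothetical `(department, gender, salary, age)` dataset:

```scala
import spark.implicits._  // spark: SparkSession provided by spark-shell

// Hypothetical sample data matching the Scaladoc example.
val ds = Seq(
  ("eng", "f", 100L, 30),
  ("eng", "m", 80L, 25),
  ("hr",  "f", 90L, 40)
).toDF("department", "gender", "salary", "age")

// Average of every numeric column, grouped by department.
ds.groupBy($"department").avg().show()

// Max age and average salary, grouped by department and gender.
ds.groupBy($"department", $"gender").agg(Map(
  "salary" -> "avg",
  "age" -> "max"
)).show()
```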
@@ -897,7 +914,13 @@ class Dataset[T] private[sql](
     rdd.reduce(func)
   }
 
-  /** @inheritdoc */
+  /**
+   * (Scala-specific)
+   * Returns a [[KeyValueGroupedDataset]] where the data is grouped by the given key `func`.
+   *
+   * @group typedrel
+   * @since 2.0.0
+   */
   def groupByKey[K: Encoder](func: T => K): KeyValueGroupedDataset[K, T] = {
     val withGroupingKey = AppendColumns(func, logicalPlan)
     val executed = sparkSession.sessionState.executePlan(withGroupingKey)
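Likewise, a quick sketch of the Scala-specific `groupByKey`, again assuming a spark-shell session: the key is computed by an arbitrary closure rather than by a column.

```scala
import spark.implicits._

val words = Seq("spark", "scala", "sql", "api").toDS()

// Group strings by their length; yields a KeyValueGroupedDataset[Int, String].
val byLength = words.groupByKey(_.length)
byLength.count().show()  // e.g. (3, 2), (5, 2)
```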
@@ -910,6 +933,16 @@ class Dataset[T] private[sql](
       withGroupingKey.newColumns)
   }
 
+  /**
+   * (Java-specific)
+   * Returns a [[KeyValueGroupedDataset]] where the data is grouped by the given key `func`.
+   *
+   * @group typedrel
+   * @since 2.0.0
+   */
+  def groupByKey[K](func: MapFunction[T, K], encoder: Encoder[K]): KeyValueGroupedDataset[K, T] =
+    groupByKey(ToScalaUDF(func))(encoder)
+
   /** @inheritdoc */
   def unpivot(
       ids: Array[Column],
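This overload is the point of the new `ToScalaUDF` import: the Java `MapFunction` is adapted into the `T => K` closure that the Scala-specific `groupByKey` expects, which is why the subclass override can be deleted in the last hunk below. Exercising it from Scala for illustration, assuming a spark-shell session:

```scala
import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.Encoders
import spark.implicits._

val words = Seq("spark", "scala", "sql").toDS()

// Call the Java-specific overload: an explicit MapFunction plus an explicit encoder.
val byLength = words.groupByKey(
  new MapFunction[String, Integer] { override def call(v: String): Integer = v.length },
  Encoders.INT)
byLength.count().show()
```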
@@ -1607,7 +1640,28 @@ class Dataset[T] private[sql](
     new DataFrameWriterV2Impl[T](table, this)
   }
 
-  /** @inheritdoc */
+  /**
+   * Merges a set of updates, insertions, and deletions based on a source table into
+   * a target table.
+   *
+   * Scala Examples:
+   * {{{
+   *   spark.table("source")
+   *     .mergeInto("target", $"source.id" === $"target.id")
+   *     .whenMatched($"salary" === 100)
+   *     .delete()
+   *     .whenNotMatched()
+   *     .insertAll()
+   *     .whenNotMatchedBySource($"salary" === 100)
+   *     .update(Map(
+   *       "salary" -> lit(200)
+   *     ))
+   *     .merge()
+   * }}}
+   *
+   * @group basic
+   * @since 4.0.0
+   */
   def mergeInto(table: String, condition: Column): MergeIntoWriter[T] = {
     if (isStreaming) {
       logicalPlan.failAnalysis(
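The Scaladoc example shows the conditional clauses; as a usage note, the builder is lazy until `merge()` is called, and the target must be a table whose source supports row-level MERGE operations. A second, unconditional shape of the same builder, sketched under those assumptions in a spark-shell session:

```scala
import spark.implicits._

spark.table("source")
  .mergeInto("target", $"source.id" === $"target.id")
  .whenMatched()     // unconditional: applies to every matched row
  .updateAll()       // copy all source columns into the target row
  .whenNotMatched()
  .insertAll()
  .merge()           // nothing executes until this call
```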
@@ -1970,12 +2024,6 @@ class Dataset[T] private[sql](
   @scala.annotation.varargs
   override def agg(expr: Column, exprs: Column*): DataFrame = super.agg(expr, exprs: _*)
 
-  /** @inheritdoc */
-  override def groupByKey[K](
-      func: MapFunction[T, K],
-      encoder: Encoder[K]): KeyValueGroupedDataset[K, T] =
-    super.groupByKey(func, encoder).asInstanceOf[KeyValueGroupedDataset[K, T]]
-
   ////////////////////////////////////////////////////////////////////////////
   // For Python API
   ////////////////////////////////////////////////////////////////////////////