
Commit 25358eb

[SPARK-51340][ML][CONNECT] Model size estimation
### What changes were proposed in this pull request?

Implement model size estimation. Two new interfaces are added:

1. `Estimator.estimateModelSize`: estimates the model size **before training**. This is an optional interface; it is implemented for linear classification and regression models. For other models, such as tree models, estimating the size before training is hard, so it is not implemented there.
2. `Model.estimatedSize`: estimates the model size in **local process memory**. The default implementation uses `SizeEstimator.estimate`; models can override it for a more precise estimate.

### Why are the changes needed?

For Spark Connect, we want to support model size control on the Spark Connect server side, so this change is needed.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

No.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #50278 from WeichenXu123/SPARK-51340-2.

Lead-authored-by: Ruifeng Zheng <ruifengz@apache.org>
Co-authored-by: Weichen Xu <weichen.xu@databricks.com>
Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
1 parent 37d191e commit 25358eb

23 files changed (+749, -11 lines)
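
Taken together, the two methods give the Spark Connect server a size check both before and after training. Below is a minimal sketch of the intended call pattern (hypothetical helper; both methods are `private[spark]`, so code like this only compiles inside Spark's own packages):

```scala
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.sql.DataFrame

// Hypothetical driver-side guard; `training` needs "label" and "features" columns.
def fitWithSizeLimit(training: DataFrame, maxBytes: Long): LogisticRegressionModel = {
  val lr = new LogisticRegression()

  // Before training: an upper-bound estimate from the params and the dataset.
  // Estimators that cannot estimate ahead of time (e.g. tree-based ones)
  // throw UnsupportedOperationException here.
  val estimated = lr.estimateModelSize(training)
  require(estimated <= maxBytes, s"model would need ~$estimated bytes")

  val model = lr.fit(training)
  // After training: approximate driver-side size of the fitted model.
  println(s"estimated=$estimated, actual=${model.estimatedSize}")
  model
}
```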

mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala

Lines changed: 28 additions & 2 deletions
```diff
@@ -191,8 +191,7 @@ sealed trait Vector extends Serializable {
   def compressed: Vector = compressedWithNNZ(numNonzeros)
 
   private[ml] def compressedWithNNZ(nnz: Int): Vector = {
-    // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
-    if (1.5 * (nnz + 1.0) < size) {
+    if (Vectors.getSparseSize(nnz) < Vectors.getDenseSize(size)) {
       toSparseWithSize(nnz)
     } else {
       toDense
@@ -230,6 +229,8 @@
    */
   private[spark] def nonZeroIterator: Iterator[(Int, Double)] =
     activeIterator.filter(_._2 != 0)
+
+  private[ml] def getSizeInBytes: Long
 }
 
 /**
@@ -504,6 +505,27 @@ object Vectors {
 
   /** Max number of nonzero entries used in computing hash code. */
   private[linalg] val MAX_HASH_NNZ = 128
+
+  private[ml] def getSparseSize(nnz: Long): Long = {
+    /*
+      A sparse vector stores one double array, one int array and one int:
+      8 * values.length + 4 * values.length + arrayHeader * 2 + 4
+     */
+    val doubleBytes = java.lang.Double.BYTES
+    val intBytes = java.lang.Integer.BYTES
+    val arrayHeader = 12L
+    (doubleBytes + intBytes) * nnz + arrayHeader * 2L + 4L
+  }
+
+  private[ml] def getDenseSize(size: Long): Long = {
+    /*
+      A dense vector stores one double array:
+      8 * values.length + arrayHeader
+     */
+    val doubleBytes = java.lang.Double.BYTES
+    val arrayHeader = 12L
+    doubleBytes * size + arrayHeader
+  }
 }
 
 /**
@@ -596,6 +618,8 @@ class DenseVector @Since("2.0.0") ( @Since("2.0.0") val values: Array[Double]) e
 
   private[spark] override def activeIterator: Iterator[(Int, Double)] =
     iterator
+
+  override private[ml] def getSizeInBytes: Long = Vectors.getDenseSize(values.length)
 }
 
 @Since("2.0.0")
@@ -845,6 +869,8 @@ class SparseVector @Since("2.0.0") (
     val localValues = values
     Iterator.tabulate(numActives)(j => (localIndices(j), localValues(j)))
   }
+
+  override private[ml] def getSizeInBytes: Long = Vectors.getSparseSize(values.length)
 }
 
 @Since("2.0.0")
```
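
A quick sanity check on the new threshold (my arithmetic, not part of the patch): for `size = 100`, `getDenseSize(100) = 8 * 100 + 12 = 812` bytes and `getSparseSize(nnz) = 12 * nnz + 28` bytes, so `compressedWithNNZ` picks the sparse form when `nnz <= 65`. That is the same cut-over the old `1.5 * (nnz + 1.0) < size` heuristic gave for this size, but now derived from actual byte counts:

```scala
import org.apache.spark.ml.linalg.Vectors

// getSparseSize / getDenseSize are private[ml]; the comments below just
// apply their formulas by hand.
// getDenseSize(100) = 8 * 100 + 12       = 812 bytes
// getSparseSize(65) = 12 * 65 + 2*12 + 4 = 808 bytes -> sparse wins
// getSparseSize(66) = 12 * 66 + 2*12 + 4 = 820 bytes -> dense wins
val v = Vectors.dense(Array.tabulate(100)(i => if (i == 3) 1.0 else 0.0))
println(v.compressed) // sparse: only 1 of 100 entries is nonzero
```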

mllib/src/main/scala/org/apache/spark/ml/Estimator.scala

Lines changed: 21 additions & 0 deletions
```diff
@@ -81,4 +81,25 @@ abstract class Estimator[M <: Model[M]] extends PipelineStage {
   }
 
   override def copy(extra: ParamMap): Estimator[M]
+
+  /**
+   * For ml connect only.
+   * Estimate an upper-bound size of the model to be fitted in bytes, based on the
+   * parameters and the dataset, e.g., using $(k) and numFeatures to estimate a
+   * k-means model size.
+   * 1, Only driver side memory usage is counted, distributed objects (like DataFrame,
+   * RDD, Graph, Summary) are ignored.
+   * 2, Lazy vals are not counted, e.g., an auxiliary object used in prediction.
+   * 3, If there is not enough information to get an accurate size, try to estimate the
+   * upper-bound size, e.g.
+   * - Given a LogisticRegression estimator, assume the coefficients are dense, even
+   * though the actual fitted model might be sparse (by L1 penalty).
+   * - Given a tree model, assume all underlying trees are complete binary trees, even
+   * though some branches might be pruned or truncated.
+   * 4, For some models, such as tree models, estimating the model size before training
+   * is hard, so the `estimateModelSize` method is not supported.
+   */
+  private[spark] def estimateModelSize(dataset: Dataset[_]): Long = {
+    throw new UnsupportedOperationException
+  }
 }
```
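
The doc's k-means illustration would look roughly like the following under these rules (a sketch of the pattern, not code from this excerpt; the actual KMeans change is in one of the 13 files not shown here):

```scala
// Hypothetical sketch inside a KMeans-like estimator:
private[spark] override def estimateModelSize(dataset: Dataset[_]): Long = {
  val numFeatures = DatasetUtils.getNumFeatures(dataset, $(featuresCol))
  var size = this.estimateMatadataSize
  // $(k) cluster centers, each a dense vector of numFeatures doubles
  size += Vectors.getDenseSize(numFeatures) * $(k)
  size
}
```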

mllib/src/main/scala/org/apache/spark/ml/Model.scala

Lines changed: 16 additions & 1 deletion
```diff
@@ -18,13 +18,14 @@
 package org.apache.spark.ml
 
 import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.util.SizeEstimator
 
 /**
  * A fitted model, i.e., a [[Transformer]] produced by an [[Estimator]].
  *
  * @tparam M model type
  */
-abstract class Model[M <: Model[M]] extends Transformer {
+abstract class Model[M <: Model[M]] extends Transformer { self =>
   /**
    * The parent estimator that produced this model.
    * @note For ensembles' component Models, this value can be null.
@@ -43,4 +44,18 @@ abstract class Model[M <: Model[M]] extends Transformer {
   def hasParent: Boolean = parent != null
 
   override def copy(extra: ParamMap): M
+
+  /**
+   * For ml connect only.
+   * Estimate the size of this model in bytes.
+   * This is an approximation; the real size might be different.
+   * 1, Only driver side memory usage is counted, distributed objects (like DataFrame,
+   * RDD, Graph, Summary) are ignored.
+   * 2, Lazy vals are not counted, e.g., an auxiliary object used in prediction.
+   * 3, The default implementation uses `org.apache.spark.util.SizeEstimator.estimate`;
+   * some models override the default implementation to achieve more precise estimation.
+   * 4, For third-party extensions, if external languages are used, it is recommended to
+   * override this method and return a proper size.
+   */
+  private[spark] def estimatedSize: Long = SizeEstimator.estimate(self)
 }
```
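
The `self =>` alias lets the default body hand the concrete model instance to `SizeEstimator.estimate`, which walks the object graph reflectively; that is convenient, but it can over-count shared state and is slower than the closed-form overrides the files below add. A small runnable feel for what the default does (the exact byte count varies with JVM settings):

```scala
import org.apache.spark.util.SizeEstimator

// The default Model.estimatedSize is SizeEstimator.estimate(model);
// e.g. a bare array of 1000 doubles measures ~8016 bytes on a 64-bit JVM.
println(SizeEstimator.estimate(Array.fill(1000)(0.0)))
```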

mllib/src/main/scala/org/apache/spark/ml/classification/FMClassifier.scala

Lines changed: 20 additions & 0 deletions
```diff
@@ -237,6 +237,15 @@
 
   @Since("3.0.0")
   override def copy(extra: ParamMap): FMClassifier = defaultCopy(extra)
+
+  override def estimateModelSize(dataset: Dataset[_]): Long = {
+    val numFeatures = DatasetUtils.getNumFeatures(dataset, $(featuresCol))
+
+    var size = this.estimateMatadataSize
+    size += Vectors.getDenseSize(numFeatures) // linear
+    size += Matrices.getDenseSize(numFeatures, $(factorSize)) // factors
+    size
+  }
 }
 
 @Since("3.0.0")
@@ -312,6 +321,17 @@
     copyValues(new FMClassificationModel(uid, intercept, linear, factors), extra)
   }
 
+  override def estimatedSize: Long = {
+    var size = this.estimateMatadataSize
+    if (this.linear != null) {
+      size += this.linear.getSizeInBytes
+    }
+    if (this.factors != null) {
+      size += this.factors.getSizeInBytes
+    }
+    size
+  }
+
   @Since("3.0.0")
   override def write: MLWriter =
     new FMClassificationModel.FMClassificationModelWriter(this)
```
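
Rough numbers for the FM estimate (my arithmetic, assuming `Matrices.getDenseSize` mirrors the vector formula; Matrices.scala is changed by this commit but not shown above): with `numFeatures = 1000` and the default `factorSize = 8`, the linear term costs about 8 KB and the factor matrix about 64 KB, plus the params metadata:

```scala
// Hypothetical sizing, numFeatures = 1000, factorSize = 8:
val numFeatures = 1000L
val factorSize = 8L
val linear = 8L * numFeatures + 12L               // dense vector: 8012 bytes
val factors = 8L * numFeatures * factorSize + 12L // dense matrix: 64012 bytes
println(linear + factors)                         // 72024 bytes + metadata
```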

mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala

Lines changed: 16 additions & 1 deletion
```diff
@@ -168,6 +168,13 @@
   @Since("3.1.0")
   def setMaxBlockSizeInMB(value: Double): this.type = set(maxBlockSizeInMB, value)
 
+  private[spark] override def estimateModelSize(dataset: Dataset[_]): Long = {
+    val numFeatures = DatasetUtils.getNumFeatures(dataset, $(featuresCol))
+    var size = this.estimateMatadataSize
+    size += Vectors.getDenseSize(numFeatures) // coefficients
+    size
+  }
+
   @Since("2.2.0")
   override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra)
 
@@ -259,7 +266,7 @@
         if (featuresStd(i) != 0.0) rawCoefficients(i) / featuresStd(i) else 0.0
       }
       val intercept = if ($(fitIntercept)) rawCoefficients.last else 0.0
-      createModel(dataset, Vectors.dense(coefficientArray), intercept, objectiveHistory)
+      createModel(dataset, Vectors.dense(coefficientArray).compressed, intercept, objectiveHistory)
   }
 
   private def createModel(
@@ -421,6 +428,14 @@
     copyValues(new LinearSVCModel(uid, coefficients, intercept), extra).setParent(parent)
   }
 
+  private[spark] override def estimatedSize: Long = {
+    var size = this.estimateMatadataSize
+    if (this.coefficients != null) {
+      size += this.coefficients.getSizeInBytes
+    }
+    size
+  }
+
   @Since("2.2.0")
   override def write: MLWriter = new LinearSVCModel.LinearSVCWriter(this)
```
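
The `.compressed` change interacts with the estimator above: `estimateModelSize` assumes dense coefficients, while the fitted model may now store them sparse, so the pre-training figure is an upper bound, exactly item 3 of the `Estimator` scaladoc. A small illustration (hypothetical numbers):

```scala
import org.apache.spark.ml.linalg.Vectors

// 1000 coefficients, only 10 nonzero after fitting:
val raw = Array.tabulate(1000)(i => if (i % 100 == 0) 1.0 else 0.0)
// dense upper bound: 8 * 1000 + 12 = 8012 bytes
// stored as sparse : 12 * 10 + 28  =  148 bytes
println(Vectors.dense(raw).compressed) // a SparseVector
```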

mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala

Lines changed: 28 additions & 1 deletion
```diff
@@ -45,7 +45,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql._
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.util.VersionUtils
+import org.apache.spark.util._
 
 /**
  * Params for logistic regression.
@@ -1041,6 +1041,22 @@
     (solution, arrayBuilder.result())
   }
 
+  private[spark] override def estimateModelSize(dataset: Dataset[_]): Long = {
+    // TODO: get numClasses and numFeatures together from dataset
+    val numClasses = DatasetUtils.getNumClasses(dataset, $(labelCol))
+    val numFeatures = DatasetUtils.getNumFeatures(dataset, $(featuresCol))
+
+    var size = this.estimateMatadataSize
+    if (checkMultinomial(numClasses)) {
+      size += Matrices.getDenseSize(numFeatures, numClasses) // coefficientMatrix
+      size += Vectors.getDenseSize(numClasses) // interceptVector
+    } else {
+      size += Matrices.getDenseSize(numFeatures, 1) // coefficientMatrix
+      size += Vectors.getDenseSize(1) // interceptVector
+    }
+    size
+  }
+
   @Since("1.4.0")
   override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra)
 }
@@ -1248,6 +1264,17 @@
     }
   }
 
+  private[spark] override def estimatedSize: Long = {
+    var size = this.estimateMatadataSize
+    if (this.coefficientMatrix != null) {
+      size += this.coefficientMatrix.getSizeInBytes
+    }
+    if (this.interceptVector != null) {
+      size += this.interceptVector.getSizeInBytes
+    }
+    size
+  }
+
   @Since("1.4.0")
   override def copy(extra: ParamMap): LogisticRegressionModel = {
     val newModel = copyValues(new LogisticRegressionModel(uid, coefficientMatrix, interceptVector,
```
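
For concreteness (my arithmetic, same caveat about `Matrices.getDenseSize` as above): with `numFeatures = 1000`, the binomial branch budgets a 1000 x 1 coefficient matrix plus one intercept (about 8 KB), while a 10-class multinomial budgets 1000 x 10 coefficients plus 10 intercepts (about 80 KB). Both are dense upper bounds even if L1 regularization later sparsifies `coefficientMatrix`:

```scala
// Hypothetical sizing for numFeatures = 1000:
val binomial = 8L * 1000 * 1 + 12L + (8L * 1 + 12L)        // 8032 bytes
val multinomial10 = 8L * 1000 * 10 + 12L + (8L * 10 + 12L) // 80104 bytes
println((binomial, multinomial10))
```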

mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala

Lines changed: 17 additions & 0 deletions
```diff
@@ -173,6 +173,15 @@
   @Since("1.5.0")
   override def copy(extra: ParamMap): MultilayerPerceptronClassifier = defaultCopy(extra)
 
+  private[spark] override def estimateModelSize(dataset: Dataset[_]): Long = {
+    val topology = FeedForwardTopology.multiLayerPerceptron($(layers), softmaxOnTop = true)
+    val expectedWeightSize = topology.layers.map(_.weightSize).sum
+
+    var size = this.estimateMatadataSize
+    size += Vectors.getDenseSize(expectedWeightSize) // weights
+    size
+  }
+
   /**
    * Train a model using the given dataset and parameters.
    * Developers can implement this instead of `fit()` to avoid dealing with schema validation
@@ -328,6 +337,14 @@
     copyValues(copied, extra)
   }
 
+  private[spark] override def estimatedSize: Long = {
+    var size = this.estimateMatadataSize
+    if (this.weights != null) {
+      size += this.weights.getSizeInBytes
+    }
+    size
+  }
+
   @Since("2.0.0")
   override def write: MLWriter =
     new MultilayerPerceptronClassificationModel.MultilayerPerceptronClassificationModelWriter(this)
```
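
The MLP estimate depends only on the topology, not the data. My example below assumes that only the affine layers carry weights, which is how `FeedForwardTopology.multiLayerPerceptron` builds the network: each affine layer contributes `numIn * numOut` weights plus `numOut` biases.

```scala
// Hypothetical layer spec: 784 inputs, one hidden layer of 100, 10 classes.
val layers = Array(784, 100, 10)
val weightCount = layers.sliding(2).map { case Array(in, out) => in * out + out }.sum
println(weightCount)            // 79510 doubles
println(8L * weightCount + 12L) // 636092 bytes as one dense weight vector
```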

mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala

Lines changed: 30 additions & 0 deletions
```diff
@@ -344,6 +344,22 @@
     new NaiveBayesModel(uid, pi.compressed, theta.compressed, sigma.compressed)
   }
 
+  private[spark] override def estimateModelSize(dataset: Dataset[_]): Long = {
+    val numClasses = DatasetUtils.getNumClasses(dataset, $(labelCol))
+    val numFeatures = DatasetUtils.getNumFeatures(dataset, $(featuresCol))
+
+    var size = this.estimateMatadataSize
+    size += Vectors.getDenseSize(numClasses) // pi
+    size += Matrices.getDenseSize(numClasses, numFeatures) // theta
+    $(modelType) match {
+      case Multinomial | Bernoulli | Complement =>
+        size += Matrices.getDenseSize(0, 0) // sigma
+      case _ =>
+        size += Matrices.getDenseSize(numClasses, numFeatures) // sigma
+    }
+    size
+  }
+
   @Since("1.5.0")
   override def copy(extra: ParamMap): NaiveBayes = defaultCopy(extra)
 }
@@ -551,6 +567,20 @@
     }
   }
 
+  private[spark] override def estimatedSize: Long = {
+    var size = this.estimateMatadataSize
+    if (this.pi != null) {
+      size += this.pi.getSizeInBytes
+    }
+    if (this.theta != null) {
+      size += this.theta.getSizeInBytes
+    }
+    if (this.sigma != null) {
+      size += this.sigma.getSizeInBytes
+    }
+    size
+  }
+
   @Since("1.5.0")
   override def copy(extra: ParamMap): NaiveBayesModel = {
     copyValues(new NaiveBayesModel(uid, pi, theta, sigma).setParent(this.parent), extra)
```
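
The `modelType` match encodes that only Gaussian naive Bayes materializes a variance matrix: Multinomial, Bernoulli, and Complement fit an empty `sigma`, so only the header of a 0 x 0 matrix is budgeted, while the fall-through (Gaussian) pays for a full `numClasses x numFeatures` matrix. With 3 classes and 1000 features that roughly doubles the estimate (my arithmetic, same `Matrices.getDenseSize` assumption as above):

```scala
// Hypothetical: numClasses = 3, numFeatures = 1000.
val pi = 8L * 3 + 12L           // 36 bytes
val theta = 8L * 3 * 1000 + 12L // 24012 bytes
println(pi + theta)             // 24048 bytes: Multinomial/Bernoulli/Complement
println(pi + theta + theta)     // 48060 bytes: Gaussian (sigma same shape as theta)
```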

mllib/src/main/scala/org/apache/spark/ml/param/params.scala

Lines changed: 5 additions & 0 deletions
```diff
@@ -33,6 +33,7 @@ import org.apache.spark.annotation.Since
 import org.apache.spark.ml.linalg.{JsonMatrixConverter, JsonVectorConverter, Matrix, Vector}
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.util.ArrayImplicits._
+import org.apache.spark.util.SizeEstimator
 
 /**
  * A param with self-contained documentation and optionally default value. Primitive-typed param
@@ -647,6 +648,10 @@ case class ParamPair[T] @Since("1.2.0") (
  */
 trait Params extends Identifiable with Serializable {
 
+  private[ml] def estimateMatadataSize: Long = {
+    SizeEstimator.estimate((this.paramMap, this.defaultParamMap, this.uid))
+  }
+
   /**
    * Returns all params sorted by their names. The default implementation uses Java reflection to
    * list all public methods that have no arguments and return [[Param]].
```
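
`estimateMatadataSize` prices the bookkeeping every `Params` instance carries, namely the explicit param map, the default param map, and the uid, so the per-model overrides above only add their coefficient structures on top of it. A rough stand-in for what it measures (hypothetical values; the real fields are `private[ml]`):

```scala
import org.apache.spark.util.SizeEstimator

// Approximates estimateMatadataSize with plain stand-in values:
val approx = SizeEstimator.estimate(
  (Map("maxIter" -> 10), Map("regParam" -> 0.0), "logreg_4b2c9a8d01f7"))
println(approx) // typically a few hundred bytes
```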

mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala

Lines changed: 16 additions & 0 deletions
```diff
@@ -350,6 +350,14 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
 
   @Since("1.6.0")
   override def copy(extra: ParamMap): AFTSurvivalRegression = defaultCopy(extra)
+
+  private[spark] override def estimateModelSize(dataset: Dataset[_]): Long = {
+    val numFeatures = DatasetUtils.getNumFeatures(dataset, $(featuresCol))
+
+    var size = this.estimateMatadataSize
+    size += Vectors.getDenseSize(numFeatures) // coefficients
+    size
+  }
 }
 
 @Since("1.6.0")
@@ -469,6 +477,14 @@
       .setParent(parent)
   }
 
+  private[spark] override def estimatedSize: Long = {
+    var size = this.estimateMatadataSize
+    if (this.coefficients != null) {
+      size += this.coefficients.getSizeInBytes
+    }
+    size
+  }
+
   @Since("1.6.0")
   override def write: MLWriter =
     new AFTSurvivalRegressionModel.AFTSurvivalRegressionModelWriter(this)
```
