[SPARK-49792][PYTHON][BUILD] Upgrade to numpy 2 for building and testing Spark branches #48180

Closed · wants to merge 1 commit
6 changes: 3 additions & 3 deletions dev/infra/Dockerfile
@@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image"
 # Overwrite this label to avoid exposing the underlying Ubuntu OS version label
 LABEL org.opencontainers.image.version=""

-ENV FULL_REFRESH_DATE 20241002
+ENV FULL_REFRESH_DATE 20241007

 ENV DEBIAN_FRONTEND noninteractive
 ENV DEBCONF_NONINTERACTIVE_SEEN true
@@ -91,10 +91,10 @@ RUN mkdir -p /usr/local/pypy/pypy3.9 && \
     ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.9 && \
     ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
-RUN pypy3 -m pip install 'numpy==1.26.4' 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml
+RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml


-ARG BASIC_PIP_PKGS="numpy==1.26.4 pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
+ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
 # Python deps for Spark Connect
 ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 googleapis-common-protos==1.56.4 graphviz==0.20.3"
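Why this change: the CI image previously pinned numpy==1.26.4 for both the pypy3 and CPython test environments; dropping the pin lets pip resolve the latest numpy, which is 2.x as of this PR. A quick sanity check one might run inside the built image (a minimal sketch; the exact version resolved depends on when the image is built):

    import numpy as np

    # With the 'numpy==1.26.4' pins removed above, pip installs the latest
    # release, so the major version should now be 2 or higher.
    print(np.__version__)
    assert int(np.__version__.split(".")[0]) >= 2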
8 changes: 4 additions & 4 deletions python/pyspark/ml/classification.py
@@ -699,7 +699,7 @@ class LinearSVC(
 >>> model_path = temp_path + "/svm_model"
 >>> model.save(model_path)
 >>> model2 = LinearSVCModel.load(model_path)
->>> model.coefficients[0] == model2.coefficients[0]
+>>> bool(model.coefficients[0] == model2.coefficients[0])
 True
 >>> model.intercept == model2.intercept
 True
@@ -1210,7 +1210,7 @@ class LogisticRegression(
 >>> model_path = temp_path + "/lr_model"
 >>> blorModel.save(model_path)
 >>> model2 = LogisticRegressionModel.load(model_path)
->>> blorModel.coefficients[0] == model2.coefficients[0]
+>>> bool(blorModel.coefficients[0] == model2.coefficients[0])
 True
 >>> blorModel.intercept == model2.intercept
 True
@@ -2038,9 +2038,9 @@ class RandomForestClassifier(
 >>> result = model.transform(test0).head()
 >>> result.prediction
 0.0
->>> numpy.argmax(result.probability)
+>>> int(numpy.argmax(result.probability))
 0
->>> numpy.argmax(result.newRawPrediction)
+>>> int(numpy.argmax(result.newRawPrediction))
 0
 >>> result.leafId
 DenseVector([0.0, 0.0, 0.0])
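The doctest edits in this file all stem from one numpy 2 change (NEP 51): numpy scalars now repr as np.float64(1.0), np.int64(0), np.True_, and so on, instead of the bare 1.0, 0, True that numpy 1.x printed, so any doctest that echoes a numpy scalar must cast it to a builtin first. A minimal illustration of the behavior being worked around (values are illustrative):

    import numpy as np

    x = np.float64(1.0)
    x == x                       # doctest output on numpy 2: np.True_ (numpy 1.x: True)
    bool(x == x)                 # True on both

    np.argmax([0.9, 0.1])        # doctest output on numpy 2: np.int64(0)
    int(np.argmax([0.9, 0.1]))   # 0 on both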
10 changes: 5 additions & 5 deletions python/pyspark/ml/regression.py
@@ -266,7 +266,7 @@ class LinearRegression(
 True
 >>> abs(model.transform(test0).head().newPrediction - (-1.0)) < 0.001
 True
->>> abs(model.coefficients[0] - 1.0) < 0.001
+>>> bool(abs(model.coefficients[0] - 1.0) < 0.001)
 True
 >>> abs(model.intercept - 0.0) < 0.001
 True
@@ -283,11 +283,11 @@ class LinearRegression(
 >>> model_path = temp_path + "/lr_model"
 >>> model.save(model_path)
 >>> model2 = LinearRegressionModel.load(model_path)
->>> model.coefficients[0] == model2.coefficients[0]
+>>> bool(model.coefficients[0] == model2.coefficients[0])
 True
->>> model.intercept == model2.intercept
+>>> bool(model.intercept == model2.intercept)
 True
->>> model.transform(test0).take(1) == model2.transform(test0).take(1)
+>>> bool(model.transform(test0).take(1) == model2.transform(test0).take(1))
 True
 >>> model.numFeatures
 1
@@ -2542,7 +2542,7 @@ class GeneralizedLinearRegression(
 >>> model2 = GeneralizedLinearRegressionModel.load(model_path)
 >>> model.intercept == model2.intercept
 True
->>> model.coefficients[0] == model2.coefficients[0]
+>>> bool(model.coefficients[0] == model2.coefficients[0])
 True
 >>> model.transform(df).take(1) == model2.transform(df).take(1)
 True
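The same applies to tolerance checks: coefficients and intercepts come back as numpy scalars, so even abs(x - y) < tol yields an np.bool_ whose repr changed under numpy 2. A small sketch with a hypothetical coefficient value:

    import numpy as np

    coef = np.float64(0.9996)       # stand-in for model.coefficients[0]
    abs(coef - 1.0) < 0.001         # doctest output on numpy 2: np.True_
    bool(abs(coef - 1.0) < 0.001)   # True on both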
5 changes: 5 additions & 0 deletions python/pyspark/ml/tests/test_functions.py
@@ -18,6 +18,7 @@

 import numpy as np

+from pyspark.loose_version import LooseVersion
 from pyspark.ml.functions import predict_batch_udf
 from pyspark.sql.functions import array, struct, col
 from pyspark.sql.types import ArrayType, DoubleType, IntegerType, StructType, StructField, FloatType
@@ -193,6 +194,10 @@ def predict(inputs):
         batch_sizes = preds["preds"].to_numpy()
         self.assertTrue(all(batch_sizes <= batch_size))

+    # TODO(SPARK-49793): enable the test below
Member Author: @WeichenXu123, may I get your input on this, please? More details can be found at https://issues.apache.org/jira/browse/SPARK-49793.

Contributor: Do you have an error message and error stack for numpy 2 + caching?

Member Author: Would you please see https://issues.apache.org/jira/browse/SPARK-49793? There is no error, but the results are unexpected.

Contributor: Got it, I need some time to investigate, but we can disable it as a workaround for now.

Member Author: Sounds good, thank you!

Contributor: Also cc @leewyang as the test author.

+    @unittest.skipIf(
+        LooseVersion(np.__version__) >= LooseVersion("2"), "Caching does not work with numpy 2"
+    )
     def test_caching(self):
         def make_predict_fn():
             # emulate loading a model, this should only be invoked once (per worker process)
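The decorator above gates the test on the installed numpy major version; the same pattern can be reused for any test known to misbehave on numpy 2 until SPARK-49793 is resolved. A self-contained sketch (the class and test names are placeholders, not from this PR):

    import unittest

    import numpy as np
    from pyspark.loose_version import LooseVersion


    class NumpySensitiveTests(unittest.TestCase):
        @unittest.skipIf(
            LooseVersion(np.__version__) >= LooseVersion("2"),
            "Known incompatibility with numpy 2; see SPARK-49793",
        )
        def test_numpy1_only_behavior(self):
            # placeholder body; the real test exercises predict_batch_udf caching
            self.assertTrue(True)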
2 changes: 1 addition & 1 deletion python/pyspark/ml/tuning.py
@@ -706,7 +706,7 @@ class CrossValidator(
 >>> cvModel = cv.fit(dataset)
 >>> cvModel.getNumFolds()
 3
->>> cvModel.avgMetrics[0]
+>>> float(cvModel.avgMetrics[0])
 0.5
 >>> path = tempfile.mkdtemp()
 >>> model_path = path + "/model"
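avgMetrics holds numpy floats, so echoing an element directly prints np.float64(0.5) under numpy 2; the float() cast keeps the doctest output stable. For a whole metrics list the equivalent pattern would be (a sketch; the literal list stands in for cvModel.avgMetrics):

    import numpy as np

    avg_metrics = [np.float64(0.5), np.float64(0.75)]
    [float(m) for m in avg_metrics]   # [0.5, 0.75] on numpy 1.x and 2.x alike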
25 changes: 14 additions & 11 deletions python/pyspark/mllib/classification.py
@@ -172,9 +172,9 @@ class LogisticRegressionModel(LinearClassificationModel):
 >>> path = tempfile.mkdtemp()
 >>> lrm.save(sc, path)
 >>> sameModel = LogisticRegressionModel.load(sc, path)
->>> sameModel.predict(numpy.array([0.0, 1.0]))
+>>> int(sameModel.predict(numpy.array([0.0, 1.0])))
 1
->>> sameModel.predict(SparseVector(2, {0: 1.0}))
+>>> int(sameModel.predict(SparseVector(2, {0: 1.0})))
 0
 >>> from shutil import rmtree
 >>> try:
@@ -555,7 +555,7 @@ class SVMModel(LinearClassificationModel):
 >>> svm.predict(sc.parallelize([[1.0]])).collect()
 [1]
 >>> svm.clearThreshold()
->>> svm.predict(numpy.array([1.0]))
+>>> float(svm.predict(numpy.array([1.0])))
 1.44...

 >>> sparse_data = [
@@ -573,9 +573,9 @@ class SVMModel(LinearClassificationModel):
 >>> path = tempfile.mkdtemp()
 >>> svm.save(sc, path)
 >>> sameModel = SVMModel.load(sc, path)
->>> sameModel.predict(SparseVector(2, {1: 1.0}))
+>>> int(sameModel.predict(SparseVector(2, {1: 1.0})))
 1
->>> sameModel.predict(SparseVector(2, {0: -1.0}))
+>>> int(sameModel.predict(SparseVector(2, {0: -1.0})))
 0
 >>> from shutil import rmtree
 >>> try:
@@ -756,27 +756,30 @@ class NaiveBayesModel(Saveable, Loader["NaiveBayesModel"]):
 ...     LabeledPoint(1.0, [1.0, 0.0]),
 ... ]
 >>> model = NaiveBayes.train(sc.parallelize(data))
->>> model.predict(numpy.array([0.0, 1.0]))
+>>> float(model.predict(numpy.array([0.0, 1.0])))
 0.0
->>> model.predict(numpy.array([1.0, 0.0]))
+>>> float(model.predict(numpy.array([1.0, 0.0])))
 1.0
->>> model.predict(sc.parallelize([[1.0, 0.0]])).collect()
+>>> list(map(float, model.predict(sc.parallelize([[1.0, 0.0]])).collect()))
 [1.0]
 >>> sparse_data = [
 ...     LabeledPoint(0.0, SparseVector(2, {1: 0.0})),
 ...     LabeledPoint(0.0, SparseVector(2, {1: 1.0})),
 ...     LabeledPoint(1.0, SparseVector(2, {0: 1.0}))
 ... ]
 >>> model = NaiveBayes.train(sc.parallelize(sparse_data))
->>> model.predict(SparseVector(2, {1: 1.0}))
+>>> float(model.predict(SparseVector(2, {1: 1.0})))
 0.0
->>> model.predict(SparseVector(2, {0: 1.0}))
+>>> float(model.predict(SparseVector(2, {0: 1.0})))
 1.0
 >>> import os, tempfile
 >>> path = tempfile.mkdtemp()
 >>> model.save(sc, path)
 >>> sameModel = NaiveBayesModel.load(sc, path)
->>> sameModel.predict(SparseVector(2, {0: 1.0})) == model.predict(SparseVector(2, {0: 1.0}))
+>>> bool((
+...     sameModel.predict(SparseVector(2, {0: 1.0})) ==
+...     model.predict(SparseVector(2, {0: 1.0}))
+... ))
 True
 >>> from shutil import rmtree
 >>> try:
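Note the RDD case: predict(...).collect() returns a list of numpy scalars, and a list's repr shows each element's repr, so the output would read [np.float64(1.0)] under numpy 2. Mapping float over the results restores version-independent output (the literal list below stands in for a collect() result):

    import numpy as np

    collected = [np.float64(1.0)]   # stand-in for model.predict(rdd).collect()
    list(map(float, collected))     # [1.0] regardless of numpy version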
4 changes: 2 additions & 2 deletions python/pyspark/mllib/feature.py
@@ -554,9 +554,9 @@ class PCA:
 ... Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
 >>> model = PCA(2).fit(sc.parallelize(data))
 >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
->>> pcArray[0]
+>>> float(pcArray[0])
 1.648...
->>> pcArray[1]
+>>> float(pcArray[1])
 -4.013...
 """
42 changes: 21 additions & 21 deletions python/pyspark/mllib/random.py
@@ -134,9 +134,9 @@ def normalRDD(
 >>> stats = x.stats()
 >>> stats.count()
 1000
->>> abs(stats.mean() - 0.0) < 0.1
+>>> bool(abs(stats.mean() - 0.0) < 0.1)
 True
->>> abs(stats.stdev() - 1.0) < 0.1
+>>> bool(abs(stats.stdev() - 1.0) < 0.1)
 True
 """
 return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed)
@@ -186,10 +186,10 @@ def logNormalRDD(
 >>> stats = x.stats()
 >>> stats.count()
 1000
->>> abs(stats.mean() - expMean) < 0.5
+>>> bool(abs(stats.mean() - expMean) < 0.5)
 True
 >>> from math import sqrt
->>> abs(stats.stdev() - expStd) < 0.5
+>>> bool(abs(stats.stdev() - expStd) < 0.5)
 True
 """
 return callMLlibFunc(
@@ -238,7 +238,7 @@ def poissonRDD(
 >>> abs(stats.mean() - mean) < 0.5
 True
 >>> from math import sqrt
->>> abs(stats.stdev() - sqrt(mean)) < 0.5
+>>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
 True
 """
 return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed)
@@ -285,7 +285,7 @@ def exponentialRDD(
 >>> abs(stats.mean() - mean) < 0.5
 True
 >>> from math import sqrt
->>> abs(stats.stdev() - sqrt(mean)) < 0.5
+>>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
 True
 """
 return callMLlibFunc("exponentialRDD", sc._jsc, float(mean), size, numPartitions, seed)
@@ -336,9 +336,9 @@ def gammaRDD(
 >>> stats = x.stats()
 >>> stats.count()
 1000
->>> abs(stats.mean() - expMean) < 0.5
+>>> bool(abs(stats.mean() - expMean) < 0.5)
 True
->>> abs(stats.stdev() - expStd) < 0.5
+>>> bool(abs(stats.stdev() - expStd) < 0.5)
 True
 """
 return callMLlibFunc(
@@ -384,7 +384,7 @@ def uniformVectorRDD(
 >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
 >>> mat.shape
 (10, 10)
->>> mat.max() <= 1.0 and mat.min() >= 0.0
+>>> bool(mat.max() <= 1.0 and mat.min() >= 0.0)
 True
 >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
 4
@@ -430,9 +430,9 @@ def normalVectorRDD(
 >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect())
 >>> mat.shape
 (100, 100)
->>> abs(mat.mean() - 0.0) < 0.1
+>>> bool(abs(mat.mean() - 0.0) < 0.1)
 True
->>> abs(mat.std() - 1.0) < 0.1
+>>> bool(abs(mat.std() - 1.0) < 0.1)
 True
 """
 return callMLlibFunc("normalVectorRDD", sc._jsc, numRows, numCols, numPartitions, seed)
@@ -488,9 +488,9 @@ def logNormalVectorRDD(
 >>> mat = np.matrix(m)
 >>> mat.shape
 (100, 100)
->>> abs(mat.mean() - expMean) < 0.1
+>>> bool(abs(mat.mean() - expMean) < 0.1)
 True
->>> abs(mat.std() - expStd) < 0.1
+>>> bool(abs(mat.std() - expStd) < 0.1)
 True
 """
 return callMLlibFunc(
@@ -545,13 +545,13 @@ def poissonVectorRDD(
 >>> import numpy as np
 >>> mean = 100.0
 >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1)
->>> mat = np.mat(rdd.collect())
+>>> mat = np.asmatrix(rdd.collect())
 >>> mat.shape
 (100, 100)
->>> abs(mat.mean() - mean) < 0.5
+>>> bool(abs(mat.mean() - mean) < 0.5)
 True
 >>> from math import sqrt
->>> abs(mat.std() - sqrt(mean)) < 0.5
+>>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
 True
 """
 return callMLlibFunc(
@@ -599,13 +599,13 @@ def exponentialVectorRDD(
 >>> import numpy as np
 >>> mean = 0.5
 >>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1)
->>> mat = np.mat(rdd.collect())
+>>> mat = np.asmatrix(rdd.collect())
 >>> mat.shape
 (100, 100)
->>> abs(mat.mean() - mean) < 0.5
+>>> bool(abs(mat.mean() - mean) < 0.5)
 True
 >>> from math import sqrt
->>> abs(mat.std() - sqrt(mean)) < 0.5
+>>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
 True
 """
 return callMLlibFunc(
@@ -662,9 +662,9 @@ def gammaVectorRDD(
 >>> mat = np.matrix(RandomRDDs.gammaVectorRDD(sc, shape, scale, 100, 100, seed=1).collect())
 >>> mat.shape
 (100, 100)
->>> abs(mat.mean() - expMean) < 0.1
+>>> bool(abs(mat.mean() - expMean) < 0.1)
 True
->>> abs(mat.std() - expStd) < 0.1
+>>> bool(abs(mat.std() - expStd) < 0.1)
 True
 """
 return callMLlibFunc(
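Besides the bool() casts, two hunks above swap np.mat for np.asmatrix: numpy 2.0 removed the np.mat alias from the main namespace, and np.asmatrix is its documented replacement. A minimal check of the substitution:

    import numpy as np

    rows = [[1.0, 2.0], [3.0, 4.0]]
    mat = np.asmatrix(rows)   # np.mat(rows) raises AttributeError on numpy >= 2.0
    mat.shape                 # (2, 2), same result np.mat used to give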