Skip to content

Commit

Permalink
[SPARK-46699][PS][TESTS] Split ArithmeticTests
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
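Split `ArithmeticTests`: move its `mul`/`div`, `mod`, and `pow` tests into the new modules `test_num_mul_div`, `test_num_mod`, and `test_num_pow`, and add a matching parity test module for each.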

### Why are the changes needed?
The parity test for `ArithmeticTests` is slow, so the test class is split into smaller modules.

### Does this PR introduce _any_ user-facing change?
No, this is a test-only change.

### How was this patch tested?
CI

### Was this patch authored or co-authored using generative AI tooling?
no

Closes #44708 from zhengruifeng/ps_test_split_num_arithmetic.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
  • Loading branch information
zhengruifeng authored and HyukjinKwon committed Jan 13, 2024
1 parent 4ea3742 commit 1c9b022
Show file tree
Hide file tree
Showing 8 changed files with 391 additions and 93 deletions.
6 changes: 6 additions & 0 deletions dev/sparktestsupport/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,9 @@ def __hash__(self):
"pyspark.pandas.tests.data_type_ops.test_null_ops",
"pyspark.pandas.tests.data_type_ops.test_num_ops",
"pyspark.pandas.tests.data_type_ops.test_num_arithmetic",
"pyspark.pandas.tests.data_type_ops.test_num_mod",
"pyspark.pandas.tests.data_type_ops.test_num_mul_div",
"pyspark.pandas.tests.data_type_ops.test_num_pow",
"pyspark.pandas.tests.data_type_ops.test_num_reverse",
"pyspark.pandas.tests.data_type_ops.test_string_ops",
"pyspark.pandas.tests.data_type_ops.test_udt_ops",
Expand Down Expand Up @@ -1201,6 +1204,9 @@ def __hash__(self):
"pyspark.pandas.tests.connect.series.test_parity_sort",
"pyspark.pandas.tests.connect.series.test_parity_stat",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mod",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mul_div",
"pyspark.pandas.tests.connect.data_type_ops.test_parity_num_pow",
"pyspark.pandas.tests.connect.reshape.test_parity_get_dummies",
"pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_kwargs",
"pyspark.pandas.tests.connect.reshape.test_parity_get_dummies_multiindex",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.pandas.tests.data_type_ops.test_num_mod import NumModTestsMixin
from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
from pyspark.testing.connectutils import ReusedConnectTestCase


class NumModParityTests(
    NumModTestsMixin,
    PandasOnSparkTestUtils,
    OpsTestBase,
    ReusedConnectTestCase,
):
    """Run the tests inherited from ``NumModTestsMixin`` on ``ReusedConnectTestCase``.

    The class defines nothing itself; every test, fixture, and helper comes
    from its bases, whose order determines the method resolution order.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mod import *  # noqa

    # Start without a custom runner and only switch to the XML-reporting one
    # when xmlrunner can be imported.
    testRunner = None
    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep testRunner as None.
        pass
    unittest.main(testRunner=testRunner, verbosity=2)
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.pandas.tests.data_type_ops.test_num_mul_div import NumMulDivTestsMixin
from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
from pyspark.testing.connectutils import ReusedConnectTestCase


class NumMulDivParityTests(
    NumMulDivTestsMixin,
    PandasOnSparkTestUtils,
    OpsTestBase,
    ReusedConnectTestCase,
):
    """Run the tests inherited from ``NumMulDivTestsMixin`` on ``ReusedConnectTestCase``.

    The class defines nothing itself; every test, fixture, and helper comes
    from its bases, whose order determines the method resolution order.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.connect.data_type_ops.test_parity_num_mul_div import *  # noqa

    # Start without a custom runner and only switch to the XML-reporting one
    # when xmlrunner can be imported.
    testRunner = None
    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep testRunner as None.
        pass
    unittest.main(testRunner=testRunner, verbosity=2)
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from pyspark.pandas.tests.data_type_ops.test_num_pow import NumPowTestsMixin
from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
from pyspark.testing.connectutils import ReusedConnectTestCase


class NumPowParityTests(
    NumPowTestsMixin,
    PandasOnSparkTestUtils,
    OpsTestBase,
    ReusedConnectTestCase,
):
    """Run the tests inherited from ``NumPowTestsMixin`` on ``ReusedConnectTestCase``.

    The class defines nothing itself; every test, fixture, and helper comes
    from its bases, whose order determines the method resolution order.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.connect.data_type_ops.test_parity_num_pow import *  # noqa

    # Start without a custom runner and only switch to the XML-reporting one
    # when xmlrunner can be imported.
    testRunner = None
    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep testRunner as None.
        pass
    unittest.main(testRunner=testRunner, verbosity=2)
93 changes: 0 additions & 93 deletions python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import unittest

import pandas as pd
import numpy as np

from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
Expand Down Expand Up @@ -75,98 +74,6 @@ def test_sub(self):
else:
self.assertRaises(TypeError, lambda: psser - psdf[n_col])

def test_mul(self):
    """Check ``*`` on each numeric column of ``self.psdf`` against ``self.pdf``."""
    pdf, psdf = self.pdf, self.psdf
    # NOTE(review): indentation was not visible in the reviewed text; every
    # statement below is assumed to sit inside the per-column loop -- confirm.
    for name in self.numeric_df_cols:
        pd_col, ps_col = pdf[name], psdf[name]
        self.assert_eq(pd_col * pd_col, ps_col * ps_col)
        self.assert_eq(pd_col * pd_col.astype(bool), ps_col * ps_col.astype(bool))
        for flag in (True, False):
            self.assert_eq(pd_col * flag, ps_col * flag)

        # Multiplying by the string column is compared only for int / int32
        # dtypes; any other numeric dtype must raise TypeError.
        if ps_col.dtype in [int, np.int32]:
            self.assert_eq(pd_col * pdf["string"], ps_col * psdf["string"])
        else:
            with self.assertRaises(TypeError):
                ps_col * psdf["string"]

        self.assert_eq(pd_col * pdf["bool"], ps_col * psdf["bool"])

        for other in ("datetime", "date", "categorical"):
            with self.assertRaises(TypeError):
                ps_col * psdf[other]

def test_truediv(self):
    """Check ``/`` on each numeric column of ``self.psdf`` against ``self.pdf``."""
    pdf, psdf = self.pdf, self.psdf
    for name in self.numeric_df_cols:
        pd_col, ps_col = pdf[name], psdf[name]
        # Results are compared only for float / int / int32 columns.
        if ps_col.dtype in [float, int, np.int32]:
            self.assert_eq(pd_col / pd_col, ps_col / ps_col)
            self.assert_eq(pd_col / pd_col.astype(bool), ps_col / ps_col.astype(bool))
            for flag in (True, False):
                self.assert_eq(pd_col / flag, ps_col / flag)

        # NOTE(review): indentation was not visible in the reviewed text; this
        # loop is assumed to be nested in the per-column loop -- confirm.
        for other in self.non_numeric_df_cols:
            if other != "bool":
                with self.assertRaises(TypeError):
                    ps_col / psdf[other]
            else:
                # The bool case always uses the "float" column as the dividend.
                self.assert_eq(pdf["float"] / pdf[other], psdf["float"] / psdf[other])

def test_floordiv(self):
    """Check ``//`` on the "float" column, and that non-numeric divisors raise."""
    pdf, psdf = self.pdf, self.psdf
    pd_float, ps_float = pdf["float"], psdf["float"]
    self.assert_eq(pd_float // pd_float, ps_float // ps_float)
    self.assert_eq(pd_float // pd_float.astype(bool), ps_float // ps_float.astype(bool))
    for flag in (True, False):
        self.assert_eq(pd_float // flag, ps_float // flag)

    for other in self.non_numeric_df_cols:
        if other == "bool":
            self.assert_eq(pdf["float"] // pdf["bool"], psdf["float"] // psdf["bool"])
            continue
        # Every numeric column must reject a non-bool, non-numeric divisor.
        for name in self.numeric_df_cols:
            ps_col = psdf[name]
            with self.assertRaises(TypeError):
                ps_col // psdf[other]

def test_mod(self):
    """Check ``%`` on each numeric column of ``self.psdf`` against ``self.pdf``."""
    pdf, psdf = self.pdf, self.psdf
    for name in self.numeric_df_cols:
        pd_col, ps_col = pdf[name], psdf[name]
        self.assert_eq(pd_col % pd_col, ps_col % ps_col)
        self.assert_eq(pd_col % pd_col.astype(bool), ps_col % ps_col.astype(bool))
        self.assert_eq(pd_col % True, ps_col % True)

        # ``% False`` is compared with a hand-built all-NaN series: float dtype
        # for the "int" / "int32" columns, the column's own dtype otherwise.
        nan_dtype = float if name in ["int", "int32"] else pd_col.dtype
        self.assert_eq(
            pd.Series([np.nan, np.nan, np.nan], dtype=nan_dtype, name=name), ps_col % False
        )

        # NOTE(review): indentation was not visible in the reviewed text; this
        # loop is assumed to be nested in the per-column loop -- confirm.
        for other in self.non_numeric_df_cols:
            if other != "bool":
                with self.assertRaises(TypeError):
                    ps_col % psdf[other]
            else:
                # The bool case always uses the "float" column as the left operand.
                self.assert_eq(pdf["float"] % pdf[other], psdf["float"] % psdf[other])

def test_pow(self):
    """Check ``**`` on the float columns, and that non-numeric exponents raise."""
    pdf, psdf = self.pdf, self.psdf
    for name in self.numeric_df_cols:
        pd_col, ps_col = pdf[name], psdf[name]
        # Results are compared only for the "float" and "float_w_nan" columns.
        if name in ["float", "float_w_nan"]:
            self.assert_eq(pd_col**pd_col, ps_col**ps_col)
            self.assert_eq(pd_col ** pd_col.astype(bool), ps_col ** ps_col.astype(bool))
            for exponent in (True, False, 1, 0):
                self.assert_eq(pd_col**exponent, ps_col**exponent)

        # NOTE(review): indentation was not visible in the reviewed text; this
        # loop is assumed to be nested in the per-column loop -- confirm.
        for other in self.non_numeric_df_cols:
            if other != "bool":
                with self.assertRaises(TypeError):
                    ps_col ** psdf[other]
            else:
                # The bool case always uses the "float" column as the base.
                self.assert_eq(pdf["float"] ** pdf[other], psdf["float"] ** psdf[other])


class ArithmeticTests(
ArithmeticTestsMixin,
Expand Down
77 changes: 77 additions & 0 deletions python/pyspark/pandas/tests/data_type_ops/test_num_mod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import unittest

import pandas as pd
import numpy as np

from pyspark import pandas as ps
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase


class NumModTestsMixin:
    """Modulo (``%``) checks for numeric columns, written as a mixin.

    ``test_mod`` reads ``pdf``, ``psdf``, ``numeric_df_cols`` and
    ``non_numeric_df_cols`` from ``self`` and calls ``assert_eq`` and
    ``assertRaises``; the class it is mixed into must provide all of them.
    """

    @property
    def float_pser(self):
        """Return a new float pandas Series built from ``[1, 2, 3]``."""
        values = [1, 2, 3]
        return pd.Series(values, dtype=float)

    @property
    def float_psser(self):
        """Return ``float_pser`` converted with ``ps.from_pandas``."""
        pandas_series = self.float_pser
        return ps.from_pandas(pandas_series)

    def test_mod(self):
        """Check ``%`` on each numeric column of ``self.psdf`` against ``self.pdf``."""
        pdf, psdf = self.pdf, self.psdf
        for name in self.numeric_df_cols:
            pd_col, ps_col = pdf[name], psdf[name]
            self.assert_eq(pd_col % pd_col, ps_col % ps_col)
            self.assert_eq(pd_col % pd_col.astype(bool), ps_col % ps_col.astype(bool))
            self.assert_eq(pd_col % True, ps_col % True)

            # ``% False`` is compared with a hand-built all-NaN series: float
            # dtype for the "int" / "int32" columns, the column's own dtype
            # otherwise.
            nan_dtype = float if name in ["int", "int32"] else pd_col.dtype
            self.assert_eq(
                pd.Series([np.nan, np.nan, np.nan], dtype=nan_dtype, name=name), ps_col % False
            )

            # NOTE(review): indentation was not visible in the reviewed text;
            # this loop is assumed to be nested in the per-column loop -- confirm.
            for other in self.non_numeric_df_cols:
                if other != "bool":
                    with self.assertRaises(TypeError):
                        ps_col % psdf[other]
                else:
                    # The bool case always uses the "float" column as the left operand.
                    self.assert_eq(pdf["float"] % pdf[other], psdf["float"] % psdf[other])


class NumModTests(
    NumModTestsMixin,
    OpsTestBase,
    PandasOnSparkTestCase,
):
    """Run the tests inherited from ``NumModTestsMixin`` on ``PandasOnSparkTestCase``.

    The class defines nothing itself; every test, fixture, and helper comes
    from its bases, whose order determines the method resolution order.
    """


if __name__ == "__main__":
    from pyspark.pandas.tests.data_type_ops.test_num_mod import *  # noqa: F401

    # Start without a custom runner and only switch to the XML-reporting one
    # when xmlrunner can be imported.
    testRunner = None
    try:
        import xmlrunner

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep testRunner as None.
        pass
    unittest.main(testRunner=testRunner, verbosity=2)
Loading

0 comments on commit 1c9b022

Please sign in to comment.