[SPARK-54958][PYTHON][ML][TEST] Accelerate test_cv_io_pipeline.py

gaogaotiantian · zhengruifeng · commit 78d5229980e1 · 2026-01-09T09:41:32.000+08:00
### What changes were proposed in this pull request? Parallelized some operations in the test: `OneVsRest` is now created with `parallelism=2`, both cross-validators (`crossval` and `crossval2`) are created with `parallelism=4`, and the `LogisticRegression` and `DummyLogisticRegression` variants of the test run concurrently in a two-worker `ThreadPoolExecutor` instead of one after the other. ### Why are the changes needed? The test takes ~80s, which makes it the second-longest test case we have. Adding some parallelism cuts that time by about 50%. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? The test passed locally; its run time dropped from 64s to 28s. ### Was this patch authored or co-authored using generative AI tooling? No Closes #53725 from gaogaotiantian/optimize-cv-io-pipeline. Authored-by: Tian Gao <gaogaotiantian@hotmail.com> Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
diff --git a/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py b/python/pyspark/ml/tests/tuning/test_cv_io_pipeline.py
@@ -16,6 +16,7 @@
 #
 
 import tempfile
+from concurrent.futures import ThreadPoolExecutor
 
 from pyspark.ml.feature import HashingTF, Tokenizer
 from pyspark.ml import Pipeline
@@ -54,7 +55,7 @@ def _run_test_save_load_pipeline_estimator(self, LogisticRegressionCls):
         tokenizer = Tokenizer(inputCol="text", outputCol="words")
         hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
 
-        ova = OneVsRest(classifier=LogisticRegressionCls())
+        ova = OneVsRest(classifier=LogisticRegressionCls(), parallelism=2)
         lr1 = LogisticRegressionCls().setMaxIter(5)
         lr2 = LogisticRegressionCls().setMaxIter(10)
 
@@ -72,6 +73,7 @@ def _run_test_save_load_pipeline_estimator(self, LogisticRegressionCls):
             estimatorParamMaps=paramGrid,
             evaluator=MulticlassClassificationEvaluator(),
             numFolds=2,
+            parallelism=4,
         )  # use 3+ folds in practice
         cvPath = temp_path + "/cv"
         crossval.save(cvPath)
@@ -100,6 +102,7 @@ def _run_test_save_load_pipeline_estimator(self, LogisticRegressionCls):
             estimatorParamMaps=paramGrid,
             evaluator=MulticlassClassificationEvaluator(),
             numFolds=2,
+            parallelism=4,
         )  # use 3+ folds in practice
         cv2Path = temp_path + "/cv2"
         crossval2.save(cv2Path)
@@ -126,8 +129,13 @@ def _run_test_save_load_pipeline_estimator(self, LogisticRegressionCls):
             self.assertEqual(loadedStage.uid, originalStage.uid)
 
     def test_save_load_pipeline_estimator(self):
-        self._run_test_save_load_pipeline_estimator(LogisticRegression)
-        self._run_test_save_load_pipeline_estimator(DummyLogisticRegression)
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            list(
+                executor.map(
+                    self._run_test_save_load_pipeline_estimator,
+                    [LogisticRegression, DummyLogisticRegression],
+                )
+            )
 
 
 if __name__ == "__main__":