From 79c70d9b831fb76e259adbfdc71a8ff6fd9ea4bf Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 4 Dec 2024 11:58:47 +0800 Subject: [PATCH] init Signed-off-by: Weichen Xu --- python-package/xgboost/spark/core.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 166acbe1764b..8c080f6c585e 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -1174,6 +1174,10 @@ def _run_job() -> Tuple[str, str]: _train_booster, # type: ignore schema="data string", ) + # TODO: In Spark Connect, use `dataframe.mapInPandas(..., barrier=True)` + # and remove `rdd.barrier().mapPartitions(lambda x: x)` + # and for stage scheduling, similarly, use + # `dataframe.mapInPandas(..., profile=...)` to set the resource profile. .rdd.barrier() .mapPartitions(lambda x: x) ) @@ -1384,6 +1388,8 @@ def _run_on_gpu(self) -> bool: use_gpu_by_params = super()._run_on_gpu() + # TODO: To support Spark Connect, we can't use any SparkContext APIs, + # and we can't read any Spark configurations. Remove these usages. if _is_local(_get_spark_session().sparkContext): # if it's local model, no need to check the spark configurations return use_gpu_by_params