From 79c70d9b831fb76e259adbfdc71a8ff6fd9ea4bf Mon Sep 17 00:00:00 2001
From: Weichen Xu <weichen.xu@databricks.com>
Date: Wed, 4 Dec 2024 11:58:47 +0800
Subject: [PATCH] init

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
---
 python-package/xgboost/spark/core.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index 166acbe1764b..8c080f6c585e 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -1174,6 +1174,10 @@ def _run_job() -> Tuple[str, str]:
                     _train_booster,  # type: ignore
                     schema="data string",
                 )
+                # TODO: In Spark Connect, use `dataframe.mapInPandas(..., barrier=True)`
+                #  and remove `rdd.barrier().mapPartitions(lambda x: x)`.
+                #  Similarly, for stage-level scheduling, set the resource profile via
+                #  `dataframe.mapInPandas(..., profile=...)`.
                 .rdd.barrier()
                 .mapPartitions(lambda x: x)
             )
@@ -1384,6 +1388,8 @@ def _run_on_gpu(self) -> bool:
 
         use_gpu_by_params = super()._run_on_gpu()
 
+        # TODO: To support Spark Connect, we can't use any SparkContext APIs
+        #  or read any Spark configurations. Remove these usages.
         if _is_local(_get_spark_session().sparkContext):
             # if it's local model, no need to check the spark configurations
             return use_gpu_by_params