Implement clientmode #85
Changes from 6 commits
@@ -17,6 +17,7 @@
package org.apache.spark.deploy.armada

import org.apache.spark.SparkConf
import org.apache.spark.deploy.armada.Config._
import org.apache.spark.scheduler.cluster.SchedulerBackendUtils

/** Helper trait that encapsulates deployment mode-specific behavior for Armada Spark jobs.
@@ -45,6 +46,35 @@ trait DeploymentModeHelper {
    *   The total number of pods in the gang (executors + driver if applicable)
    */
  def getGangCardinality: Int

  /** Returns whether the driver runs inside the cluster (cluster mode) or externally (client mode).
    *
    * @return
    *   true if driver runs in cluster, false if driver runs externally
    */
  def isDriverInCluster: Boolean

  /** Returns whether executors should be proactively requested at startup.
    *
    * This is typically true for client mode with static allocation, where executors need to be
    * requested immediately since the driver is already running.
    *
    * @return
    *   true if executors should be proactively requested, false otherwise
    */
  def shouldProactivelyRequestExecutors: Boolean

  /** Returns the source for jobSetId based on deployment mode.
    *
    * In cluster mode, jobSetId comes from environment variable ARMADA_JOB_SET_ID. In client mode,
    * jobSetId comes from config or falls back to the application ID.
    *
Collaborator: I can't remember the reason why these are different between client and cluster mode. Is there a reason why we shouldn't just make them both come from the config/appId?

Author: Not sure about the specific reason, but in a general sense I don't see any reason for them to be handled differently based on mode. It should come from config regardless of mode.

Collaborator: I agree. Please make it so.
    * @param applicationId
    *   Spark application ID to use as fallback
    * @return
    *   Optional jobSetId string
    */
  def getJobSetIdSource(applicationId: String): Option[String]
}

/** Static allocation in cluster mode.
@@ -54,7 +84,7 @@ trait DeploymentModeHelper {
  * - The driver runs as a pod inside the cluster
  * - Gang cardinality includes both driver and executors
  */
-class StaticCluster(conf: SparkConf) extends DeploymentModeHelper {
+class StaticCluster(val conf: SparkConf) extends DeploymentModeHelper {
  override def getExecutorCount: Int = {
    SchedulerBackendUtils.getInitialTargetExecutorNumber(conf)
  }

@@ -63,6 +93,14 @@ class StaticCluster(conf: SparkConf) extends DeploymentModeHelper {
    // In cluster mode, include the driver pod in gang scheduling
    getExecutorCount + 1
  }

  override def isDriverInCluster: Boolean = true

  override def shouldProactivelyRequestExecutors: Boolean = false

  override def getJobSetIdSource(applicationId: String): Option[String] = {
    sys.env.get("ARMADA_JOB_SET_ID")
  }
}

/** Static allocation in client mode.
@@ -72,7 +110,7 @@ class StaticCluster(conf: SparkConf) extends DeploymentModeHelper {
  * - The driver runs on the client machine (outside the cluster)
  * - Gang cardinality includes only executors
  */
-class StaticClient(conf: SparkConf) extends DeploymentModeHelper {
+class StaticClient(val conf: SparkConf) extends DeploymentModeHelper {
  override def getExecutorCount: Int = {
    SchedulerBackendUtils.getInitialTargetExecutorNumber(conf)
  }

@@ -81,6 +119,14 @@ class StaticClient(conf: SparkConf) extends DeploymentModeHelper {
    // In client mode, driver runs externally, so only count executors
    getExecutorCount
  }

  override def isDriverInCluster: Boolean = false

  override def shouldProactivelyRequestExecutors: Boolean = true

  override def getJobSetIdSource(applicationId: String): Option[String] = {
    conf.get(ARMADA_JOB_SET_ID).orElse(Some(applicationId))
  }
}

/** Dynamic allocation in cluster mode.
@@ -91,7 +137,7 @@ class StaticClient(conf: SparkConf) extends DeploymentModeHelper {
  * - Initial allocation uses the configured minimum executor count
  * - Gang cardinality includes both driver and minimum executors
  */
-class DynamicCluster(conf: SparkConf) extends DeploymentModeHelper {
+class DynamicCluster(val conf: SparkConf) extends DeploymentModeHelper {
  override def getExecutorCount: Int = {
    // For dynamic allocation, use minExecutors as the initial count
    conf.getInt(

@@ -104,6 +150,14 @@ class DynamicCluster(conf: SparkConf) extends DeploymentModeHelper {
    // In cluster mode, include the driver pod in gang scheduling
    getExecutorCount + 1
  }

  override def isDriverInCluster: Boolean = true

  override def shouldProactivelyRequestExecutors: Boolean = false

  override def getJobSetIdSource(applicationId: String): Option[String] = {
    sys.env.get("ARMADA_JOB_SET_ID")
  }
}

/** Dynamic allocation in client mode.
@@ -114,7 +168,7 @@ class DynamicCluster(conf: SparkConf) extends DeploymentModeHelper {
  * - Initial allocation uses the configured minimum executor count
  * - Gang cardinality includes only minimum executors
  */
-class DynamicClient(conf: SparkConf) extends DeploymentModeHelper {
+class DynamicClient(val conf: SparkConf) extends DeploymentModeHelper {
  override def getExecutorCount: Int = {
    // For dynamic allocation, use minExecutors as the initial count
    conf.getInt(

@@ -127,6 +181,14 @@ class DynamicClient(conf: SparkConf) extends DeploymentModeHelper {
    // In client mode, driver runs externally, so only count executors
    getExecutorCount
  }

  override def isDriverInCluster: Boolean = false

  override def shouldProactivelyRequestExecutors: Boolean = false

  override def getJobSetIdSource(applicationId: String): Option[String] = {
    conf.get(ARMADA_JOB_SET_ID).orElse(Some(applicationId))
  }
}

/** Factory for creating ModeHelper instances based on Spark configuration.
Collaborator: I'd really prefer the rand to happen inside our Scala code. The default should be random; if the user wants to override it for some reason, they can, but I don't want us to have to do this here.

Author: Made some changes to fix this.
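Following the thread above, one possible shape for the fix (hypothetical: the factory body is not shown in this hunk, and "spark.armada.jobSetId" is an assumed key name): select the helper from the deploy mode and allocation settings, and generate the random jobSetId default inside the Scala code so callers no longer have to:

```scala
import java.util.UUID
import org.apache.spark.SparkConf

object DeploymentModeHelperSketch {
  // Dispatch on allocation type and deploy mode; the real factory may differ.
  def forConf(conf: SparkConf): DeploymentModeHelper = {
    val dynamic = conf.getBoolean("spark.dynamicAllocation.enabled", defaultValue = false)
    val cluster = conf.get("spark.submit.deployMode", "client") == "cluster"
    (dynamic, cluster) match {
      case (false, true)  => new StaticCluster(conf)
      case (false, false) => new StaticClient(conf)
      case (true, true)   => new DynamicCluster(conf)
      case (true, false)  => new DynamicClient(conf)
    }
  }

  // Config wins if set; otherwise fall back to a random default generated
  // in Scala, per the review request. The key name is illustrative only.
  def defaultJobSetId(conf: SparkConf): String =
    conf
      .getOption("spark.armada.jobSetId")
      .getOrElse(s"spark-${UUID.randomUUID().toString.take(8)}")
}
```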