make plank settings configurable
Signed-off-by: Tim Ramlot <[email protected]>
inteon committed Jul 1, 2024
1 parent 6a85753 commit 5cc1fc8
Showing 5 changed files with 72 additions and 21 deletions.
29 changes: 29 additions & 0 deletions pkg/config/config.go
@@ -655,6 +655,17 @@ type Plank struct {
// stuck in an unscheduled state. Defaults to 5 minutes.
PodUnscheduledTimeout *metav1.Duration `json:"pod_unscheduled_timeout,omitempty"`

// MaxRetries is the maximum number of times a prowjob will be retried before
// being marked as failed. Defaults to 3. A value of 0 means no retries.
MaxRetries *int `json:"max_retries,omitempty"`

// NodeTerminationReasons is a set of reasons that the controller matches against
// to determine whether a node is being terminated. If a node is being terminated,
// the controller will restart the prowjob, unless the ErrorOnTermination option is set
// on the prowjob or the MaxRetries limit is reached.
// Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
NodeTerminationReasons []string `json:"node_termination_reasons,omitempty"`

// DefaultDecorationConfigs holds the default decoration config for specific values.
//
// Each entry in the slice specifies Repo and Cluster regexp filter fields to
@@ -2495,6 +2506,24 @@ func parseProwConfig(c *Config) error {
c.Plank.PodUnscheduledTimeout = &metav1.Duration{Duration: 5 * time.Minute}
}

if c.Plank.MaxRetries == nil {
maxRetries := 3
c.Plank.MaxRetries = &maxRetries
}

if c.Plank.NodeTerminationReasons == nil {
c.Plank.NodeTerminationReasons = []string{
// If the node no longer exists and the pod gets garbage collected,
// this condition will be set:
// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
"DeletionByPodGC",
// On GCP, before a new spot instance is started, the old pods are garbage
// collected (if they have not been already by the Kubernetes PodGC):
// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
"DeletionByGCPControllerManager",
}
}

if err := c.Gerrit.DefaultAndValidate(); err != nil {
return fmt.Errorf("validating gerrit config: %w", err)
}
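
For readers skimming the diff, the following is a minimal standalone sketch (not part of this commit) of the defaulting pattern the parseProwConfig hunk above introduces. The simplified plankDefaults struct and applyDefaults helper are hypothetical stand-ins for config.Plank and parseProwConfig, but the values mirror the hunk: a nil MaxRetries becomes 3 and a nil NodeTerminationReasons gets the two documented reasons, while an explicit 0 (no retries) is preserved. This is also why the field is a *int rather than an int: a plain int could not distinguish an explicit 0 from an omitted value.

package main

import "fmt"

// plankDefaults mirrors only the two new fields from pkg/config/config.go;
// this simplified struct is illustrative, not the real config.Plank type.
type plankDefaults struct {
	MaxRetries             *int
	NodeTerminationReasons []string
}

// applyDefaults mimics the nil checks added to parseProwConfig above.
func applyDefaults(p *plankDefaults) {
	if p.MaxRetries == nil {
		maxRetries := 3
		p.MaxRetries = &maxRetries
	}
	if p.NodeTerminationReasons == nil {
		p.NodeTerminationReasons = []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
	}
}

func main() {
	// Left unset: both fields pick up the documented defaults.
	var unset plankDefaults
	applyDefaults(&unset)
	fmt.Println(*unset.MaxRetries, unset.NodeTerminationReasons) // 3 [DeletionByPodGC DeletionByGCPControllerManager]

	// Explicitly set to 0 (disable retries): the pointer keeps this distinct from "unset".
	zero := 0
	explicit := plankDefaults{MaxRetries: &zero}
	applyDefaults(&explicit)
	fmt.Println(*explicit.MaxRetries) // 0
}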
16 changes: 16 additions & 0 deletions pkg/config/config_test.go
@@ -8417,6 +8417,10 @@ moonraker:
client_timeout: 10m0s
plank:
max_goroutines: 20
max_retries: 3
node_termination_reasons:
- DeletionByPodGC
- DeletionByGCPControllerManager
pod_pending_timeout: 10m0s
pod_running_timeout: 48h0m0s
pod_unscheduled_timeout: 5m0s
@@ -8501,6 +8505,10 @@ moonraker:
client_timeout: 10m0s
plank:
max_goroutines: 20
max_retries: 3
node_termination_reasons:
- DeletionByPodGC
- DeletionByGCPControllerManager
pod_pending_timeout: 10m0s
pod_running_timeout: 48h0m0s
pod_unscheduled_timeout: 5m0s
@@ -8578,6 +8586,10 @@ moonraker:
client_timeout: 10m0s
plank:
max_goroutines: 20
max_retries: 3
node_termination_reasons:
- DeletionByPodGC
- DeletionByGCPControllerManager
pod_pending_timeout: 10m0s
pod_running_timeout: 48h0m0s
pod_unscheduled_timeout: 5m0s
@@ -8660,6 +8672,10 @@ moonraker:
client_timeout: 10m0s
plank:
max_goroutines: 20
max_retries: 3
node_termination_reasons:
- DeletionByPodGC
- DeletionByGCPControllerManager
pod_pending_timeout: 10m0s
pod_running_timeout: 48h0m0s
pod_unscheduled_timeout: 5m0s
10 changes: 10 additions & 0 deletions pkg/config/prow-config-documented.yaml
@@ -1013,6 +1013,16 @@ plank:
# JobURLPrefixDisableAppendStorageProvider disables that the storageProvider is
# automatically appended to the JobURLPrefix.
jobURLPrefixDisableAppendStorageProvider: true
# MaxRetries is the maximum number of times a prowjob will be retried before
# being marked as failed. Defaults to 3. A value of 0 means no retries.
max_retries: 0
# NodeTerminationReasons is a set of reasons that the controller matches against
# to determine whether a node is being terminated. If a node is being terminated,
# the controller will restart the prowjob, unless the ErrorOnTermination option is set
# on the prowjob or the MaxRetries limit is reached.
# Defaults to ["DeletionByPodGC", "DeletionByGCPControllerManager"].
node_termination_reasons:
- ""
# PodPendingTimeout defines how long the controller will wait to perform a garbage
# collection on pending pods. Defaults to 10 minutes.
pod_pending_timeout: 0s
15 changes: 11 additions & 4 deletions pkg/plank/controller_test.go
@@ -65,6 +65,11 @@ const (
podDeletionPreventionFinalizer = "keep-from-vanishing"
)

var (
maxRetries = 3
nodeTerminationReasons = []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
)

func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[string]int) *fca {
presubmits := []config.Presubmit{
{
@@ -102,10 +107,12 @@ func newFakeConfigAgent(t *testing.T, maxConcurrency int, queueCapacities map[st
MaxConcurrency: maxConcurrency,
MaxGoroutines: 20,
},
JobQueueCapacities: queueCapacities,
PodPendingTimeout: &metav1.Duration{Duration: podPendingTimeout},
PodRunningTimeout: &metav1.Duration{Duration: podRunningTimeout},
PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
JobQueueCapacities: queueCapacities,
PodPendingTimeout: &metav1.Duration{Duration: podPendingTimeout},
PodRunningTimeout: &metav1.Duration{Duration: podRunningTimeout},
PodUnscheduledTimeout: &metav1.Duration{Duration: podUnscheduledTimeout},
MaxRetries: &maxRetries,
NodeTerminationReasons: nodeTerminationReasons,
},
},
JobConfig: config.JobConfig{
23 changes: 6 additions & 17 deletions pkg/plank/reconciler.go
@@ -21,6 +21,7 @@ import (
"encoding/json"
"errors"
"fmt"
"slices"
"strings"
"sync"
"time"
@@ -60,8 +61,6 @@ import (

const ControllerName = "plank"

const MaxPodRetries = 3

// PodStatus constants
const (
Evicted = "Evicted"
@@ -464,7 +463,7 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
pj.Status.PodName = pn
r.log.WithFields(pjutil.ProwJobFields(pj)).Info("Pod is missing, starting a new pod")
}
} else if transientFailure := getTransientFailure(pod); transientFailure != PodTransientFailureNone {
} else if transientFailure := getTransientFailure(pod, r.config().Plank.NodeTerminationReasons); transientFailure != PodTransientFailureNone {
switch {
case transientFailure == PodTransientFailureEvicted && pj.Spec.ErrorOnEviction:
// ErrorOnEviction is enabled, complete the PJ and mark it as errored.
@@ -478,12 +477,12 @@ func (r *reconciler) syncPendingJob(ctx context.Context, pj *prowv1.ProwJob) (*r
pj.SetComplete()
pj.Status.State = prowv1.ErrorState
pj.Status.Description = "Job pod's node was terminated."
case pj.Status.RetryCount >= MaxPodRetries:
case pj.Status.RetryCount >= *r.config().Plank.MaxRetries:
// MaxPodRetries is reached, complete the PJ and mark it as errored.
r.log.WithField("transient-failure", transientFailure).WithFields(pjutil.ProwJobFields(pj)).Info("Pod Node reached max retries, fail job.")
pj.SetComplete()
pj.Status.State = prowv1.ErrorState
pj.Status.Description = fmt.Sprintf("Job pod reached max retries (%d) for transient failure %s", MaxPodRetries, transientFailure)
pj.Status.Description = fmt.Sprintf("Job pod reached max retries (%d) for transient failure %s", pj.Status.RetryCount, transientFailure)
default:
// Update the retry count and delete the pod so it gets recreated in the next resync.
pj.Status.RetryCount++
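
The decision order in this hunk can be hard to follow inside the diff, so here is a hedged, self-contained sketch of how a terminated-node failure is resolved: error out if the job opts in via ErrorOnTermination, error out once the retry budget is spent, otherwise bump the retry count and delete the pod. The decideOnTermination helper and its plain bool/int parameters are simplifications for illustration, not the controller's real types.

package main

import "fmt"

// decideOnTermination is a simplified stand-in for the switch above: it takes the
// job's opt-in flag and retry counters instead of a ProwJob and returns the action.
func decideOnTermination(errorOnTermination bool, retryCount, maxRetries int) string {
	switch {
	case errorOnTermination:
		return "error: job pod's node was terminated"
	case retryCount >= maxRetries:
		return fmt.Sprintf("error: job pod reached max retries (%d)", maxRetries)
	default:
		return "retry: bump retry count and delete the pod so it is recreated on resync"
	}
}

func main() {
	fmt.Println(decideOnTermination(false, 0, 3)) // retry: bump retry count and delete the pod so it is recreated on resync
	fmt.Println(decideOnTermination(false, 3, 3)) // error: job pod reached max retries (3)
	fmt.Println(decideOnTermination(true, 0, 3))  // error: job pod's node was terminated
}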
@@ -666,7 +665,7 @@ const (
PodTransientFailureUnreachable PodTransientFailure = "unreachable"
)

func getTransientFailure(pod *corev1.Pod) PodTransientFailure {
func getTransientFailure(pod *corev1.Pod, nodeTerminationReasons []string) PodTransientFailure {
if pod.Status.Reason == Evicted {
return PodTransientFailureEvicted
}
@@ -679,17 +678,7 @@ func getTransientFailure(pod *corev1.Pod) PodTransientFailure {
}

for _, condition := range pod.Status.Conditions {
// If the node does no longer exist and the pod gets garbage collected,
// this condition will be set:
// https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#pod-disruption-conditions
if condition.Reason == "DeletionByPodGC" {
return PodTransientFailureTerminated
}

// On GCP, before a new spot instance is started, the old pods are garbage
// collected (if they have not been already by the Kubernetes PodGC):
// https://github.com/kubernetes/cloud-provider-gcp/blob/25e5dcc715781316bc5e39f8b17c0d5b313453f7/cmd/gcp-controller-manager/node_csr_approver.go#L1035-L1058
if condition.Reason == "DeletionByGCPControllerManager" {
if slices.Contains(nodeTerminationReasons, condition.Reason) {
return PodTransientFailureTerminated
}
}
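
To make the new configurability concrete, here is a small standalone sketch (assuming the k8s.io/api module; the nodeTerminated helper and the test pod are hypothetical, not the reconciler itself) of the slices.Contains check above: any pod condition whose Reason appears in plank.node_termination_reasons is treated as a node termination. The DisruptionTarget condition type is only an illustrative choice; the check above looks solely at the reason.

package main

import (
	"fmt"
	"slices"

	corev1 "k8s.io/api/core/v1"
)

// nodeTerminated reports whether any pod condition carries one of the configured
// termination reasons, mirroring the loop in getTransientFailure above.
func nodeTerminated(pod *corev1.Pod, reasons []string) bool {
	for _, condition := range pod.Status.Conditions {
		if slices.Contains(reasons, condition.Reason) {
			return true
		}
	}
	return false
}

func main() {
	reasons := []string{"DeletionByPodGC", "DeletionByGCPControllerManager"}
	pod := &corev1.Pod{Status: corev1.PodStatus{Conditions: []corev1.PodCondition{{
		Type:   corev1.DisruptionTarget, // condition set by the Kubernetes PodGC
		Reason: "DeletionByPodGC",
	}}}}
	fmt.Println(nodeTerminated(pod, reasons)) // true
	fmt.Println(nodeTerminated(pod, nil))     // false: an empty list disables the match
}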
