# Patch for src/k8/k8s-neuron-device-plugin.yml (blob 2370d64c -> 496fb2cb, mode 100644).
#
# Hunk 1 (@@ -25,6 +25,10 @@): directly after the existing
#   `aws.amazon.com/neuron` (operator Exists, effect NoSchedule) entry, adds a
#   second entry with key `sagemaker.amazonaws.com/node-health-status`,
#   operator `Equal`, value `Unschedulable`, effect `NoSchedule`.
#   NOTE(review): the parent key is outside the hunk; the key/operator/effect
#   shape suggests the pod's `tolerations` list - confirm against the full file.
#
# Hunk 2 (@@ -65,6 +69,9 @@): appends `ml.trn1.2xlarge`, `ml.trn1.32xlarge`
#   and `ml.trn1n.32xlarge` after the existing `trn1.*` / `trn1n.*` entries,
#   just before `containers:`.
#   NOTE(review): the list's key is outside the hunk; presumably the
#   instance-type values of a node selector/affinity rule - confirm.
#
# NOTE(review): the patch below is stored on a single physical line; the
#   original line breaks and indentation appear to have been collapsed to
#   single spaces. It will need to be regenerated from the two blobs (or
#   re-expanded by hand against the target file) before `git apply` can use it.
diff --git a/src/k8/k8s-neuron-device-plugin.yml b/src/k8/k8s-neuron-device-plugin.yml index 2370d64c..496fb2cb 100644 --- a/src/k8/k8s-neuron-device-plugin.yml +++ b/src/k8/k8s-neuron-device-plugin.yml @@ -25,6 +25,10 @@ spec: - key: aws.amazon.com/neuron operator: Exists effect: NoSchedule + - key: sagemaker.amazonaws.com/node-health-status + operator: Equal + value: Unschedulable + effect: NoSchedule # Mark this pod as a critical add-on; when enabled, the critical add-on # scheduler reserves resources for critical add-on pods so that they can # be rescheduled after a failure. @@ -65,6 +69,9 @@ spec: - trn1.2xlarge - trn1.32xlarge - trn1n.32xlarge + - ml.trn1.2xlarge + - ml.trn1.32xlarge + - ml.trn1n.32xlarge containers: # Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin - image: public.ecr.aws/neuron/neuron-device-plugin:2.22.4.0