From b9d8f0e84277af9bd18d94d219ee232f66f510e4 Mon Sep 17 00:00:00 2001
From: Suraj Kota
Date: Fri, 25 Oct 2024 18:03:17 -0700
Subject: [PATCH] Add support for HyperPod nodes

Update the Neuron device plugin manifest in two places:

- Add a toleration for the taint
  sagemaker.amazonaws.com/node-health-status=Unschedulable:NoSchedule,
  next to the existing aws.amazon.com/neuron toleration.
- Extend the instance-type list that already contains the trn1 entries
  with their ml.-prefixed counterparts: ml.trn1.2xlarge,
  ml.trn1.32xlarge and ml.trn1n.32xlarge.

---
NOTE(review): this patch was restored from a copy whose line breaks and
leading whitespace had been collapsed. The YAML indentation below is a
reconstruction; confirm the context lines against
src/k8/k8s-neuron-device-plugin.yml at blob 2370d64c before applying.

 src/k8/k8s-neuron-device-plugin.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/k8/k8s-neuron-device-plugin.yml b/src/k8/k8s-neuron-device-plugin.yml
index 2370d64c..496fb2cb 100644
--- a/src/k8/k8s-neuron-device-plugin.yml
+++ b/src/k8/k8s-neuron-device-plugin.yml
@@ -25,6 +25,10 @@ spec:
       - key: aws.amazon.com/neuron
         operator: Exists
         effect: NoSchedule
+      - key: sagemaker.amazonaws.com/node-health-status
+        operator: Equal
+        value: Unschedulable
+        effect: NoSchedule
       # Mark this pod as a critical add-on; when enabled, the critical add-on
       # scheduler reserves resources for critical add-on pods so that they can
       # be rescheduled after a failure.
@@ -65,6 +69,9 @@ spec:
                       - trn1.2xlarge
                       - trn1.32xlarge
                       - trn1n.32xlarge
+                      - ml.trn1.2xlarge
+                      - ml.trn1.32xlarge
+                      - ml.trn1n.32xlarge
       containers:
         # Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin
       - image: public.ecr.aws/neuron/neuron-device-plugin:2.22.4.0