@@ -570,6 +570,8 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
570570 return self ._describe_sacct (app_id )
571571
572572 def _describe_sacct (self , app_id : str ) -> Optional [DescribeAppResponse ]:
573+ # NOTE: sacct reports JobID in several shapes: "<id>+<offset>" for the components of a
574+ # heterogeneous job and "<id>.<step>" for job steps (e.g. ".batch", ".0"); both are handled here.
573575 try :
574576 output = subprocess .check_output (
575577 ["sacct" , "--parsable2" , "-j" , app_id ],
@@ -594,15 +596,27 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
594596 msg = ""
595597 app_state = AppState .UNKNOWN
596598 for row in reader :
597- job_id , * parts = row ["JobID" ].split ("+" )
599+ # JobID is "<id>+<offset>[.<step>]" for heterogeneous jobs and "<id>[.<step>]" otherwise
600+ job_id_full = row ["JobID" ]
601+
602+ # Split on "+" when present (heterogeneous job); otherwise split on "." (job step suffix)
603+ if "+" in job_id_full :
604+ job_id , * parts = job_id_full .split ("+" )
605+ is_subjob = len (parts ) > 0 and "." in parts [0 ]
606+ else :
607+ job_id , * parts = job_id_full .split ("." )
608+ is_subjob = len (parts ) > 0
609+
598610 if job_id != app_id :
599611 continue
600- if len (parts ) > 0 and "." in parts [0 ]:
601- # we only care about the worker not the child jobs
612+
613+ if is_subjob :
614+ # we only care about the main job not the child jobs (.batch, .0, etc.)
602615 continue
603616
604- state = row ["State" ]
605- msg = state
617+ msg = row ["State" ]
618+ # Remove truncation indicator (CANCELLED+) and extract base state from verbose formats
619+ state = msg .split ()[0 ].rstrip ("+" )
606620 app_state = appstate_from_slurm_state (state )
607621
608622 role , _ , replica_id = row ["JobName" ].rpartition ("-" )
@@ -629,6 +643,9 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
629643 )
630644
631645 def _describe_squeue (self , app_id : str ) -> Optional [DescribeAppResponse ]:
646+ # NOTE: This method contains multiple compatibility checks for different SLURM versions
647+ # due to API format changes across versions (20.02, 23.02, 24.05, 24.11+).
648+
632649 # squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
633650 # if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
634651 output = subprocess .check_output (
@@ -670,7 +687,18 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
670687 if state == AppState .PENDING :
671688 # NOTE: torchx launched jobs points to exactly one host
672689 # otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
673- hostname = job_resources .get ("scheduled_nodes" , "" )
690+
691+ # SLURM 24.11.5+ returns job_resources=None for pending jobs (issue #1101)
692+ if job_resources is not None :
693+ hostname = job_resources .get ("scheduled_nodes" , "" )
694+ # If scheduled_nodes not found in job_resources, try nodes.list
695+ if not hostname and "nodes" in job_resources :
696+ nodes_info = job_resources .get ("nodes" , {})
697+ if isinstance (nodes_info , dict ):
698+ hostname = nodes_info .get ("list" , "" )
699+ else :
700+ # For pending jobs where job_resources is None, check top-level fields
701+ hostname = job .get ("nodes" , "" ) or job .get ("scheduled_nodes" , "" )
674702
675703 role .num_replicas += 1
676704 role_status .replicas .append (
@@ -686,24 +714,35 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
686714 # where each replica is a "sub-job" so `allocated_nodes` will always be 1
687715 # but we deal with jobs that have not been launched with torchx
688716 # which can have multiple hosts per sub-job (count them as replicas)
689- node_infos = job_resources .get ("allocated_nodes" , [])
717+ nodes_data = job_resources .get ("nodes" , {})
718+
719+ # SLURM 24.11+ changed from allocated_nodes to nodes.allocation structure
720+ if "allocation" in nodes_data and isinstance (
721+ nodes_data ["allocation" ], list
722+ ):
723+ # SLURM 24.11+ format: nodes.allocation is a list
724+ for node_info in nodes_data ["allocation" ]:
725+ hostname = node_info ["name" ]
726+ cpu = int (node_info ["cpus" ]["used" ])
727+ memMB = (
728+ int (node_info ["memory" ]["allocated" ]) // 1024
729+ ) # Convert to MB
690730
691- if not isinstance (node_infos , list ):
692- # NOTE: in some versions of slurm jobs[].job_resources.allocated_nodes
693- # is not a list of individual nodes, but a map of the nodelist specs
694- # in this case just use jobs[].job_resources.nodes
695- hostname = job_resources .get ("nodes" )
696- role .num_replicas += 1
697- role_status .replicas .append (
698- ReplicaStatus (
699- id = int (replica_id ),
700- role = role_name ,
701- state = state ,
702- hostname = hostname ,
731+ role .resource = Resource (cpu = cpu , memMB = memMB , gpu = - 1 )
732+ role .num_replicas += 1
733+ role_status .replicas .append (
734+ ReplicaStatus (
735+ id = int (replica_id ),
736+ role = role_name ,
737+ state = state ,
738+ hostname = hostname ,
739+ )
703740 )
704- )
705- else :
706- for node_info in node_infos :
741+ elif "allocated_nodes" in job_resources and isinstance (
742+ job_resources ["allocated_nodes" ], list
743+ ):
744+ # Legacy format: allocated_nodes is a list
745+ for node_info in job_resources ["allocated_nodes" ]:
707746 # NOTE: we expect resource specs for all the nodes to be the same
708747 # NOTE: use allocated (not used/requested) memory since
709748 # users may only specify --cpu, in which case slurm
@@ -726,6 +765,26 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
726765 hostname = hostname ,
727766 )
728767 )
768+ else :
769+ # Fallback: "nodes" is either a plain nodelist string or a dict holding the nodelist under "list"
770+ if isinstance (nodes_data , str ):
771+ hostname = nodes_data
772+ else :
773+ hostname = (
774+ nodes_data .get ("list" , "" )
775+ if isinstance (nodes_data , dict )
776+ else ""
777+ )
778+
779+ role .num_replicas += 1
780+ role_status .replicas .append (
781+ ReplicaStatus (
782+ id = int (replica_id ),
783+ role = role_name ,
784+ state = state ,
785+ hostname = hostname ,
786+ )
787+ )
729788
730789 return DescribeAppResponse (
731790 app_id = app_id ,
0 commit comments