@@ -2,21 +2,40 @@ package slurm
22
33import (
44 "context"
5+ "fmt"
56 "log"
67 "strings"
78
89 "github.com/nod-ai/ADA/redfish-exporter/metrics"
910)
1011
const (
	// Drain is the action name that performEventAction compares a request's
	// action against to decide whether the node should be drained.
	Drain = "DrainNode"

	// ExcludeReasonSet is the marker substring searched for in the text of a
	// drain error. When it is present, performEventAction logs that the node
	// was not drained and reports no failure to its caller.
	ExcludeReasonSet = "DRAIN_EXCLUDE_REASON_SET"

	// ExlcudeReasonSet is the original, misspelled name of ExcludeReasonSet.
	// It is kept so that existing references continue to compile.
	//
	// Deprecated: use ExcludeReasonSet instead.
	ExlcudeReasonSet = ExcludeReasonSet
)
1416
// AddEventReq is the public description of an event handed to
// SlurmQueue.Add. Add copies every field, one to one, into the queue's
// internal request type.
type AddEventReq struct {
	// RedfishServerIP is carried along with the request; SlurmNodeName is the
	// node acted upon. A request whose node name is empty or only whitespace
	// is ignored when it is processed.
	RedfishServerIP, SlurmNodeName string

	// Severity becomes part of the drain reason string. Action selects what
	// is done with the node; it is compared against the Drain constant.
	Severity, Action string

	// DrainReasonPrefix, MessageId and Message are, together with Severity,
	// the components of the drain reason string.
	DrainReasonPrefix, MessageId, Message string

	// ExcludeStr and ScontrolPath are passed through unchanged to
	// DrainNodeWithScontrol.
	// NOTE(review): presumably an exclusion pattern and the path of the
	// scontrol binary; confirm against DrainNodeWithScontrol.
	ExcludeStr, ScontrolPath string
}
28+
// eventsActionReq is the internal form of an event travelling through the
// SlurmQueue channel. SlurmQueue.Add fills it from an AddEventReq and
// performEventAction consumes it.
type eventsActionReq struct {
	// redfishServerIP is carried along with the request; slurmNodeName is the
	// node acted upon (blank names cause the request to be ignored).
	redfishServerIP, slurmNodeName string

	// severity feeds the drain reason string; action is compared against the
	// Drain constant.
	severity, action string

	// The remaining components of the drain reason string.
	drainReasonPrefix, messageId, message string

	// Passed through unchanged to DrainNodeWithScontrol.
	excludeStr, scontrolPath string
}
2140
2241type SlurmQueue struct {
@@ -28,12 +47,17 @@ func InitSlurmQueue(ctx context.Context) *SlurmQueue {
2847 return & SlurmQueue {ctx : ctx , queue : make (chan * eventsActionReq )}
2948}
3049
31- func (q * SlurmQueue ) Add (redfishServerIP , slurmNodeName , severity , action string ) {
50+ func (q * SlurmQueue ) Add (evt AddEventReq ) {
3251 q .queue <- & eventsActionReq {
33- redfishServerIP : redfishServerIP ,
34- slurmNodeName : slurmNodeName ,
35- severity : severity ,
36- action : action ,
52+ redfishServerIP : evt .RedfishServerIP ,
53+ slurmNodeName : evt .SlurmNodeName ,
54+ severity : evt .Severity ,
55+ action : evt .Action ,
56+ drainReasonPrefix : evt .DrainReasonPrefix ,
57+ messageId : evt .MessageId ,
58+ message : evt .Message ,
59+ excludeStr : evt .ExcludeStr ,
60+ scontrolPath : evt .ScontrolPath ,
3761 }
3862}
3963
@@ -65,19 +89,24 @@ func (q *SlurmQueue) ProcessEventActionQueue() {
6589 }
6690}
6791
// getDrainReasonString builds the colon-separated reason text attached to a
// drain request:
//
//	<prefix>:redfishlistener:<severity>:<msgId>:<msg>
//
// Note that the output order (severity, msgId, msg) differs from the
// parameter order (msg, msgId, severity). No escaping is applied, so colons
// inside any argument appear verbatim in the result.
func getDrainReasonString(prefix, msg, msgId, severity string) string {
	return fmt.Sprintf("%s:redfishlistener:%s:%s:%s", prefix, severity, msgId, msg)
}
96+
6897func (q * SlurmQueue ) performEventAction (req * eventsActionReq ) error {
6998 if len (strings .TrimSpace (req .slurmNodeName )) == 0 {
7099 return nil
71100 }
72101
73- slurmClient := GetClient ()
74- if slurmClient == nil {
75- return nil
76- }
77-
78102 if req .action == Drain {
79- err := slurmClient .DrainNode (req .slurmNodeName )
103+ reason := getDrainReasonString (req .drainReasonPrefix , req .message , req .messageId , req .severity )
104+ err := DrainNodeWithScontrol (req .slurmNodeName , reason , req .excludeStr , req .scontrolPath )
80105 if err != nil {
106+ if strings .Contains (err .Error (), ExlcudeReasonSet ) {
107+ log .Printf ("Node not drained: %v" , err .Error ())
108+ return nil
109+ }
81110 log .Printf ("Error draining node: %v" , err )
82111 return err
83112 }
0 commit comments