Skip to content

Commit 034e2ad

Browse files
Update the code to use scontrol in place of slurm APIs to drain the
node to make it slurm version independent
1 parent 16438ef commit 034e2ad

File tree

6 files changed

+331
-57
lines changed

6 files changed

+331
-57
lines changed

redfish-exporter/.env

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
1-
UPDATED="2024-09-24"
1+
UPDATED="2025-01-22"
22
DESCRIPTION="Redfish Event Listener/Exporter"
3-
LISTENER_IP="0.0.0.0"
4-
LISTENER_PORT="8080"
5-
METRICS_PORT="2112"
3+
LISTENER_IP="<Listener_IP>"
4+
LISTENER_PORT="<PORT>"
5+
METRICS_PORT="<METRICS_PORT>"
66
USE_SSL="false"
77
CERTFILE="path/to/certfile"
88
KEYFILE="path/to/keyfile"
9-
SLURM_USER="slurm user here"
10-
SLURM_TOKEN="token string here, from secret when for real"
11-
SLURM_CONTROL_NODE="slurm control node IP:Port"
9+
SLURM_CONTROL_NODE="<SLURM_CONTROL_NODE_IP>"
10+
# List of '|' separated reasons; the drain action is skipped if there is a match
11+
SLURM_DRAIN_EXCLUDE_REASON_LIST="reason 1|reason 2"
12+
SLURM_SCONTROL_PATH="/usr/bin/scontrol"
1213

14+
# Match received RAS events based on severity and a '|' separated list of message fields, then perform the drain action with DrainReasonPrefix set as the prefix of the drain reason
15+
# Message can be left empty if it doesn't need to be matched against; in that case only severity is matched
16+
# only DrainNode action is supported for now
1317
TRIGGER_EVENTS="[\
14-
{\"Severity\":\"Fatal\",\"Action\":\"DrainNode\"},\
15-
{\"Severity\":\"Critical\",\"Action\":\"DrainNode\"}
18+
{\"Severity\":\"Critical\",\"Message\":\"message 1|This is a critical test event\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNeeded\"},\
19+
{\"Severity\":\"Info\",\"Message\":\"message 3\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"},\
20+
{\"Severity\":\"Warning\",\"Message\":\"message 4|This is a test event message\",\"Action\":\"DrainNode\", \"DrainReasonPrefix\":\"RebootNotNeeded\"}
1621
]"
1722

1823
# Subscription (v1.5+)
@@ -28,8 +33,8 @@ TRIGGER_EVENTS="[\
2833

2934
# Deprecated <v1.5
3035
SUBSCRIPTION_PAYLOAD="{\
31-
\"Destination\":\"http://host.docker.internal:8080\",\
32-
\"EventTypes\":[\"Alert\",\"StatusChange\"],\
36+
\"Destination\":\"http://<Listener_IP:Port>\",\
37+
\"EventTypes\":[\"Alert\"],\
3338
\"Protocol\":\"Redfish\",\
3439
\"Context\":\"YourContextData\",\
3540
\"Oem\":{\"Supermicro\":{\"EnableSubscription\":true}}\
@@ -41,5 +46,5 @@ PROMETHEUS_CONFIG="{\
4146
}"
4247

4348
REDFISH_SERVERS="[\
44-
{\"ip\":\"http://127.0.0.1:8000\",\"username\":\"Username1\",\"password\":\"Password1\",\"loginType\":\"Session\",\"slurmNode\":\"Node1\"}
45-
]"
49+
{\"ip\":\"https://<BMC_IP>\",\"username\":\"<username>\",\"password\":\"<password>\",\"loginType\":\"Session\",\"slurmNode\":\"<nodename>\"}
50+
]"

redfish-exporter/config.go

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,21 +49,25 @@ type Config struct {
4949
CertFile string
5050
KeyFile string
5151
}
52-
SlurmToken string
53-
SlurmControlNode string
54-
SlurmUser string
55-
SubscriptionPayload SubscriptionPayload
56-
RedfishServers []RedfishServer
57-
TriggerEvents []TriggerEvent
58-
PrometheusConfig PrometheusConfig
59-
context *tls.Config
60-
eventCount int
61-
dataBuffer []byte
52+
SlurmToken string
53+
SlurmControlNode string
54+
SlurmUser string
55+
SlurmScontrolPath string
56+
SlurmDrainExcludeStr string
57+
SubscriptionPayload SubscriptionPayload
58+
RedfishServers []RedfishServer
59+
TriggerEvents []TriggerEvent
60+
PrometheusConfig PrometheusConfig
61+
context *tls.Config
62+
eventCount int
63+
dataBuffer []byte
6264
}
6365

6466
type TriggerEvent struct {
65-
Severity string `json:"Severity"`
66-
Action string `json:"Action"`
67+
Severity string `json:"Severity"`
68+
Action string `json:"Action"`
69+
Message string `json:"Message"`
70+
DrainReasonPrefix string `json:"DrainReasonPrefix"`
6771
}
6872

6973
type PrometheusConfig struct {
@@ -119,6 +123,8 @@ func setupConfig() Config {
119123
AppConfig.SlurmToken = os.Getenv("SLURM_TOKEN")
120124
AppConfig.SlurmControlNode = os.Getenv("SLURM_CONTROL_NODE")
121125
AppConfig.SlurmUser = os.Getenv("SLURM_USER")
126+
AppConfig.SlurmDrainExcludeStr = os.Getenv("SLURM_DRAIN_EXCLUDE_REASON_LIST")
127+
AppConfig.SlurmScontrolPath = os.Getenv("SLURM_SCONTROL_PATH")
122128

123129
subscriptionPayloadJSON := os.Getenv("SUBSCRIPTION_PAYLOAD")
124130
if err := json.Unmarshal([]byte(subscriptionPayloadJSON), &AppConfig.SubscriptionPayload); err != nil {

redfish-exporter/listener.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"log"
2828
"net"
2929
"net/http"
30+
"regexp"
3031
"strings"
3132

3233
"github.com/nod-ai/ADA/redfish-exporter/metrics"
@@ -219,15 +220,34 @@ func (s *Server) processRequest(AppConfig Config, conn net.Conn, req *http.Reque
219220
log.Printf("Origin Of Condition: %s", originOfCondition)
220221
for _, triggerEvent := range AppConfig.TriggerEvents {
221222
if severity == triggerEvent.Severity {
222-
log.Printf("Matched Trigger Event: %s with action %s", triggerEvent.Severity, triggerEvent.Action)
223+
if triggerEvent.Message != "" {
224+
re := regexp.MustCompile(triggerEvent.Message)
225+
match := re.FindAllString(message, -1)
226+
227+
if len(match) == 0 {
228+
continue
229+
}
230+
}
231+
log.Printf("Matched Trigger Event: %s | message: %s | with action %s", triggerEvent.Severity, triggerEvent.Message, triggerEvent.Action)
223232
// Sending event belongs to redfish_utils. Each server may have different slurm node associated, and redfish_servers has the info/map.
224233
if s.slurmQueue != nil {
225234
redfishServerInfo := getServerInfoByIP(AppConfig.RedfishServers, ip)
226235
if len(strings.TrimSpace(redfishServerInfo.SlurmNode)) == 0 {
227236
log.Printf("failed to get the slurm node name, cannot perform action: %v", triggerEvent.Action)
228237
break
229238
}
230-
s.slurmQueue.Add(redfishServerInfo.IP, redfishServerInfo.SlurmNode, triggerEvent.Severity, triggerEvent.Action)
239+
evt := slurm.AddEventReq{
240+
RedfishServerIP: redfishServerInfo.IP,
241+
SlurmNodeName: redfishServerInfo.SlurmNode,
242+
Severity: triggerEvent.Severity,
243+
Action: triggerEvent.Action,
244+
DrainReasonPrefix: triggerEvent.DrainReasonPrefix,
245+
MessageId: messageId,
246+
Message: message,
247+
ExcludeStr: AppConfig.SlurmDrainExcludeStr,
248+
ScontrolPath: AppConfig.SlurmScontrolPath,
249+
}
250+
s.slurmQueue.Add(evt)
231251
}
232252
break
233253
}

redfish-exporter/main.go

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -53,17 +53,6 @@ func main() {
5353
defer cancel()
5454
var slurmQueue *slurm.SlurmQueue
5555
if *enableSlurm {
56-
if len(strings.TrimSpace(AppConfig.SlurmToken)) == 0 {
57-
log.Fatalf("Provide slurm token to enable slurm")
58-
}
59-
if len(strings.TrimSpace(AppConfig.SlurmControlNode)) == 0 {
60-
log.Fatalf("Provide slurm control node IP:Port to enable slurm")
61-
}
62-
_, err := slurm.NewClient(AppConfig.SlurmControlNode, AppConfig.SlurmUser, AppConfig.SlurmToken)
63-
if err != nil {
64-
log.Fatalf("failed to create slurm client, err: %+v", err)
65-
}
66-
6756
slurmQueue = slurm.InitSlurmQueue(ctx)
6857
go slurmQueue.ProcessEventActionQueue()
6958
}

redfish-exporter/slurm/queue.go

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,40 @@ package slurm
22

33
import (
44
"context"
5+
"fmt"
56
"log"
67
"strings"
78

89
"github.com/nod-ai/ADA/redfish-exporter/metrics"
910
)
1011

1112
const (
12-
Drain = "DrainNode"
13+
Drain = "DrainNode"
14+
ExlcudeReasonSet = "DRAIN_EXCLUDE_REASON_SET"
1315
)
1416

17+
type AddEventReq struct {
18+
RedfishServerIP string
19+
SlurmNodeName string
20+
Severity string
21+
Action string
22+
DrainReasonPrefix string
23+
MessageId string
24+
Message string
25+
ExcludeStr string
26+
ScontrolPath string
27+
}
28+
1529
type eventsActionReq struct {
16-
redfishServerIP string
17-
slurmNodeName string
18-
severity string
19-
action string
30+
redfishServerIP string
31+
slurmNodeName string
32+
severity string
33+
action string
34+
drainReasonPrefix string
35+
messageId string
36+
message string
37+
excludeStr string
38+
scontrolPath string
2039
}
2140

2241
type SlurmQueue struct {
@@ -28,12 +47,17 @@ func InitSlurmQueue(ctx context.Context) *SlurmQueue {
2847
return &SlurmQueue{ctx: ctx, queue: make(chan *eventsActionReq)}
2948
}
3049

31-
func (q *SlurmQueue) Add(redfishServerIP, slurmNodeName, severity, action string) {
50+
func (q *SlurmQueue) Add(evt AddEventReq) {
3251
q.queue <- &eventsActionReq{
33-
redfishServerIP: redfishServerIP,
34-
slurmNodeName: slurmNodeName,
35-
severity: severity,
36-
action: action,
52+
redfishServerIP: evt.RedfishServerIP,
53+
slurmNodeName: evt.SlurmNodeName,
54+
severity: evt.Severity,
55+
action: evt.Action,
56+
drainReasonPrefix: evt.DrainReasonPrefix,
57+
messageId: evt.MessageId,
58+
message: evt.Message,
59+
excludeStr: evt.ExcludeStr,
60+
scontrolPath: evt.ScontrolPath,
3761
}
3862
}
3963

@@ -65,19 +89,24 @@ func (q *SlurmQueue) ProcessEventActionQueue() {
6589
}
6690
}
6791

92+
func getDrainReasonString(prefix, msg, msgId, severity string) string {
93+
ret := fmt.Sprintf("%s:redfishlistener:%s:%s:%s", prefix, severity, msgId, msg)
94+
return ret
95+
}
96+
6897
func (q *SlurmQueue) performEventAction(req *eventsActionReq) error {
6998
if len(strings.TrimSpace(req.slurmNodeName)) == 0 {
7099
return nil
71100
}
72101

73-
slurmClient := GetClient()
74-
if slurmClient == nil {
75-
return nil
76-
}
77-
78102
if req.action == Drain {
79-
err := slurmClient.DrainNode(req.slurmNodeName)
103+
reason := getDrainReasonString(req.drainReasonPrefix, req.message, req.messageId, req.severity)
104+
err := DrainNodeWithScontrol(req.slurmNodeName, reason, req.excludeStr, req.scontrolPath)
80105
if err != nil {
106+
if strings.Contains(err.Error(), ExlcudeReasonSet) {
107+
log.Printf("Node not drained: %v", err.Error())
108+
return nil
109+
}
81110
log.Printf("Error draining node: %v", err)
82111
return err
83112
}

0 commit comments

Comments
 (0)