Skip to content

Commit dcef2cd

Browse files
authored
Update smci_mi300x_platform_amdgpu_alllogs_collection.sh (#70)
Improvements Added
1 parent c55008e commit dcef2cd

File tree

1 file changed

+150
-47
lines changed

1 file changed

+150
-47
lines changed
Lines changed: 150 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,182 @@
11
#!/bin/bash
22
#Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved.
3-
3+
44
# Function for logging with timestamps
55
# Write one line to stdout: the current local time, " - ", then the message.
# Arguments: $1 - message text
log() {
  printf '%s - %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$1"
}
8-
8+
99
# Function to check curl command success
1010
# Abort the script when the command run immediately before this call failed.
# It reads that command's exit status from $?, so it must be invoked directly
# after the command being checked.
# Arguments: $1 - message handed to log before exiting with status 1
check_curl_error() {
  # Capture the caller's status first; any later command would overwrite $?.
  local previous_status=$?

  ((previous_status == 0)) && return 0

  log "$1"
  exit 1
}
16-
16+
17+
# Wait for all UBB SMC tasks to complete
18+
# Block until every task listed by the BMC task service has completed.
#
# Reads the task count from the task collection, then polls each task
# (index 0 .. count-1) every INTERVAL seconds until it reports "Completed".
# Exits the script with status 1 when the task count cannot be read, when a
# task reports "Failed", or when a single task is still unfinished after
# TIMEOUT_SECONDS.
#
# Globals:   BMC_USERNAME, BMC_PASSWORD, BMC_IP (read)
# Arguments: none
# Outputs:   progress messages on stdout
tasksWait() {
  local -r TIMEOUT_SECONDS=$((25 * 60)) # 25 minutes, counted per task
  local -r INTERVAL=5
  local -r tasksUrl="https://${BMC_IP}/redfish/v1/Oem/Supermicro/MI300X/TaskService/Tasks"
  local tasks task elapsedTime STATUS

  # Ask the task collection how many tasks it currently holds.
  tasks=$(curl -k -s -u "${BMC_USERNAME}:${BMC_PASSWORD}" -X GET \
    -H 'Content-Type: application/json' -H 'Accept: application/json' \
    "${tasksUrl}" |
    python3 -c "import sys, json; print(json.load(sys.stdin)['Members@odata.count'])")

  log "Tasks: ${tasks}"

  # An empty or non-numeric count means the request or the JSON parsing
  # failed; treating that as "nothing to wait for" would hide the failure.
  if [[ ! ${tasks} =~ ^[0-9]+$ ]]; then
    log "Failed to read the number of tasks from the BMC."
    exit 1
  fi

  if [[ ${tasks} == "0" ]]; then
    return 0
  fi

  printf "\n"

  for ((task = 0; task < tasks; task++)); do
    elapsedTime=0

    # Poll the task status until it's completed or failed
    while true; do
      STATUS=$(curl -s -k -u "${BMC_USERNAME}:${BMC_PASSWORD}" \
        -H 'Content-Type: application/json' -H 'Accept: application/json' \
        "${tasksUrl}/${task}" |
        python3 -c "import sys, json; print(json.load(sys.stdin)['TaskState'])")

      case "${STATUS}" in
      "Completed")
        printf "\n%s" "Task ${task} completed successfully."
        break
        ;;
      "Failed")
        log "Task ${task} failed."
        exit 1
        ;;
      "Running" | "New" | "Pending")
        # \r rewrites the same terminal line on every poll.
        printf "\r%s" "Task ${task} still running, elapsed time ${elapsedTime}s"
        sleep "${INTERVAL}"
        elapsedTime=$((elapsedTime + INTERVAL))
        ;;
      *)
        # Also reached when the status request itself failed (empty STATUS);
        # keep polling until the timeout below gives up.
        printf "\r%s" "Unknown task status: $STATUS, elapsed time ${elapsedTime}s"
        sleep "${INTERVAL}"
        elapsedTime=$((elapsedTime + INTERVAL))
        ;;
      esac

      if ((elapsedTime > TIMEOUT_SECONDS)); then
        log "Task ${task} fails to complete in $((TIMEOUT_SECONDS / 60)) minutes."
        exit 1
      fi
    done

    printf "\n"
  done
}
75+
1776
# Prompt for BMC Username if not set
18-
if [ -z "$BMC_USERNAME" ]; then
77+
if [ -z "${BMC_USERNAME}" ]; then
1978
read -p "Enter BMC Username: " BMC_USERNAME
2079
fi
21-
80+
2281
# Prompt for BMC Password if not set
23-
if [ -z "$BMC_PASSWORD" ]; then
24-
read -sp "Enter BMC Password: " BMC_PASSWORD
82+
if [ -z "${BMC_PASSWORD}" ]; then
83+
read -s -p "Enter BMC Password: " BMC_PASSWORD
2584
echo
2685
fi
27-
86+
2887
# Prompt for BMC IP Address if not set and validate format
29-
while [[ ! "$BMC_IP" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; do
88+
while [[ ! ${BMC_IP} =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; do
3089
read -p "Enter valid BMC IP Address: " BMC_IP
3190
done
32-
91+
92+
# Check if the username and password are correct
93+
# Probe Managers/1 with the supplied credentials. The Python step exits
# non-zero when the reply is not parseable JSON or carries a top-level
# "error" key, and that status decides the outcome of the whole pipeline.
if ! curl -s -u "${BMC_USERNAME}:${BMC_PASSWORD}" -k "https://${BMC_IP}/redfish/v1/Managers/1" |
  python3 -c '
import sys, json

data = json.load(sys.stdin)

if "error" in data:
    sys.exit(1)

sys.exit(0)'; then
  echo "Authentication failed, check IP address, username, and password"
  echo "IP address used: ${BMC_IP}"
  echo "Username used: ${BMC_USERNAME}"
  exit 1
fi
110+
33111
# Step 1: Collect Diagnostic Data
34112
log "Collecting diagnostic data..."
35-
TASK_RESPONSE=$(curl -s -k -u "$BMC_USERNAME:$BMC_PASSWORD" \
36-
"https://$BMC_IP/redfish/v1/Oem/Supermicro/MI300X/Systems/UBB/LogServices/DiagLogs/Actions/LogService.CollectDiagnosticData" \
113+
taskResponse=$(curl -s -k -u "${BMC_USERNAME}:${BMC_PASSWORD}" \
114+
"https://${BMC_IP}/redfish/v1/Oem/Supermicro/MI300X/Systems/UBB/LogServices/DiagLogs/Actions/LogService.CollectDiagnosticData" \
37115
-X POST -d '{"DiagnosticDataType":"OEM", "OEMDiagnosticDataType" : "AllLogs"}')
38116
check_curl_error "Failed to collect diagnostic data."
39-
117+
40118
# Extract Task ID
41-
TASK_ID=$(echo "$TASK_RESPONSE" | grep -oP '(?<=Tasks/)[^"]*')
42-
log "Task ID: $TASK_ID"
43-
44-
# Step 2: Poll for Task Completion with timeout
45-
log "Checking task status..."
46-
TASK_STATE=""
47-
max_retries=40
48-
retry_count=0
49-
while [ "$TASK_STATE" != "Completed" ]; do
50-
if [ "$retry_count" -ge "$max_retries" ]; then
51-
log "Task did not complete within the expected timeframe."
52-
exit 1
53-
fi
54-
TASK_STATE=$(curl -s -k -u "$BMC_USERNAME:$BMC_PASSWORD" \
55-
-X GET "https://$BMC_IP/redfish/v1/Oem/Supermicro/MI300X/TaskService/Tasks/$TASK_ID" \
56-
| grep -oP '(?<=TaskState": ")[^"]*')
57-
log "Current State: $TASK_STATE"
58-
if [ "$TASK_STATE" != "Completed" ]; then
59-
log "Task in progress... waiting 30 seconds."
60-
sleep 30
61-
retry_count=$((retry_count + 1))
119+
# Keep whatever follows "Tasks/" (up to the next double quote) in the
# response returned by the collection request.
TASKS=$(echo "${taskResponse}" | grep -oP '(?<=Tasks/)[^"]*')

# No match means the BMC did not hand back a task reference.
[[ -n ${TASKS} ]] || {
  log "Script failed, task ID has no value"
  exit 1
}
125+
126+
# Step 2: Poll for all tasks to complete
127+
tasksWait
128+
129+
# Count the diagnostic log entries currently held by the BMC.
entries=$(curl -k -s -u "${BMC_USERNAME}:${BMC_PASSWORD}" -X GET \
  -H 'Content-Type: application/json' -H 'Accept: application/json' \
  "https://${BMC_IP}/redfish/v1/Oem/Supermicro/MI300X/Systems/UBB/LogServices/DiagLogs/Entries" |
  python3 -c "import sys, json; print(json.load(sys.stdin)['Members@odata.count'])")

# An empty or non-numeric value means the request or the JSON parsing failed;
# stop here instead of carrying a bogus count into the steps that follow.
if [[ ! ${entries} =~ ^[0-9]+$ ]]; then
  log "Script failed, could not read the number of log entries"
  exit 1
fi

if [[ ${entries} == "0" ]]; then
  log "Script failed, logs not found"
  exit 1
fi
138+
139+
idGreatest=0
entryGreatest=0

# Step 3: Go through the entries and find the one with the greatest ID.
# The collection is downloaded once and reused for every lookup, so all
# indices refer to the same snapshot and only a single request is made.
entriesJson=$(curl -k -s -u "${BMC_USERNAME}:${BMC_PASSWORD}" -X GET \
  -H 'Content-Type: application/json' -H 'Accept: application/json' \
  "https://${BMC_IP}/redfish/v1/Oem/Supermicro/MI300X/Systems/UBB/LogServices/DiagLogs/Entries")

for ((entry = entries - 1; entry >= 0; entry--)); do
  entryId=$(python3 -c "import sys, json; print(json.load(sys.stdin)['Members'][${entry}]['Id'])" \
    <<<"${entriesJson}")

  # Skip entries whose Id could not be read or is not a plain integer; the
  # numeric comparison below would otherwise fail with a shell error.
  if [[ ! ${entryId} =~ ^[0-9]+$ ]]; then
    continue
  fi

  # Walking from the last index down with -le means that, on equal IDs, the
  # lowest index wins, and an entry whose Id is 0 can still be selected.
  if [ "${idGreatest}" -le "${entryId}" ]; then
    idGreatest=${entryId}
    entryGreatest=${entry}
  fi
done
64-
log "Task completed!"
65-
66-
# Extract Entry ID after completion
67-
ENTRY_ID=$(curl -s -k -u "$BMC_USERNAME:$BMC_PASSWORD" \
68-
-X GET "https://$BMC_IP/redfish/v1/Oem/Supermicro/MI300X/TaskService/Tasks/$TASK_ID" \
69-
| grep -oP '(?<=Entries/)[^"]*')
70-
log "Entry ID: $ENTRY_ID"
71-
72-
# Step 3: Download All Logs to logs directory
154+
155+
# From here on entryId holds the POSITION of the chosen entry inside
# 'Members' (it is used as a list index); the entry's own Id is idGreatest.
entryId=${entryGreatest}
log "Entry ID: ${idGreatest} (collection index ${entryId})"

# Step 4: Check to make sure the entry is an AllLogs
entryType=$(curl -k -s -u "${BMC_USERNAME}:${BMC_PASSWORD}" -X GET \
  -H 'Content-Type: application/json' -H 'Accept: application/json' \
  "https://${BMC_IP}/redfish/v1/Oem/Supermicro/MI300X/Systems/UBB/LogServices/DiagLogs/Entries" |
  python3 -c "import sys, json; print(json.load(sys.stdin)['Members'][${entryGreatest}]['OEMDiagnosticDataType'])")

if [[ ${entryType} != "AllLogs" ]]; then
  log "Entry ID ${idGreatest} is of type ${entryType}, rather than AllLogs"
  exit 1
fi
168+
169+
# Step 5: Download All Logs to logs directory
73170
OUTPUT_DIR="logs"
74-
mkdir -p "$OUTPUT_DIR"
171+
mkdir -p "${OUTPUT_DIR}"
75172
log "Downloading all logs..."
76-
curl -s -k -u "$BMC_USERNAME:$BMC_PASSWORD" \
77-
"https://$BMC_IP/redfish/v1/Oem/Supermicro/MI300X/Systems/UBB/LogServices/DiagLogs/Entries/$ENTRY_ID/attachment" > "$OUTPUT_DIR/all_logs.tar.xz"
173+
174+
# Look up the download path of the selected entry (entryId is its position
# in 'Members').
attachment=$(curl -k -s -u "${BMC_USERNAME}:${BMC_PASSWORD}" -X GET \
  -H 'Content-Type: application/json' -H 'Accept: application/json' \
  "https://${BMC_IP}/redfish/v1/Oem/Supermicro/MI300X/Systems/UBB/LogServices/DiagLogs/Entries" |
  python3 -c "import sys, json; print(json.load(sys.stdin)['Members'][${entryId}]['AdditionalDataURI'])")

# The value is appended directly to the host, so it has to be an absolute
# path; anything else (empty on lookup failure, "None", ...) would fetch the
# wrong resource and store it as the log archive.
if [[ ${attachment} != /* ]]; then
  log "Failed to determine the download location of the logs."
  exit 1
fi

# --fail turns an HTTP error response into a non-zero exit status instead of
# saving the error body as the archive. This curl must stay the last command
# of this step: the status check that follows reads its exit status.
curl -k -s --fail -u "${BMC_USERNAME}:${BMC_PASSWORD}" -X GET \
  "https://${BMC_IP}${attachment}" >"${OUTPUT_DIR}/all_logs.tar.xz"
78181
check_curl_error "Failed to download logs."
79-
log "All logs downloaded as $OUTPUT_DIR/all_logs.tar.xz"
182+
log "All logs downloaded as ${OUTPUT_DIR}/all_logs.tar.xz"

0 commit comments

Comments
 (0)