Skip to content

Commit 7bbc07a

Browse files
authored
Merge pull request #526 from sadsfae/inv_sriov
Feat: refactor nic attribute workflows
2 parents c117a23 + ddfa57e commit 7bbc07a

4 files changed

Lines changed: 1297 additions & 24 deletions

File tree

src/badfish/main.py

Lines changed: 239 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -801,17 +801,13 @@ async def create_job(self, _url, _payload, _headers, expected=None):
801801
status_code = _response.status
802802
if status_code in expected:
803803
self.logger.debug("POST command passed to create target config job.")
804-
else:
805-
self.logger.error("POST command failed to create BIOS config job, status code is %s." % status_code)
806-
804+
else: # pragma: no cover
805+
# Defensive error handling: POST failures typically raise exceptions in http_client
806+
# before returning, making this path difficult to test in isolation
807+
self.logger.error("POST command failed to create BIOS config job, status_code is %s." % status_code)
807808
await self.error_handler(_response)
808809

809-
raw = await _response.text("utf-8", "ignore")
810-
result = re.search("JID_.+?", raw)
811-
res_group = ""
812-
if result:
813-
res_group = result.group()
814-
job_id = re.sub("[,']", "", res_group)
810+
job_id = self._extract_job_id_from_response(_response, warn_on_missing=False)
815811
if job_id:
816812
self.logger.debug("%s job ID successfully created" % job_id)
817813
return job_id
@@ -878,6 +874,154 @@ async def check_job_status(self, job_id):
878874
self.progress_bar(count, self.retries, data["Message"], prompt="Status")
879875
await asyncio.sleep(30)
880876

877+
def _extract_job_id_from_response(self, response, warn_on_missing=True, context=""):
    """
    Pull the job ID out of an HTTP response's Location header.

    Args:
        response: HTTP response object exposing a ``headers`` mapping.
        warn_on_missing: When True a missing job ID is logged as a warning,
            otherwise it is logged as an error.
        context: Optional extra text appended to the warning message.

    Returns:
        The job ID string, or None when it cannot be determined.
    """
    try:
        location = response.headers.get("Location", "")
        candidate = location.rsplit("/", 1)[-1]
    except (AttributeError, ValueError, KeyError) as e:  # pragma: no cover
        # Defensive: headers.get() raising exceptions is extremely rare in practice
        if warn_on_missing:
            note = f"Could not extract job ID from response headers: {e}"
            self.logger.warning(f"{note} {context}" if context else note)
        else:
            self.logger.error(f"Failed to find a job ID in headers of the response: {e}")
        return None

    if candidate:
        return candidate

    # Header present but empty (or no header at all) — report per caller's preference.
    if warn_on_missing:
        message = "No job ID returned in Location header."
        if context:
            message += f" {context}"
        self.logger.warning(message)
    else:
        self.logger.error("Failed to find a job ID in headers of the response.")
    return None
912+
913+
async def _verify_job_scheduled(self, job_id, context="configuration"):
    """
    Confirm a freshly created job is scheduled and has not already failed.

    Intended to run right before a reboot so a dead-on-arrival job can be
    caught early.

    Args:
        job_id: The job ID to verify.
        context: Human-readable job description for log messages (default:
            "configuration").

    Returns:
        False when the job is confirmed failed; True otherwise — including
        when verification itself is impossible, since a verification error
        should never block the reboot.
    """
    self.logger.info(f"Waiting for {context} job to be scheduled...")
    # Give job a moment to register in job queue
    await asyncio.sleep(2)

    job_url = f"{self.host_uri}{self.manager_resource}/Jobs/{job_id}"
    try:
        job_response = await self.get_request(job_url)
        if job_response.status != 200:  # pragma: no cover
            # Defensive: non-200 responses during job verification are rare
            self.logger.warning(
                f"Could not verify job status (HTTP {job_response.status}). Proceeding with reboot."
            )
            return True  # Don't block on verification failure

        raw = await job_response.text("utf-8", "ignore")
        job_data = json.loads(raw.strip())
        job_state = job_data.get("JobState", "Unknown")
        job_message = job_data.get("Message", "No message")
        self.logger.info(f"Job {job_id} status: {job_state} - {job_message}")

        looks_failed = "Fail" in job_message or "fail" in job_message or job_state == "Failed"
        if looks_failed:
            self.logger.error(f"{context.capitalize()} job failed before reboot: {job_message}")
            return False
        return True
    except Exception as e:  # pragma: no cover
        # Defensive: exceptions during job status check are rare in practice
        self.logger.warning(f"Could not check job status before reboot: {e}. Proceeding anyway.")
        return True  # Don't block on verification failure
955+
956+
async def _monitor_and_verify_attribute_job(
    self, job_id, fqdd, attribute, expected_value, attr_type, original_value
):
    """
    Wait for an attribute-change job to finish and confirm the value stuck.

    Full post-reboot workflow:
    1. Poll the job until completion.
    2. On reported failure, still read back the attribute to catch false
       negatives.
    3. On success, read back the attribute and compare with the expectation.

    Args:
        job_id: The job ID to monitor.
        fqdd: NIC FQDD (e.g., "NIC.Embedded.1-1-1").
        attribute: Attribute name (e.g., "WakeOnLan").
        expected_value: Value the attribute should hold after the change.
        attr_type: Attribute type ("Integer", "String", "Enumeration").
        original_value: Pre-change value, used only for logging.

    Returns:
        True when the value is confirmed applied, False otherwise.
    """
    self.logger.info(f"Monitoring job {job_id} for completion...")
    completed = await self.check_job_status(job_id)

    if completed is False:
        # Job claims failure — read the attribute back anyway to detect false negatives.
        self.logger.error(f"Configuration job {job_id} did not complete successfully.")
        self.logger.error("Network attribute changes may not have been applied.")
        readback = await self.get_nic_attribute_info(fqdd, attribute, log=False)
        if readback:
            current_val = readback.get("CurrentValue")
            if current_val == expected_value:
                self.logger.warning(
                    f"Job reported failure but attribute value is correct ({attribute}={current_val}). "
                    "This may be a false negative."
                )
                return True
        return False

    # Job succeeded - verify attribute value actually changed
    self.logger.info("Verifying attribute value was applied...")
    readback = await self.get_nic_attribute_info(fqdd, attribute, log=False)
    if not readback:
        self.logger.error("Could not verify final attribute value.")
        return False

    current_val = readback.get("CurrentValue")
    if attr_type == "Integer":
        # Normalize both sides so a string "8" compares equal to the int 8.
        current_val = int(current_val) if current_val is not None else None
        expected_value = int(expected_value)

    if current_val == expected_value:
        self.logger.info(
            f"✓ Successfully changed {attribute} from {original_value} to {expected_value}"
        )
        return True

    self.logger.error(
        f"✗ Attribute value did not persist. Expected: {expected_value}, Current: {current_val}"
    )
    self.logger.error(
        "Job completed but value not applied. Possible causes: "
        "prerequisite not met, validation failure, or firmware issue."
    )
    return False
1024+
8811025
async def send_reset(self, reset_type):
8821026
_url = "%s%s/Actions/ComputerSystem.Reset" % (
8831027
self.host_uri,
@@ -2129,10 +2273,9 @@ async def export_scp(self, file_path, targets="ALL", include_read_only=False):
21292273
if response.status != 202:
21302274
self.logger.error("Command failed to export system configuration.")
21312275
return False
2132-
try:
2133-
job_id = response.headers["Location"].split("/")[-1]
2134-
except (ValueError, AttributeError, KeyError):
2135-
self.logger.error("Failed to find a job ID in headers of the response.")
2276+
2277+
job_id = self._extract_job_id_from_response(response, warn_on_missing=False)
2278+
if not job_id:
21362279
return False
21372280

21382281
self.logger.info(f"Job for exporting server configuration successfully created. Job ID: {job_id}")
@@ -2198,11 +2341,11 @@ async def import_scp(self, file_path, targets="ALL"):
21982341
if response.status != 202:
21992342
self.logger.error("Command failed to import system configuration.")
22002343
return False
2201-
try:
2202-
job_id = response.headers["Location"].split("/")[-1]
2203-
except (ValueError, AttributeError, KeyError):
2204-
self.logger.error("Failed to find a job ID in headers of the response.")
2344+
2345+
job_id = self._extract_job_id_from_response(response, warn_on_missing=False)
2346+
if not job_id:
22052347
return False
2348+
22062349
self.logger.info(f"Job for importing server configuration, successfully created. Job ID: {job_id}")
22072350
start_time = get_now()
22082351
percentage = 0
@@ -2378,6 +2521,53 @@ async def set_nic_attribute(self, fqdd, attribute, value):
23782521
self.logger.error("Was unable to set a network attribute. Attribute most likely doesn't exist.")
23792522
return False
23802523

2524+
# Check if VirtualizationMode is enabled for SR-IOV attributes
2525+
sriov_attributes = ["NumberVFAdvertised", "VFDistribution", "VFAllocMult"]
2526+
if attribute in sriov_attributes:
2527+
all_attrs = await self.get_nic_attribute(fqdd, log=False)
2528+
if all_attrs:
2529+
attrs_dict = dict(all_attrs)
2530+
virt_mode = attrs_dict.get("VirtualizationMode", "")
2531+
if virt_mode == "NONE":
2532+
self.logger.error(
2533+
f"Cannot set {attribute} when VirtualizationMode is NONE (disabled)."
2534+
)
2535+
self.logger.error(
2536+
f"First enable SR-IOV virtualization mode with:"
2537+
)
2538+
self.logger.error(
2539+
f" --set-nic-attribute {fqdd} --attribute VirtualizationMode --value SRIOV"
2540+
)
2541+
return False
2542+
2543+
# Check for known hardware VF limits on NumberVFAdvertised attribute
2544+
if attribute == "NumberVFAdvertised":
2545+
try:
2546+
requested_vfs = int(value)
2547+
if requested_vfs > 64:
2548+
# Get additional NIC attributes to determine hardware limits
2549+
all_attrs = await self.get_nic_attribute(fqdd, log=False)
2550+
if all_attrs:
2551+
attrs_dict = dict(all_attrs)
2552+
device_name = attrs_dict.get("DeviceName", "")
2553+
2554+
# Intel XXV710 hardware limitation observed across firmware versions and modes
2555+
if "XXV710" in device_name:
2556+
self.logger.warning(
2557+
f"Attempting to set NumberVFAdvertised to {requested_vfs} on Intel XXV710."
2558+
)
2559+
self.logger.warning(
2560+
"Testing has shown that Intel XXV710 NICs are limited to 64 VFs regardless of "
2561+
"virtualization mode (SRIOV/NPARSRIOV) or firmware version."
2562+
)
2563+
self.logger.warning(
2564+
"This operation will likely fail. The hardware limit appears to be 64 VFs maximum."
2565+
)
2566+
# Allow attempt - let firmware reject if truly unsupported
2567+
except (ValueError, AttributeError, KeyError): # pragma: no cover
2568+
# Defensive: exception during attribute parsing is rare - bad firmware data
2569+
pass
2570+
23812571
try:
23822572
type = attr_info.get("Type")
23832573
current_value = attr_info.get("CurrentValue")
@@ -2428,12 +2618,23 @@ async def set_nic_attribute(self, fqdd, attribute, value):
24282618
"Attributes": {attribute: value},
24292619
}
24302620
first_reset = False
2621+
job_id = None
2622+
24312623
try:
24322624
for i in range(self.retries):
24332625
response = await self.patch_request(uri, payload, headers)
24342626
status_code = response.status
24352627
if status_code in [200, 202]:
24362628
self.logger.info("Patch command to set network attribute values and create next reboot job PASSED.")
2629+
2630+
# Extract job ID from response headers (DMTF Redfish DSP0266 §13.6)
2631+
# HTTP 202 indicates async operation with job creation
2632+
job_id = self._extract_job_id_from_response(
2633+
response, warn_on_missing=True, context="Changes may not persist after reboot."
2634+
)
2635+
if job_id:
2636+
self.logger.info(f"Network attribute configuration job created: {job_id}")
2637+
24372638
break
24382639
else:
24392640
self.logger.error(
@@ -2459,9 +2660,30 @@ async def set_nic_attribute(self, fqdd, attribute, value):
24592660
return False
24602661
except (AttributeError, ValueError):
24612662
self.logger.error("Was unable to set a network attribute.")
2663+
return False
2664+
2665+
# Verify job is scheduled before reboot (if job was created)
2666+
if job_id:
2667+
job_scheduled = await self._verify_job_scheduled(job_id, context="configuration")
2668+
if not job_scheduled:
2669+
return False
24622670

2671+
# Reboot server to apply pending configuration
24632672
await self.reboot_server()
24642673

2674+
# Monitor job completion and verify attribute change (if job was created)
2675+
if job_id:
2676+
return await self._monitor_and_verify_attribute_job(
2677+
job_id, fqdd, attribute, value, type, attr_info.get("CurrentValue")
2678+
)
2679+
else:
2680+
# No job ID - can't monitor, but still return True since PATCH succeeded
2681+
self.logger.warning(
2682+
"Configuration change submitted but job monitoring was not possible. "
2683+
"Please manually verify attribute value persisted after reboot."
2684+
)
2685+
return True
2686+
24652687

24662688
async def execute_badfish(_host, _args, logger, format_handler=None):
24672689
_username = _args.get("u") or os.environ.get("BADFISH_USERNAME")

0 commit comments

Comments
 (0)