Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 234 additions & 8 deletions src/badfish/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,154 @@ async def check_job_status(self, job_id):
self.progress_bar(count, self.retries, data["Message"], prompt="Status")
await asyncio.sleep(30)

def _extract_job_id_from_response(self, response, warn_on_missing=True, context=""):
    """
    Extract a job ID from the Location header of an HTTP response.

    Redfish async operations return the created job/task URI in the
    Location header; the job ID is its last path segment.

    Args:
        response: HTTP response object exposing a ``headers`` mapping.
        warn_on_missing: If True, log a warning when no job ID is found;
            if False, log an error instead.
        context: Optional extra text appended to the warning message.

    Returns:
        The job ID string, or None if one could not be extracted.
    """
    try:
        job_id = response.headers.get("Location", "").split("/")[-1]
    except (AttributeError, ValueError, KeyError) as e:  # pragma: no cover
        # Defensive: headers.get() raising is extremely rare in practice.
        # A single pragma on the except line excludes this whole clause.
        if warn_on_missing:
            msg = f"Could not extract job ID from response headers: {e}"
            if context:
                msg += f" {context}"
            self.logger.warning(msg)
        else:
            self.logger.error(f"Failed to find a job ID in headers of the response: {e}")
        return None

    if job_id:
        return job_id

    # Header present but empty (or missing entirely): report per caller's choice.
    if warn_on_missing:
        msg = "No job ID returned in Location header."
        if context:
            msg += f" {context}"
        self.logger.warning(msg)
    else:
        self.logger.error("Failed to find a job ID in headers of the response.")
    return None

async def _verify_job_scheduled(self, job_id, context="configuration"):
    """
    Verify that a job has been scheduled and has not already failed.

    Typically called before rebooting to ensure the job is in a good
    state. Verification problems (non-200 response, request errors) are
    logged but deliberately do NOT block the reboot.

    Args:
        job_id: The job ID to verify.
        context: Description of the job context for log messages
            (default: "configuration").

    Returns:
        False if the job reports failure; True otherwise (including when
        the status could not be verified).
    """
    self.logger.info(f"Waiting for {context} job to be scheduled...")
    # Give the job a moment to register in the job queue.
    await asyncio.sleep(2)

    # Check job status to ensure it's scheduled properly.
    job_url = f"{self.host_uri}{self.manager_resource}/Jobs/{job_id}"
    try:
        job_response = await self.get_request(job_url)
        if job_response.status == 200:
            job_raw = await job_response.text("utf-8", "ignore")
            job_data = json.loads(job_raw.strip())
            job_state = job_data.get("JobState", "Unknown")
            job_message = job_data.get("Message", "No message")
            self.logger.info(f"Job {job_id} status: {job_state} - {job_message}")

            # Case-insensitive match catches "Fail", "fail" and "FAIL" in the
            # message, as well as an explicit Failed state.
            if "fail" in job_message.lower() or job_state == "Failed":
                self.logger.error(f"{context.capitalize()} job failed before reboot: {job_message}")
                return False
            return True
        else:  # pragma: no cover
            # Defensive: non-200 responses during job verification are rare.
            self.logger.warning(
                f"Could not verify job status (HTTP {job_response.status}). Proceeding with reboot."
            )
            return True  # Don't block on verification failure
    except Exception as e:  # pragma: no cover
        # Defensive: exceptions during the status check are rare in practice.
        self.logger.warning(f"Could not check job status before reboot: {e}. Proceeding anyway.")
        return True  # Don't block on verification failure

async def _monitor_and_verify_attribute_job(
    self, job_id, fqdd, attribute, expected_value, attr_type, original_value
):
    """
    Monitor job completion and verify the attribute value was actually applied.

    This handles the full post-reboot workflow:
    1. Wait for the job to complete.
    2. If the job fails, still verify the actual value (detect false negatives).
    3. If the job succeeds, verify the value matches the expectation.

    Args:
        job_id: The job ID to monitor.
        fqdd: NIC FQDD (e.g., "NIC.Embedded.1-1-1").
        attribute: Attribute name (e.g., "WakeOnLan").
        expected_value: Expected value after the change.
        attr_type: Attribute type ("Integer", "String", "Enumeration").
        original_value: Original value before the change (for logging).

    Returns:
        True if the value was successfully applied, False otherwise.
    """

    def _values_match(current, expected):
        # Normalize Integer attributes before comparing — the API may return
        # the value as a string. Fall back to string comparison rather than
        # crashing if firmware returns non-numeric data.
        if attr_type == "Integer":
            try:
                return int(current) == int(expected)
            except (TypeError, ValueError):
                return str(current) == str(expected)
        return current == expected

    self.logger.info(f"Monitoring job {job_id} for completion...")
    job_success = await self.check_job_status(job_id)

    # Job reported failure - but still verify actual value to detect false negatives
    if job_success is False:
        self.logger.error(f"Configuration job {job_id} did not complete successfully.")
        self.logger.error("Network attribute changes may not have been applied.")

        verification = await self.get_nic_attribute_info(fqdd, attribute, log=False)
        if verification:
            current_val = verification.get("CurrentValue")
            # Same type-aware comparison as the success path, so an Integer
            # attribute reported as "64" still matches an expected 64.
            if _values_match(current_val, expected_value):
                self.logger.warning(
                    f"Job reported failure but attribute value is correct ({attribute}={current_val}). "
                    "This may be a false negative."
                )
                return True
        return False

    # Job succeeded - verify the attribute value actually changed
    self.logger.info("Verifying attribute value was applied...")
    verification = await self.get_nic_attribute_info(fqdd, attribute, log=False)
    if not verification:
        self.logger.error("Could not verify final attribute value.")
        return False

    current_val = verification.get("CurrentValue")
    if _values_match(current_val, expected_value):
        self.logger.info(
            f"✓ Successfully changed {attribute} from {original_value} to {expected_value}"
        )
        return True

    self.logger.error(
        f"✗ Attribute value did not persist. Expected: {expected_value}, Current: {current_val}"
    )
    self.logger.error(
        "Job completed but value not applied. Possible causes: "
        "prerequisite not met, validation failure, or firmware issue."
    )
    return False

async def send_reset(self, reset_type):
_url = "%s%s/Actions/ComputerSystem.Reset" % (
self.host_uri,
Expand Down Expand Up @@ -2129,10 +2277,9 @@ async def export_scp(self, file_path, targets="ALL", include_read_only=False):
if response.status != 202:
self.logger.error("Command failed to export system configuration.")
return False
try:
job_id = response.headers["Location"].split("/")[-1]
except (ValueError, AttributeError, KeyError):
self.logger.error("Failed to find a job ID in headers of the response.")

job_id = self._extract_job_id_from_response(response, warn_on_missing=False)
if not job_id:
return False

self.logger.info(f"Job for exporting server configuration successfully created. Job ID: {job_id}")
Expand Down Expand Up @@ -2198,11 +2345,11 @@ async def import_scp(self, file_path, targets="ALL"):
if response.status != 202:
self.logger.error("Command failed to import system configuration.")
return False
try:
job_id = response.headers["Location"].split("/")[-1]
except (ValueError, AttributeError, KeyError):
self.logger.error("Failed to find a job ID in headers of the response.")

job_id = self._extract_job_id_from_response(response, warn_on_missing=False)
if not job_id:
return False

self.logger.info(f"Job for importing server configuration, successfully created. Job ID: {job_id}")
start_time = get_now()
percentage = 0
Expand Down Expand Up @@ -2378,6 +2525,53 @@ async def set_nic_attribute(self, fqdd, attribute, value):
self.logger.error("Was unable to set a network attribute. Attribute most likely doesn't exist.")
return False

# Check if VirtualizationMode is enabled for SR-IOV attributes
sriov_attributes = ["NumberVFAdvertised", "VFDistribution", "VFAllocMult"]
if attribute in sriov_attributes:
all_attrs = await self.get_nic_attribute(fqdd, log=False)
if all_attrs:
attrs_dict = dict(all_attrs)
virt_mode = attrs_dict.get("VirtualizationMode", "")
if virt_mode == "NONE":
self.logger.error(
f"Cannot set {attribute} when VirtualizationMode is NONE (disabled)."
)
self.logger.error(
f"First enable SR-IOV virtualization mode with:"
)
self.logger.error(
f" --set-nic-attribute {fqdd} --attribute VirtualizationMode --value SRIOV"
)
return False

# Check for known hardware VF limits on NumberVFAdvertised attribute
if attribute == "NumberVFAdvertised":
try:
requested_vfs = int(value)
if requested_vfs > 64:
# Get additional NIC attributes to determine hardware limits
all_attrs = await self.get_nic_attribute(fqdd, log=False)
if all_attrs:
attrs_dict = dict(all_attrs)
device_name = attrs_dict.get("DeviceName", "")

# Intel XXV710 hardware limitation observed across firmware versions and modes
if "XXV710" in device_name:
self.logger.warning(
f"Attempting to set NumberVFAdvertised to {requested_vfs} on Intel XXV710."
)
self.logger.warning(
"Testing has shown that Intel XXV710 NICs are limited to 64 VFs regardless of "
"virtualization mode (SRIOV/NPARSRIOV) or firmware version."
)
self.logger.warning(
"This operation will likely fail. The hardware limit appears to be 64 VFs maximum."
)
# Allow attempt - let firmware reject if truly unsupported
except (ValueError, AttributeError, KeyError): # pragma: no cover
# Defensive: exception during attribute parsing is rare - bad firmware data
pass # pragma: no cover

try:
type = attr_info.get("Type")
current_value = attr_info.get("CurrentValue")
Expand Down Expand Up @@ -2428,12 +2622,23 @@ async def set_nic_attribute(self, fqdd, attribute, value):
"Attributes": {attribute: value},
}
first_reset = False
job_id = None

try:
for i in range(self.retries):
response = await self.patch_request(uri, payload, headers)
status_code = response.status
if status_code in [200, 202]:
self.logger.info("Patch command to set network attribute values and create next reboot job PASSED.")

# Extract job ID from response headers (DMTF Redfish DSP0266 §13.6)
# HTTP 202 indicates async operation with job creation
job_id = self._extract_job_id_from_response(
response, warn_on_missing=True, context="Changes may not persist after reboot."
)
if job_id:
self.logger.info(f"Network attribute configuration job created: {job_id}")

break
else:
self.logger.error(
Expand All @@ -2459,9 +2664,30 @@ async def set_nic_attribute(self, fqdd, attribute, value):
return False
except (AttributeError, ValueError):
self.logger.error("Was unable to set a network attribute.")
return False

# Verify job is scheduled before reboot (if job was created)
if job_id:
job_scheduled = await self._verify_job_scheduled(job_id, context="configuration")
if not job_scheduled:
return False

# Reboot server to apply pending configuration
await self.reboot_server()

# Monitor job completion and verify attribute change (if job was created)
if job_id:
return await self._monitor_and_verify_attribute_job(
job_id, fqdd, attribute, value, type, attr_info.get("CurrentValue")
)
else:
# No job ID - can't monitor, but still return True since PATCH succeeded
self.logger.warning(
"Configuration change submitted but job monitoring was not possible. "
"Please manually verify attribute value persisted after reboot."
)
return True


async def execute_badfish(_host, _args, logger, format_handler=None):
_username = _args.get("u") or os.environ.get("BADFISH_USERNAME")
Expand Down
Loading
Loading