Skip to content

Commit ec6cf1a

Browse files
committed
Restructure Machine Ready status check for allowUnhealthyMachine
Signed-off-by: Hitesh Wadekar <hwadekar@nvidia.com>
1 parent e23a8ae commit ec6cf1a

3 files changed

Lines changed: 56 additions & 171 deletions

File tree

api/pkg/api/handler/instance.go

Lines changed: 46 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,49 @@ func (cih CreateInstanceHandler) Handle(c echo.Context) error {
856856
return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("Machine: %s has status: %s, machine must be in Ready state to create an Instance", machine.ID, machine.Status), nil)
857857
}
858858

859+
// Check if it's possible to provision the Machine
860+
if machine.Status == cdbm.MachineStatusReady {
861+
logger.Info().Str("MachineID", machine.ID).Str("Status", machine.Status).Bool("AllowUnhealthyMachine", allowUnhealthyMachine).Msg("Machine is Ready, proceeding with Instance creation")
862+
} else {
863+
isProvisionable := false
864+
controllerState := ""
865+
if machine.Metadata != nil {
866+
if strings.Contains(machine.Metadata.State, "{") {
867+
controllerState = strings.TrimSpace(strings.Split(machine.Metadata.State, "{")[0])
868+
} else {
869+
controllerState = strings.TrimSpace(machine.Metadata.State)
870+
}
871+
}
872+
873+
if strings.HasPrefix(controllerState, cdbm.MachineStatusReady) {
874+
isProvisionable = true
875+
}
876+
877+
mlogger := logger.With().Str("MachineID", machine.ID).Str("Status", machine.Status).Str("ControllerState", controllerState).Bool("AllowUnhealthyMachine", allowUnhealthyMachine).Logger()
878+
879+
if allowUnhealthyMachine {
880+
if isProvisionable {
881+
mlogger.Info().Msg("Machine is provisionable, proceeding with Instance creation")
882+
} else {
883+
if machine.Status == cdbm.MachineStatusMaintenance || machine.Status == cdbm.MachineStatusError {
884+
mlogger.Warn().Msg("Machine has controller state that does not allow Instance creation")
885+
return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("Machine: %s has controller state: %s that does not allow Instance creation even with `allowUnhealthyMachine` set to true", machine.ID, controllerState), nil)
886+
} else {
887+
mlogger.Warn().Msg("Machine has status that does not allow Instance creation")
888+
return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("Machine: %s has status: %s that does not allow Instance creation even with `allowUnhealthyMachine` set to true", machine.ID, machine.Status), nil)
889+
}
890+
}
891+
} else {
892+
if isProvisionable {
893+
mlogger.Warn().Msg("Machine is not in Ready state, but it can be provisioned by setting `allowUnhealthyMachine` to true in request")
894+
return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("Machine: %s is not in Ready state, but it can be provisioned by setting `allowUnhealthyMachine` to true in request", machine.ID), nil)
895+
} else {
896+
mlogger.Warn().Msg("Machine has status that does not allow Instance creation")
897+
return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("Machine: %s has status: %s that does not allow Instance creation", machine.ID, machine.Status), nil)
898+
}
899+
}
900+
}
901+
859902
// Acquire a lock on the MachineID
860903
err = tx.TryAcquireAdvisoryLock(ctx, cdb.GetAdvisoryLockIDFromString(machine.ID), nil)
861904
if err != nil {
@@ -1002,20 +1045,6 @@ func (cih CreateInstanceHandler) Handle(c echo.Context) error {
10021045

10031046
// NOTE: At this stage, we have a Machine ID whether it was provided in request or selected through Instance Type
10041047

1005-
// ==================== Step 5: Machine Capability Validation ====================
1006-
1007-
// Check if Machine capabilities match with Instance Type capabilities
1008-
if instanceTypeID != nil && machine.ID != "" {
1009-
isMatch, _, apiErr := common.MatchInstanceTypeCapabilitiesForMachines(ctx, logger, cih.dbSession, *instanceTypeID, []string{machine.ID})
1010-
if apiErr != nil {
1011-
return cutil.NewAPIErrorResponse(c, apiErr.Code, apiErr.Message, apiErr.Data)
1012-
}
1013-
1014-
if !isMatch {
1015-
return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("Capabilities for Machine: %v do not match Instance Type's Capabilities", machine.ID), nil)
1016-
}
1017-
}
1018-
10191048
mcDAO := cdbm.NewMachineCapabilityDAO(cih.dbSession)
10201049

10211050
// Fetch InfiniBand Capabilities from Instance Type or Machine and validate InfiniBand Interfaces
@@ -1244,7 +1273,7 @@ func (cih CreateInstanceHandler) Handle(c echo.Context) error {
12441273
}
12451274
}
12461275

1247-
// ==================== Step 6: Create Instance Records ====================
1276+
// ==================== Step 5: Create Instance Records ====================
12481277

12491278
instanceCreateInput := cdbm.InstanceCreateInput{
12501279
Name: apiRequest.Name,
@@ -1518,7 +1547,7 @@ func (cih CreateInstanceHandler) Handle(c echo.Context) error {
15181547
return cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to get new Status Detail for Instance", nil)
15191548
}
15201549

1521-
// ==================== Step 7: Workflow Trigger ====================
1550+
// ==================== Step 6: Workflow Trigger ====================
15221551

15231552
// Get the temporal client for the site we are working with.
15241553
stc, err := cih.scp.GetClientByID(instance.SiteID)
@@ -1630,7 +1659,7 @@ func (cih CreateInstanceHandler) Handle(c echo.Context) error {
16301659

16311660
logger.Info().Str("Workflow ID", wid).Msg("completed synchronous create Instance workflow")
16321661

1633-
// ==================== Step 8: Commit & Response ====================
1662+
// ==================== Step 7: Commit & Response ====================
16341663

16351664
// Commit the DB transaction after the synchronous workflow has completed without error
16361665
err = tx.Commit()

api/pkg/api/handler/instance_test.go

Lines changed: 10 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -385,23 +385,6 @@ func testInstanceBuildMachineWithID(t *testing.T, dbSession *cdb.Session, ip uui
385385
return m
386386
}
387387

388-
// testInstanceAttachIST1MatchingMachineCapabilities adds machine_capability rows aligned with ist1's instance-type
389-
// capabilities so MatchInstanceTypeCapabilitiesForMachines (instance create Step 5) does not return 409.
390-
func testInstanceAttachIST1MatchingMachineCapabilities(t *testing.T, dbSession *cdb.Session, machineID string) {
391-
t.Helper()
392-
common.TestBuildMachineCapability(t, dbSession, &machineID, nil, cdbm.MachineCapabilityTypeInfiniBand, "MT28908 Family [ConnectX-6]", nil, nil, cdb.GetStrPtr("Mellanox Technologies"), cdb.GetIntPtr(3), cdb.GetStrPtr(""), nil)
393-
common.TestBuildMachineCapability(t, dbSession, &machineID, nil, cdbm.MachineCapabilityTypeNetwork, "MT42822 BlueField-2 integrated ConnectX-6 Dx network controller", nil, nil, cdb.GetStrPtr("Mellanox Technologies"), cdb.GetIntPtr(2), cdb.GetStrPtr("DPU"), nil)
394-
common.TestBuildMachineCapability(t, dbSession, &machineID, nil, cdbm.MachineCapabilityTypeGPU, "NVIDIA GB200", nil, nil, cdb.GetStrPtr("NVIDIA"), cdb.GetIntPtr(4), cdb.GetStrPtr(cdbm.MachineCapabilityDeviceTypeNVLink), nil)
395-
}
396-
397-
// testInstanceLinkMachineToIST1WithCaps links a machine to ist1 and inserts matching machine_capability rows for Step 5.
398-
func testInstanceLinkMachineToIST1WithCaps(t *testing.T, dbSession *cdb.Session, mc *cdbm.Machine, ist *cdbm.InstanceType) *cdbm.MachineInstanceType {
399-
t.Helper()
400-
mit := testInstanceBuildMachineInstanceType(t, dbSession, mc, ist)
401-
testInstanceAttachIST1MatchingMachineCapabilities(t, dbSession, mc.ID)
402-
return mit
403-
}
404-
405388
func testInstanceBuildMachineCapability(t *testing.T, dbSession *cdb.Session, mID *string, capabilityType string, name string, capacity *string, count *int) *cdbm.MachineCapability {
406389
mc := &cdbm.MachineCapability{
407390
ID: uuid.New(),
@@ -770,7 +753,7 @@ func TestCreateInstanceHandler_Handle(t *testing.T) {
770753
mc1 := testInstanceBuildMachine(t, dbSession, ip.ID, st1.ID, cdb.GetBoolPtr(false), nil)
771754
assert.NotNil(t, mc1)
772755

773-
mcinst1 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc1, ist1)
756+
mcinst1 := testInstanceBuildMachineInstanceType(t, dbSession, mc1, ist1)
774757
assert.NotNil(t, mcinst1)
775758

776759
mc12 := testInstanceBuildMachine(t, dbSession, ip.ID, st1.ID, cdb.GetBoolPtr(false), nil)
@@ -800,31 +783,31 @@ func TestCreateInstanceHandler_Handle(t *testing.T) {
800783
mc20 := testInstanceBuildMachine(t, dbSession, ip.ID, st1.ID, cdb.GetBoolPtr(false), nil)
801784
assert.NotNil(t, mc20)
802785

803-
mcinst12 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc12, ist1)
786+
mcinst12 := testInstanceBuildMachineInstanceType(t, dbSession, mc12, ist1)
804787
assert.NotNil(t, mcinst12)
805788

806-
mcinst13 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc13, ist1)
789+
mcinst13 := testInstanceBuildMachineInstanceType(t, dbSession, mc13, ist1)
807790
assert.NotNil(t, mcinst13)
808791

809-
mcinst14 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc14, ist1)
792+
mcinst14 := testInstanceBuildMachineInstanceType(t, dbSession, mc14, ist1)
810793
assert.NotNil(t, mcinst14)
811794

812-
mcinst15 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc15, ist1)
795+
mcinst15 := testInstanceBuildMachineInstanceType(t, dbSession, mc15, ist1)
813796
assert.NotNil(t, mcinst15)
814797

815-
mcinst16 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc16, ist1)
798+
mcinst16 := testInstanceBuildMachineInstanceType(t, dbSession, mc16, ist1)
816799
assert.NotNil(t, mcinst16)
817800

818-
mcinst17 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc17, ist1)
801+
mcinst17 := testInstanceBuildMachineInstanceType(t, dbSession, mc17, ist1)
819802
assert.NotNil(t, mcinst17)
820803

821-
mcinst18 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc18, ist1)
804+
mcinst18 := testInstanceBuildMachineInstanceType(t, dbSession, mc18, ist1)
822805
assert.NotNil(t, mcinst18)
823806

824-
mcinst19 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc19, ist1)
807+
mcinst19 := testInstanceBuildMachineInstanceType(t, dbSession, mc19, ist1)
825808
assert.NotNil(t, mcinst19)
826809

827-
mcinst20 := testInstanceLinkMachineToIST1WithCaps(t, dbSession, mc20, ist1)
810+
mcinst20 := testInstanceBuildMachineInstanceType(t, dbSession, mc20, ist1)
828811
assert.NotNil(t, mcinst20)
829812

830813
// Tenant 1
@@ -882,18 +865,6 @@ func TestCreateInstanceHandler_Handle(t *testing.T) {
882865
mci1 := testInstanceBuildMachineInterface(t, dbSession, subnet1.ID, mc1.ID)
883866
assert.NotNil(t, mci1)
884867

885-
// Instance type vs machine: same InfiniBand device row but different inactive-device indices (active vs inactive ports).
886-
istIBPortMismatch := testInstanceBuildInstanceType(t, dbSession, ip, "test-instance-type-ib-port-mismatch", st1, cdbm.InstanceStatusReady)
887-
assert.NotNil(t, istIBPortMismatch)
888-
common.TestBuildMachineCapability(t, dbSession, nil, &istIBPortMismatch.ID, cdbm.MachineCapabilityTypeInfiniBand, "MT28908 Family [ConnectX-6]", nil, nil, cdb.GetStrPtr("Mellanox Technologies"), cdb.GetIntPtr(3), cdb.GetStrPtr(""), []int{0, 1})
889-
mcIBPortMismatch := testInstanceBuildMachine(t, dbSession, ip.ID, st1.ID, cdb.GetBoolPtr(false), nil)
890-
assert.NotNil(t, mcIBPortMismatch)
891-
common.TestBuildMachineCapability(t, dbSession, &mcIBPortMismatch.ID, nil, cdbm.MachineCapabilityTypeInfiniBand, "MT28908 Family [ConnectX-6]", nil, nil, cdb.GetStrPtr("Mellanox Technologies"), cdb.GetIntPtr(3), cdb.GetStrPtr(""), []int{2, 3})
892-
testInstanceBuildMachineInstanceType(t, dbSession, mcIBPortMismatch, istIBPortMismatch)
893-
testInstanceBuildMachineInterface(t, dbSession, subnet1.ID, mcIBPortMismatch.ID)
894-
alcIBPortMismatch := testInstanceSiteBuildAllocationContraints(t, dbSession, al1, cdbm.AllocationResourceTypeInstanceType, istIBPortMismatch.ID, cdbm.AllocationConstraintTypeReserved, 10, ipu)
895-
assert.NotNil(t, alcIBPortMismatch)
896-
897868
desd1 := common.TestBuildDpuExtensionService(t, dbSession, "test-dpu-extension-service-1", model.DpuExtensionServiceTypeKubernetesPod, tn1, st1, "V1-T1761856992374052", cdbm.DpuExtensionServiceStatusReady, tnu1)
898869
assert.NotNil(t, desd1)
899870

@@ -1721,35 +1692,6 @@ func TestCreateInstanceHandler_Handle(t *testing.T) {
17211692
},
17221693
wantErr: false,
17231694
},
1724-
{
1725-
name: "test Instance create API endpoint failure, InfiniBand inactive devices differ from instance type",
1726-
fields: fields{
1727-
dbSession: dbSession,
1728-
tc: tc,
1729-
cfg: cfg,
1730-
},
1731-
args: args{
1732-
reqData: &model.APIInstanceCreateRequest{
1733-
Name: "Test Instance IB inactive ports mismatch",
1734-
TenantID: tn1.ID.String(),
1735-
InstanceTypeID: cdb.GetStrPtr(istIBPortMismatch.ID.String()),
1736-
VpcID: vpc1.ID.String(),
1737-
UserData: cdb.GetStrPtr(""),
1738-
IpxeScript: cdb.GetStrPtr(common.DefaultIpxeScript),
1739-
Interfaces: []model.APIInterfaceCreateOrUpdateRequest{
1740-
{
1741-
SubnetID: cdb.GetStrPtr(subnet1.ID.String()),
1742-
},
1743-
},
1744-
PhoneHomeEnabled: cdb.GetBoolPtr(false),
1745-
},
1746-
reqOrg: tnOrg,
1747-
reqUser: tnu1,
1748-
respCode: http.StatusBadRequest,
1749-
respMessage: "do not match Instance Type's Capabilities",
1750-
},
1751-
wantErr: false,
1752-
},
17531695
{
17541696
name: "test Instance create API endpoint success, can't provide instance type id AND machine id",
17551697
fields: fields{

api/pkg/api/handler/util/common/common_test.go

Lines changed: 0 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -2106,92 +2106,6 @@ func TestMatchInstanceTypeCapabilitiesForMachines(t *testing.T) {
21062106
}
21072107
}
21082108

2109-
// TestMatchInstanceTypeCapabilitiesForMachines_InfiniBandInactiveDevices covers InfiniBand capability rows
2110-
// where inactive device indices (inactive ports) differ, which implies a different active-port profile.
2111-
func TestMatchInstanceTypeCapabilitiesForMachines_InfiniBandInactiveDevices(t *testing.T) {
2112-
ctx := context.Background()
2113-
dbSession := testCommonInitDB(t)
2114-
defer dbSession.Close()
2115-
2116-
testCommonSetupSchema(t, dbSession)
2117-
2118-
logger := zerolog.New(os.Stdout)
2119-
2120-
ipOrg1 := "test-ip-org-ib"
2121-
orgRoles := []string{"FORGE_SERVICE_PROVIDER_ADMIN"}
2122-
user := testCommonBuildUser(t, dbSession, uuid.New().String(), []string{ipOrg1}, orgRoles)
2123-
2124-
ip := testCommonBuildInfrastructureProvider(t, dbSession, "TestIpIB", ipOrg1, user)
2125-
assert.NotNil(t, ip)
2126-
2127-
site1 := testCommonBuildSite(t, dbSession, ip, "test", user)
2128-
assert.NotNil(t, site1)
2129-
2130-
tnOrg1 := "test-tenant-org-ib"
2131-
tnuser := testCommonBuildUser(t, dbSession, uuid.New().String(), []string{tnOrg1}, []string{"FORGE_TENANT_ADMIN_1"})
2132-
_ = testCommonBuildTenant(t, dbSession, "test-tenant-ib", tnOrg1, tnuser)
2133-
2134-
instIB := testCommonBuildInstanceType(t, dbSession, "it-ib", site1, ip, user)
2135-
assert.NotNil(t, instIB)
2136-
2137-
mcDAO := cdbm.NewMachineCapabilityDAO(dbSession)
2138-
2139-
ibName := "MT28908 Family [ConnectX-7]"
2140-
vendor := cdb.GetStrPtr("Mellanox Technologies")
2141-
count := cdb.GetIntPtr(2)
2142-
2143-
_, err := mcDAO.Create(ctx, nil, cdbm.MachineCapabilityCreateInput{
2144-
InstanceTypeID: &instIB.ID,
2145-
Type: cdbm.MachineCapabilityTypeInfiniBand,
2146-
Name: ibName,
2147-
Vendor: vendor,
2148-
Count: count,
2149-
InactiveDevices: []int{0, 1},
2150-
})
2151-
assert.Nil(t, err)
2152-
2153-
mcMatch := testCommonBuildMachine(t, dbSession, ip.ID, site1.ID, cdb.GetUUIDPtr(instIB.ID), uuid.New(), nil, nil, nil, cdbm.MachineStatusReady)
2154-
assert.NotNil(t, mcMatch)
2155-
2156-
_, err = mcDAO.Create(ctx, nil, cdbm.MachineCapabilityCreateInput{
2157-
MachineID: &mcMatch.ID,
2158-
Type: cdbm.MachineCapabilityTypeInfiniBand,
2159-
Name: ibName,
2160-
Vendor: vendor,
2161-
Count: count,
2162-
InactiveDevices: []int{0, 1},
2163-
})
2164-
assert.Nil(t, err)
2165-
2166-
mcMismatch := testCommonBuildMachine(t, dbSession, ip.ID, site1.ID, cdb.GetUUIDPtr(instIB.ID), uuid.New(), nil, nil, nil, cdbm.MachineStatusReady)
2167-
assert.NotNil(t, mcMismatch)
2168-
2169-
_, err = mcDAO.Create(ctx, nil, cdbm.MachineCapabilityCreateInput{
2170-
MachineID: &mcMismatch.ID,
2171-
Type: cdbm.MachineCapabilityTypeInfiniBand,
2172-
Name: ibName,
2173-
Vendor: vendor,
2174-
Count: count,
2175-
InactiveDevices: []int{2, 3},
2176-
})
2177-
assert.Nil(t, err)
2178-
2179-
t.Run("match when InfiniBand inactive device lists match", func(t *testing.T) {
2180-
ok, mid, apiErr := MatchInstanceTypeCapabilitiesForMachines(ctx, logger, dbSession, instIB.ID, []string{mcMatch.ID})
2181-
assert.Nil(t, apiErr)
2182-
assert.True(t, ok)
2183-
assert.Nil(t, mid)
2184-
})
2185-
2186-
t.Run("no match when InfiniBand inactive device lists differ", func(t *testing.T) {
2187-
ok, mid, apiErr := MatchInstanceTypeCapabilitiesForMachines(ctx, logger, dbSession, instIB.ID, []string{mcMismatch.ID})
2188-
assert.Nil(t, apiErr)
2189-
assert.False(t, ok)
2190-
assert.NotNil(t, mid)
2191-
assert.Equal(t, mcMismatch.ID, *mid)
2192-
})
2193-
}
2194-
21952109
func TestGetAllocationResourceTypeMaps(t *testing.T) {
21962110
ctx := context.Background()
21972111
dbSession := testCommonInitDB(t)

0 commit comments

Comments
 (0)