Skip to content

Commit 5d49e37

Browse files
committed
corrections following comments
Signed-off-by: Patrice Breton <pbreton@nvidia.com>
1 parent 5e59acd commit 5d49e37

7 files changed

Lines changed: 182 additions & 70 deletions

File tree

api/pkg/api/handler/allocation.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -478,15 +478,22 @@ func (cah CreateAllocationHandler) Handle(c echo.Context) error {
478478
}
479479
ossaDAO := cdbm.NewOperatingSystemSiteAssociationDAO(cah.dbSession)
480480
for _, gos := range globalTenantOSes {
481-
if _, aerr := ossaDAO.Create(ctx, tx, cdbm.OperatingSystemSiteAssociationCreateInput{
482-
OperatingSystemID: gos.ID,
483-
SiteID: site.ID,
484-
Status: cdbm.OperatingSystemSiteAssociationStatusSyncing,
485-
CreatedBy: dbUser.ID,
486-
}); aerr != nil {
487-
logger.Error().Err(aerr).Str("osID", gos.ID.String()).Msg("Failed to auto-associate tenant global OS with new site")
481+
_, aerr := ossaDAO.GetByOperatingSystemIDAndSiteID(ctx, tx, gos.ID, site.ID, nil)
482+
if aerr != nil && aerr != cdb.ErrDoesNotExist {
483+
logger.Error().Err(aerr).Str("osID", gos.ID.String()).Msg("Failed to check existing OS-site association")
488484
return cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to associate global-scoped Operating Systems with new Site", nil)
489485
}
486+
if aerr == cdb.ErrDoesNotExist {
487+
if _, aerr = ossaDAO.Create(ctx, tx, cdbm.OperatingSystemSiteAssociationCreateInput{
488+
OperatingSystemID: gos.ID,
489+
SiteID: site.ID,
490+
Status: cdbm.OperatingSystemSiteAssociationStatusSyncing,
491+
CreatedBy: dbUser.ID,
492+
}); aerr != nil {
493+
logger.Error().Err(aerr).Str("osID", gos.ID.String()).Msg("Failed to auto-associate tenant global OS with new site")
494+
return cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to associate global-scoped Operating Systems with new Site", nil)
495+
}
496+
}
490497
}
491498
if len(globalTenantOSes) > 0 {
492499
logger.Info().Int("count", len(globalTenantOSes)).Msg("Auto-associated tenant global-scoped OSes with new site")

api/pkg/api/handler/allocation_test.go

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,126 @@ func TestAllocationHandler_Create(t *testing.T) {
652652
}
653653
}
654654

655+
// TestAllocationHandler_Create_GlobalOSAutoAssociationIdempotent verifies that
656+
// creating an Allocation for a tenant that previously had (and lost) access to a
657+
// Site does not fail because the OperatingSystemSiteAssociation from the earlier
658+
// allocation already exists.
659+
//
660+
// Scenario:
661+
// 1. Tenant has a global-scoped IPXE OS.
662+
// 2. First Allocation → TenantSite is created → OS is auto-associated with Site.
663+
// 3. TenantSite is deleted to simulate all Allocations being removed.
664+
// 4. Second Allocation (same tenant + site) → TenantSite is recreated → OS
665+
// auto-association must be skipped (not fail) because the row still exists.
666+
func TestAllocationHandler_Create_GlobalOSAutoAssociationIdempotent(t *testing.T) {
667+
ctx := context.Background()
668+
dbSession := testMachineInitDB(t)
669+
defer dbSession.Close()
670+
671+
common.TestSetupSchema(t, dbSession)
672+
673+
ipOrg := "test-ip-org-idempotent"
674+
tnOrg := "test-tn-org-idempotent"
675+
676+
ipu := testMachineBuildUser(t, dbSession, uuid.New().String(), []string{ipOrg}, []string{"FORGE_PROVIDER_ADMIN"})
677+
tnu := testMachineBuildUser(t, dbSession, uuid.New().String(), []string{tnOrg}, []string{"FORGE_TENANT_ADMIN"})
678+
679+
ip := common.TestBuildInfrastructureProvider(t, dbSession, "TestIpIdempotent", ipOrg, ipu)
680+
site := testIPBlockBuildSite(t, dbSession, ip, "testSiteIdempotent", cdbm.SiteStatusRegistered, true, ipu)
681+
tenant := testMachineBuildTenant(t, dbSession, tnOrg, "t-idempotent")
682+
683+
it := common.TestBuildInstanceType(t, dbSession, "testITIdempotent", cdb.GetUUIDPtr(uuid.New()), site, map[string]string{
684+
"name": "test-instance-type-idempotent",
685+
"description": "Idempotent test instance type",
686+
}, ipu)
687+
for i := 0; i < 5; i++ {
688+
mc := testInstanceBuildMachine(t, dbSession, ip.ID, site.ID, cdb.GetBoolPtr(false), nil)
689+
require.NotNil(t, mc)
690+
require.NotNil(t, testInstanceBuildMachineInstanceType(t, dbSession, mc, it))
691+
}
692+
693+
ipb := testIPBlockBuildIPBlock(t, dbSession, "testipb-idempotent", site, ip, &tenant.ID,
694+
cdbm.IPBlockRoutingTypeDatacenterOnly, "10.99.0.0", 16, cdbm.IPBlockProtocolVersionV4,
695+
false, cdbm.IPBlockStatusReady, ipu)
696+
697+
ipamStorage := ipam.NewIpamStorage(dbSession.DB, nil)
698+
_, err := ipam.CreateIpamEntryForIPBlock(ctx, ipamStorage, ipb.Prefix, ipb.PrefixLength,
699+
ipb.RoutingType, ipb.InfrastructureProviderID.String(), ipb.SiteID.String())
700+
require.NoError(t, err)
701+
702+
// A tenant-owned global-scoped IPXE OS — this is what the auto-association code targets.
703+
globalScope := cdbm.OperatingSystemScopeGlobal
704+
globalOS := &cdbm.OperatingSystem{
705+
ID: uuid.New(),
706+
Name: "global-os-idempotent",
707+
TenantID: cdb.GetUUIDPtr(tenant.ID),
708+
Type: cdbm.OperatingSystemTypeIPXE,
709+
IpxeOsScope: &globalScope,
710+
IpxeScript: cdb.GetStrPtr(common.DefaultIpxeScript),
711+
IsActive: true,
712+
Status: cdbm.OperatingSystemStatusReady,
713+
CreatedBy: tnu.ID,
714+
}
715+
_, err = dbSession.DB.NewInsert().Model(globalOS).Exec(ctx)
716+
require.NoError(t, err)
717+
718+
ac := model.APIAllocationConstraintCreateRequest{
719+
ResourceType: cdbm.AllocationResourceTypeInstanceType,
720+
ResourceTypeID: it.ID.String(),
721+
ConstraintType: cdbm.AllocationConstraintTypeReserved,
722+
ConstraintValue: 2,
723+
}
724+
body, err := json.Marshal(model.APIAllocationCreateRequest{
725+
Name: "alloc-idempotent-1",
726+
Description: cdb.GetStrPtr(""),
727+
TenantID: tenant.ID.String(),
728+
SiteID: site.ID.String(),
729+
AllocationConstraints: []model.APIAllocationConstraintCreateRequest{ac},
730+
})
731+
require.NoError(t, err)
732+
733+
// First allocation: TenantSite is created and the global OS is auto-associated.
734+
a1 := testCreateAllocation(t, dbSession, ipamStorage, ipu, ipOrg, string(body))
735+
require.NotNil(t, a1)
736+
737+
ossaDAO := cdbm.NewOperatingSystemSiteAssociationDAO(dbSession)
738+
_, err = ossaDAO.GetByOperatingSystemIDAndSiteID(ctx, nil, globalOS.ID, site.ID, nil)
739+
require.NoError(t, err, "OS-site association must exist after first allocation")
740+
741+
// Simulate all Allocations being removed: delete the TenantSite record so that
742+
// the next Allocation triggers TenantSite (and OS auto-association) logic again.
743+
_, err = dbSession.DB.NewDelete().TableExpr("tenant_site").
744+
Where("tenant_id = ? AND site_id = ?", tenant.ID, site.ID).Exec(ctx)
745+
require.NoError(t, err)
746+
747+
body2, err := json.Marshal(model.APIAllocationCreateRequest{
748+
Name: "alloc-idempotent-2",
749+
Description: cdb.GetStrPtr(""),
750+
TenantID: tenant.ID.String(),
751+
SiteID: site.ID.String(),
752+
AllocationConstraints: []model.APIAllocationConstraintCreateRequest{ac},
753+
})
754+
require.NoError(t, err)
755+
756+
// Second allocation on the same site: must succeed even though the
757+
// OperatingSystemSiteAssociation from the first allocation still exists.
758+
a2 := testCreateAllocation(t, dbSession, ipamStorage, ipu, ipOrg, string(body2))
759+
require.NotNil(t, a2, "second allocation must succeed when OS-site association already exists")
760+
761+
// The association should still exist exactly once (not duplicated).
762+
ossas, ossaCount, err := ossaDAO.GetAll(ctx, nil,
763+
cdbm.OperatingSystemSiteAssociationFilterInput{
764+
OperatingSystemIDs: []uuid.UUID{globalOS.ID},
765+
SiteIDs: []uuid.UUID{site.ID},
766+
},
767+
cdbp.PageInput{},
768+
nil,
769+
)
770+
require.NoError(t, err)
771+
assert.Equal(t, 1, ossaCount, "OS-site association must exist exactly once after both allocations")
772+
_ = ossas
773+
}
774+
655775
func testCreateAllocation(t *testing.T, dbSession *cdb.Session, ipamStorage cipam.Storage, user *cdbm.User, reqOrgName, reqBody string) *model.APIAllocation {
656776
ctx := context.Background()
657777
e := echo.New()

api/pkg/api/handler/instance.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1910,7 +1910,7 @@ func (uih UpdateInstanceHandler) buildInstanceUpdateRequestOsConfig(c echo.Conte
19101910

19111911
// Confirm ownership between tenant and OS.
19121912
// Provider-owned OS (TenantID is nil) is accessible to any tenant.
1913-
if os.TenantID != nil && os.TenantID.String() != instance.Tenant.ID.String() {
1913+
if !(os.TenantID == nil || *os.TenantID == instance.TenantID) {
19141914
logger.Error().Msg("OperatingSystem in request is not owned by tenant")
19151915
return nil, nil, cutil.NewAPIError(http.StatusBadRequest, "Operating system specified in request is not owned by Tenant", nil)
19161916
}

api/pkg/api/handler/ipxetemplate.go

Lines changed: 10 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
cdb "github.com/NVIDIA/ncx-infra-controller-rest/db/pkg/db"
3434
cdbm "github.com/NVIDIA/ncx-infra-controller-rest/db/pkg/db/model"
3535
cdbp "github.com/NVIDIA/ncx-infra-controller-rest/db/pkg/db/paginator"
36+
mapset "github.com/deckarep/golang-set/v2"
3637
"github.com/google/uuid"
3738
"github.com/labstack/echo/v4"
3839
"github.com/rs/zerolog"
@@ -113,8 +114,8 @@ func (h GetAllIpxeTemplateHandler) Handle(c echo.Context) error {
113114
// Note on tenant-path scoping: tenant access is established per-site via
114115
// `TenantSite` associations (a tenant may be associated with some sites of
115116
// a provider but not others).
116-
providerSites := map[uuid.UUID]struct{}{}
117-
tenantSites := map[uuid.UUID]struct{}{}
117+
providerSites := mapset.NewSet[uuid.UUID]()
118+
tenantSites := mapset.NewSet[uuid.UUID]()
118119

119120
if infrastructureProvider != nil {
120121
siteDAO := cdbm.NewSiteDAO(h.dbSession)
@@ -128,7 +129,7 @@ func (h GetAllIpxeTemplateHandler) Handle(c echo.Context) error {
128129
return cerr.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve provider sites, DB error", nil)
129130
}
130131
for i := range sites {
131-
providerSites[sites[i].ID] = struct{}{}
132+
providerSites.Add(sites[i].ID)
132133
}
133134
}
134135

@@ -144,16 +145,12 @@ func (h GetAllIpxeTemplateHandler) Handle(c echo.Context) error {
144145
return cerr.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve Tenant Site associations, DB error", nil)
145146
}
146147
for i := range tss {
147-
tenantSites[tss[i].SiteID] = struct{}{}
148+
tenantSites.Add(tss[i].SiteID)
148149
}
149150
}
150151

151152
isAuthorized := func(id uuid.UUID) bool {
152-
if _, ok := providerSites[id]; ok {
153-
return true
154-
}
155-
_, ok := tenantSites[id]
156-
return ok
153+
return providerSites.Contains(id) || tenantSites.Contains(id)
157154
}
158155

159156
// Determine the effective site filter:
@@ -169,20 +166,7 @@ func (h GetAllIpxeTemplateHandler) Handle(c echo.Context) error {
169166
}
170167
effectiveSiteIDs = requestedSiteIDs
171168
} else {
172-
effectiveSiteIDs = make([]uuid.UUID, 0, len(providerSites)+len(tenantSites))
173-
seen := map[uuid.UUID]struct{}{}
174-
for id := range providerSites {
175-
if _, ok := seen[id]; !ok {
176-
seen[id] = struct{}{}
177-
effectiveSiteIDs = append(effectiveSiteIDs, id)
178-
}
179-
}
180-
for id := range tenantSites {
181-
if _, ok := seen[id]; !ok {
182-
seen[id] = struct{}{}
183-
effectiveSiteIDs = append(effectiveSiteIDs, id)
184-
}
185-
}
169+
effectiveSiteIDs = providerSites.Union(tenantSites).ToSlice()
186170
}
187171

188172
// No authorized sites — neither provider-owned nor reachable via a tenant account.
@@ -214,27 +198,11 @@ func (h GetAllIpxeTemplateHandler) Handle(c echo.Context) error {
214198
return cerr.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve iPXE template site associations, DB error", nil)
215199
}
216200

217-
templateIDSet := map[uuid.UUID]struct{}{}
218-
templateIDs := make([]uuid.UUID, 0, len(associations))
201+
templateIDSet := mapset.NewSet[uuid.UUID]()
219202
for _, a := range associations {
220-
if _, ok := templateIDSet[a.IpxeTemplateID]; ok {
221-
continue
222-
}
223-
templateIDSet[a.IpxeTemplateID] = struct{}{}
224-
templateIDs = append(templateIDs, a.IpxeTemplateID)
225-
}
226-
227-
if len(templateIDs) == 0 {
228-
emptyPageResponse := pagination.NewPageResponse(*pageRequest.PageNumber, *pageRequest.PageSize, 0, pageRequest.OrderByStr)
229-
emptyPageHeader, err := json.Marshal(emptyPageResponse)
230-
if err != nil {
231-
logger.Error().Err(err).Msg("error marshaling empty pagination response")
232-
return cerr.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to generate pagination response header", nil)
233-
}
234-
c.Response().Header().Set(pagination.ResponseHeaderName, string(emptyPageHeader))
235-
logger.Info().Msg("finishing API handler")
236-
return c.JSON(http.StatusOK, []*model.APIIpxeTemplate{})
203+
templateIDSet.Add(a.IpxeTemplateID)
237204
}
205+
templateIDs := templateIDSet.ToSlice()
238206

239207
templateDAO := cdbm.NewIpxeTemplateDAO(h.dbSession)
240208
templates, total, err := templateDAO.GetAll(

db/pkg/db/model/ipxetemplate.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,10 @@ func (itd IpxeTemplateSQLDAO) GetAll(ctx context.Context, tx *db.Tx, filter Ipxe
217217

218218
templates := []IpxeTemplate{}
219219

220+
if filter.IDs != nil && len(filter.IDs) == 0 {
221+
return templates, 0, nil
222+
}
223+
220224
query := db.GetIDB(tx, itd.dbSession).NewSelect().Model(&templates)
221225

222226
query, err := itd.setQueryWithFilter(filter, query, span)

site-agent/pkg/components/testing.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,8 +288,11 @@ func TestInitElektra(t *testing.T) {
288288
if testElektra != nil {
289289
return
290290
}
291-
os.Setenv("CARBIDE_CERT_CHECK_INTERVAL", "1") // set this to check if certs were rotated every second to help with unit tests
292-
defer os.Unsetenv("CARBIDE_CERT_CHECK_INTERVAL")
291+
// Keep CARBIDE_CERT_CHECK_INTERVAL set for the lifetime of the test binary so that
292+
// the cert-reload goroutine (started inside api.Start below) reads the 1-second
293+
// interval before the ticker is created. Unsetting it via defer races against
294+
// goroutine scheduling and causes the goroutine to fall back to the 15-minute default.
295+
os.Setenv("CARBIDE_CERT_CHECK_INTERVAL", "1")
293296

294297
// Initialize test Site Agent
295298
log.Info().Msg("Elektra: Initializing test Site Agent")

workflow/pkg/activity/operatingsystem/operatingsystem.go

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,12 @@ func (mos ManageOperatingSystem) UpdateOperatingSystemsInDB(ctx context.Context,
617617
slogger.Error().Err(ossaErr).Msg("Failed to create site association for new OS")
618618
continue
619619
}
620+
621+
// Newly-created OS: definition and per-site association have just been
622+
// written with the reported state. Skip the existing-OS update path
623+
// below (it dereferences existingOS which is nil here) and do not add
624+
// to globalOrLimitedOSIDs because new records are always Local scope.
625+
continue
620626
}
621627

622628
// REST layer has already soft-deleted this OS (user-initiated)
@@ -662,32 +668,36 @@ func (mos ManageOperatingSystem) UpdateOperatingSystemsInDB(ctx context.Context,
662668
slogger.Error().Err(serr).Msg("Failed to create Operating System Site Association")
663669
continue
664670
}
665-
}
666-
667-
// Update existing Operating System Site Association
668-
_, uerr := ossaDAO.Update(ctx, nil, cdbm.OperatingSystemSiteAssociationUpdateInput{
669-
OperatingSystemSiteAssociationID: ossa.ID,
670-
Status: &ossaStatus,
671-
ControllerState: &controllerState,
672-
})
673-
if uerr != nil {
674-
slogger.Error().Err(uerr).Msg("Failed to update Operating System Site Association")
675-
continue
671+
} else {
672+
// Update existing Operating System Site Association
673+
_, uerr := ossaDAO.Update(ctx, nil, cdbm.OperatingSystemSiteAssociationUpdateInput{
674+
OperatingSystemSiteAssociationID: ossa.ID,
675+
Status: &ossaStatus,
676+
ControllerState: &controllerState,
677+
})
678+
if uerr != nil {
679+
slogger.Error().Err(uerr).Msg("Failed to update Operating System Site Association")
680+
continue
681+
}
676682
}
677683

678684
// TODO: Is this correct?
679685
if !isLocalScope {
680686
globalOrLimitedOSIDs[reportedOSID] = struct{}{}
681687
}
682688

683-
// Operating System exists in both REST and Site, update the REST record if it is newer
684-
// Backfill: older records may have been created with tenant_id set and no infrastructure_provider_id (before this ownership model was established).
685-
needsProviderBackfill := existingOS.InfrastructureProviderID == nil
686-
needsOrgBackfill := existingOS.Org == "" && site.Org != ""
687-
needsIsActiveCorrection := existingOS.IsActive != reportedOS.IsActive
688-
needsTenantClear := existingOS.TenantID != nil
689+
// Operating System exists in both REST and Site; update the REST record only for
690+
// Local-scoped OSes (Site is the source of truth for the definition).
691+
// Global/Limited OSes are REST-owned: skip the definition update and rely solely on
692+
// the aggregate status recomputation that runs at the end of this function.
693+
// Backfill: older records may have been created with tenant_id set and no
694+
// infrastructure_provider_id (before this ownership model was established).
695+
needsProviderBackfill := isLocalScope && existingOS.InfrastructureProviderID == nil
696+
needsOrgBackfill := isLocalScope && existingOS.Org == "" && site.Org != ""
697+
needsIsActiveCorrection := isLocalScope && existingOS.IsActive != reportedOS.IsActive
698+
needsTenantClear := isLocalScope && existingOS.TenantID != nil
689699

690-
if coreUpdated.After(existingOS.Updated) || needsProviderBackfill || needsOrgBackfill || needsIsActiveCorrection || needsTenantClear {
700+
if isLocalScope && (coreUpdated.After(existingOS.Updated) || needsProviderBackfill || needsOrgBackfill || needsIsActiveCorrection || needsTenantClear) {
691701
controllerState := cdbm.OperatingSystemStatusFromProtoMap[reportedOS.Status]
692702
if controllerState == "" {
693703
slogger.Warn().Str("Status", reportedOS.Status.String()).Msg("Received unknown status from Site, using `Syncing` as default")

0 commit comments

Comments
 (0)