diff --git a/pkg/cache/cluster.go b/pkg/cache/cluster.go
index 38f1e6016..18f987e3b 100644
--- a/pkg/cache/cluster.go
+++ b/pkg/cache/cluster.go
@@ -3,6 +3,7 @@ package cache
 import (
 	"context"
 	"fmt"
+	"os"
 	"runtime/debug"
 	"strings"
 	"sync"
@@ -59,6 +60,15 @@ const (
 	defaultEventProcessingInterval = 100 * time.Millisecond
 )
 
+// Environment variables
+const (
+	// EnvDisableClusterScopedParentRefs disables cluster-scoped parent owner reference resolution
+	// when set to any non-empty value. This provides an emergency fallback to disable the
+	// cluster-scoped to namespaced hierarchy traversal feature if performance issues are encountered.
+	// Default behavior (empty/unset) enables cluster-scoped parent owner reference resolution.
+	EnvDisableClusterScopedParentRefs = "GITOPS_ENGINE_DISABLE_CLUSTER_SCOPED_PARENT_REFS"
+)
+
 const (
 	// RespectRbacDisabled default value for respectRbac
 	RespectRbacDisabled = iota
@@ -187,6 +197,8 @@ func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCa
 		listRetryLimit:      1,
 		listRetryUseBackoff: false,
 		listRetryFunc:       ListRetryFuncNever,
+		// Check environment variable once at startup for performance
+		disableClusterScopedParentRefs: os.Getenv("GITOPS_ENGINE_DISABLE_CLUSTER_SCOPED_PARENT_REFS") != "",
 	}
 	for i := range opts {
 		opts[i](cache)
@@ -245,6 +257,10 @@ type clusterCache struct {
 	gvkParser *managedfields.GvkParser
 
 	respectRBAC int
+
+	// disableClusterScopedParentRefs is set at initialization to cache the environment variable check
+	// When true, cluster-scoped parent owner reference resolution is disabled for emergency fallback
+	disableClusterScopedParentRefs bool
 }
 
 type clusterCacheSync struct {
@@ -1065,7 +1081,12 @@ func (c *clusterCache) IterateHierarchyV2(keys []kube.ResourceKey, action func(r
 	}
 	for namespace, namespaceKeys := range keysPerNamespace {
 		nsNodes := c.nsIndex[namespace]
-		graph := buildGraph(nsNodes)
+		// Use cluster-scoped parent owner reference resolution unless disabled by environment variable
+		var allResources map[kube.ResourceKey]*Resource
+		if !c.disableClusterScopedParentRefs {
+			allResources = c.resources
+		}
+		graph := buildGraph(nsNodes, allResources)
 		visited := make(map[kube.ResourceKey]int)
 		for _, key := range namespaceKeys {
 			visited[key] = 0
@@ -1095,17 +1116,23 @@ func (c *clusterCache) IterateHierarchyV2(keys []kube.ResourceKey, action func(r
 	}
 }
 
-func buildGraph(nsNodes map[kube.ResourceKey]*Resource) map[kube.ResourceKey]map[types.UID]*Resource {
+func buildGraph(nsNodes map[kube.ResourceKey]*Resource, allResources map[kube.ResourceKey]*Resource) map[kube.ResourceKey]map[types.UID]*Resource {
 	// Prepare to construct a graph
 	nodesByUID := make(map[types.UID][]*Resource, len(nsNodes))
+	processingClusterScope := false
 	for _, node := range nsNodes {
 		nodesByUID[node.Ref.UID] = append(nodesByUID[node.Ref.UID], node)
+		// Check if we're processing the cluster-scoped namespace slice (namespace == "")
+		if node.Ref.Namespace == "" {
+			processingClusterScope = true
+		}
 	}
 
 	// In graph, they key is the parent and the value is a list of children.
 	graph := make(map[kube.ResourceKey]map[types.UID]*Resource)
 
 	// Loop through all nodes, calling each one "childNode," because we're only bothering with it if it has a parent.
+	// First process nodes in the current namespace
 	for _, childNode := range nsNodes {
 		for i, ownerRef := range childNode.OwnerRefs {
 			// First, backfill UID of inferred owner child references.
@@ -1115,7 +1142,16 @@ func buildGraph(nsNodes map[kube.ResourceKey]*Resource) map[kube.ResourceKey]map
 				// APIVersion is invalid, so we couldn't find the parent.
 				continue
 			}
-			graphKeyNode, ok := nsNodes[kube.ResourceKey{Group: group.Group, Kind: ownerRef.Kind, Namespace: childNode.Ref.Namespace, Name: ownerRef.Name}]
+			// Try same-namespace lookup first (preserves existing behavior)
+			sameNSKey := kube.ResourceKey{Group: group.Group, Kind: ownerRef.Kind, Namespace: childNode.Ref.Namespace, Name: ownerRef.Name}
+			graphKeyNode, ok := nsNodes[sameNSKey]
+
+			// If not found and we have cluster-scoped parent capabilities, try cluster-scoped lookup
+			if !ok && allResources != nil {
+				clusterScopedKey := kube.ResourceKey{Group: group.Group, Kind: ownerRef.Kind, Namespace: "", Name: ownerRef.Name}
+				graphKeyNode, ok = allResources[clusterScopedKey]
+			}
+
 			if !ok {
 				// No resource found with the given graph key, so move on.
 				continue
@@ -1126,6 +1162,18 @@ func buildGraph(nsNodes map[kube.ResourceKey]*Resource) map[kube.ResourceKey]map
 
 			// Now that we have the UID of the parent, update the graph.
 			uidNodes, ok := nodesByUID[ownerRef.UID]
+			if !ok && allResources != nil {
+				// If parent not found in current namespace, check if it exists in allResources
+				// and create a temporary uidNodes list for cluster-scoped parent relationships
+				for _, parentCandidate := range allResources {
+					if parentCandidate.Ref.UID == ownerRef.UID {
+						uidNodes = []*Resource{parentCandidate}
+						ok = true
+						break
+					}
+				}
+			}
+
 			if ok {
 				for _, uidNode := range uidNodes {
 					// Update the graph for this owner to include the child.
@@ -1148,6 +1196,34 @@ func buildGraph(nsNodes map[kube.ResourceKey]*Resource) map[kube.ResourceKey]map
 			}
 		}
 	}
+
+	// Second pass: process namespaced children of cluster-scoped parents if allResources is provided
+	// Skip this pass when processing regular namespaces (only needed when processing cluster-scoped resources)
+	if allResources != nil && processingClusterScope {
+		for _, childNode := range allResources {
+			// Skip if already processed in the current namespace
+			if _, exists := nsNodes[childNode.ResourceKey()]; exists {
+				continue
+			}
+
+			// Check if this child has a cluster-scoped parent that we're currently processing
+			for _, ownerRef := range childNode.OwnerRefs {
+				group, err := schema.ParseGroupVersion(ownerRef.APIVersion)
+				if err != nil {
+					continue
+				}
+				parentKey := kube.ResourceKey{Group: group.Group, Kind: ownerRef.Kind, Namespace: "", Name: ownerRef.Name}
+				if parentNode, exists := nsNodes[parentKey]; exists {
+					// Found a cluster-scoped parent → namespaced child relationship
+					if _, ok := graph[parentNode.ResourceKey()]; !ok {
+						graph[parentNode.ResourceKey()] = make(map[types.UID]*Resource)
+					}
+					graph[parentNode.ResourceKey()][childNode.Ref.UID] = childNode
+				}
+			}
+		}
+	}
+
 	return graph
 }
diff --git a/pkg/cache/cluster_test.go b/pkg/cache/cluster_test.go
index 7bac78043..bc78e797f 100644
--- a/pkg/cache/cluster_test.go
+++ b/pkg/cache/cluster_test.go
@@ -24,6 +24,7 @@ import (
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/dynamic/fake"
 	"k8s.io/client-go/kubernetes/scheme"
@@ -91,11 +92,11 @@ func newClusterWithOptions(_ testing.TB, opts []UpdateSettingsFunc, objs ...runt
 	client.PrependReactor("list", "*", func(action testcore.Action) (handled bool, ret runtime.Object, err error) {
 		handled, ret, err = reactor.React(action)
 		if err != nil || !handled {
-			return
+			return handled, ret, fmt.Errorf("reactor failed: %w", err)
 		}
 		// make sure list response have resource version
 		ret.(metav1.ListInterface).SetResourceVersion("123")
-		return
+		return handled, ret, nil
 	})
 
 	apiResources := []kube.APIResourceInfo{{
@@ -1157,6 +1158,389 @@ func TestIterateHierachyV2(t *testing.T) {
 	})
 }
 
+// TestIterateHierarchyV2_ClusterScopedParentOwnerReference tests that cluster-scoped parent
+// owner references work correctly, specifically cluster-scoped resources with
+// namespaced children
+func TestIterateHierarchyV2_ClusterScopedParentOwnerReference(t *testing.T) {
+	cluster := newCluster(t)
+
+	// Create cluster-scoped parent resource (ProviderRevision)
+	parentUID := types.UID("parent-uid-123")
+	clusterScopedParent := &Resource{
+		Ref: corev1.ObjectReference{
+			APIVersion: "pkg.crossplane.io/v1",
+			Kind:       "ProviderRevision",
+			Name:       "provider-aws-cloudformation-3b2c213545b8",
+			UID:        parentUID,
+			// No namespace = cluster-scoped
+		},
+		OwnerRefs: []metav1.OwnerReference{},
+	}
+
+	// Create namespaced child with ownerReference to cluster-scoped parent
+	namespacedChild := &Resource{
+		Ref: corev1.ObjectReference{
+			APIVersion: "apps/v1",
+			Kind:       "Deployment",
+			Name:       "provider-aws-cloudformation-3b2c213545b8",
+			Namespace:  "crossplane-system",
+			UID:        types.UID("child-uid-456"),
+		},
+		OwnerRefs: []metav1.OwnerReference{{
+			APIVersion: "pkg.crossplane.io/v1",
+			Kind:       "ProviderRevision",
+			Name:       "provider-aws-cloudformation-3b2c213545b8",
+			// UID: parentUID, // Don't set UID - let it be resolved via cross-namespace lookup
+		}},
+	}
+
+	// Create cluster-scoped child with ownerReference to cluster-scoped parent (this should work already)
+	clusterScopedChild := &Resource{
+		Ref: corev1.ObjectReference{
+			APIVersion: "rbac.authorization.k8s.io/v1",
+			Kind:       "ClusterRole",
+			Name:       "crossplane:provider:provider-aws-cloudformation-3b2c213545b8:system",
+			UID:        types.UID("child-uid-789"),
+			// No namespace = cluster-scoped
+		},
+		OwnerRefs: []metav1.OwnerReference{{
+			APIVersion: "pkg.crossplane.io/v1",
+			Kind:       "ProviderRevision",
+			Name:       "provider-aws-cloudformation-3b2c213545b8",
+			UID:        parentUID,
+		}},
+	}
+
+	// Add all resources to cluster cache
+	cluster.setNode(clusterScopedParent)
+	cluster.setNode(namespacedChild)
+	cluster.setNode(clusterScopedChild)
+
+	// Test hierarchy traversal starting from cluster-scoped parent
+	var visitedResources []*Resource
+	cluster.IterateHierarchyV2(
+		[]kube.ResourceKey{clusterScopedParent.ResourceKey()},
+		func(resource *Resource, _ map[kube.ResourceKey]*Resource) bool {
+			visitedResources = append(visitedResources, resource)
+			return true
+		},
+	)
+
+	// Should visit parent + both children (3 resources)
+	assert.Len(t, visitedResources, 3, "Should visit parent and both children")
+
+	visitedNames := make([]string, len(visitedResources))
+	for i, res := range visitedResources {
+		visitedNames[i] = res.Ref.Name
+	}
+
+	// Check we have the expected resources by type and namespace combination
+	foundParent := false
+	foundClusterChild := false
+	foundNamespacedChild := false
+
+	for _, res := range visitedResources {
+		switch {
+		case res.Ref.Kind == "ProviderRevision" && res.Ref.Namespace == "":
+			foundParent = true
+		case res.Ref.Kind == "ClusterRole" && res.Ref.Namespace == "":
+			foundClusterChild = true
+		case res.Ref.Kind == "Deployment" && res.Ref.Namespace == "crossplane-system":
+			foundNamespacedChild = true
+		}
+	}
+
+	assert.True(t, foundParent, "Should visit ProviderRevision parent")
+	assert.True(t, foundClusterChild, "Should visit ClusterRole child")
+	assert.True(t, foundNamespacedChild, "Should visit Deployment child (this tests the fix)")
+}
+
+// TestIterateHierarchyV2_DisabledClusterScopedParents tests that the environment variable
+// disables cluster-scoped parent owner reference resolution
+func TestIterateHierarchyV2_DisabledClusterScopedParents(t *testing.T) {
+	// Set environment variable to disable cluster-scoped parent functionality
+	t.Setenv("GITOPS_ENGINE_DISABLE_CLUSTER_SCOPED_PARENT_REFS", "1")
+
+	cluster := newCluster(t)
+
+	// Create cluster-scoped parent resource (ProviderRevision)
+	parentUID := types.UID("parent-uid-123")
+	clusterScopedParent := &Resource{
+		Ref: corev1.ObjectReference{
+			APIVersion: "pkg.crossplane.io/v1",
+			Kind:       "ProviderRevision",
+			Name:       "provider-aws-cloudformation-3b2c213545b8",
+			UID:        parentUID,
+			// No namespace = cluster-scoped
+		},
+		OwnerRefs: []metav1.OwnerReference{},
+	}
+
+	// Create namespaced child with ownerReference to cluster-scoped parent
+	namespacedChild := &Resource{
+		Ref: corev1.ObjectReference{
+			APIVersion: "apps/v1",
+			Kind:       "Deployment",
+			Name:       "provider-aws-cloudformation-3b2c213545b8",
+			Namespace:  "crossplane-system",
+			UID:        types.UID("child-uid-456"),
+		},
+		OwnerRefs: []metav1.OwnerReference{{
+			APIVersion: "pkg.crossplane.io/v1",
+			Kind:       "ProviderRevision",
+			Name:       "provider-aws-cloudformation-3b2c213545b8",
+		}},
+	}
+
+	// Add resources to cluster cache
+	cluster.setNode(clusterScopedParent)
+	cluster.setNode(namespacedChild)
+
+	// Test hierarchy traversal starting from cluster-scoped parent
+	var visitedResources []*Resource
+	cluster.IterateHierarchyV2(
+		[]kube.ResourceKey{clusterScopedParent.ResourceKey()},
+		func(resource *Resource, _ map[kube.ResourceKey]*Resource) bool {
+			visitedResources = append(visitedResources, resource)
+			return true
+		},
+	)
+
+	// When disabled, should only visit the parent (1 resource)
+	// because cluster-scoped parent resolution is disabled
+	assert.Len(t, visitedResources, 1, "Should only visit parent when cluster-scoped parent resolution disabled")
+	assert.Equal(t, "ProviderRevision", visitedResources[0].Ref.Kind)
+}
+
+// buildGraphTestHelper creates test resources and maps for buildGraph testing
+type buildGraphTestHelper struct {
+	parentUID types.UID
+	childUID  types.UID
+	parent    *Resource
+	child     *Resource
+}
+
+func newBuildGraphTestHelper() *buildGraphTestHelper {
+	parentUID := types.UID("parent-123")
+	childUID := types.UID("child-456")
+	return &buildGraphTestHelper{
+		parentUID: parentUID,
+		childUID:  childUID,
+	}
+}
+
+func (h *buildGraphTestHelper) createParent(namespace string) {
+	h.parent = &Resource{
+		Ref: corev1.ObjectReference{
+			APIVersion: "v1",
+			Kind:       "ConfigMap",
+			Name:       "parent",
+			Namespace:  namespace,
+			UID:        h.parentUID,
+		},
+	}
+}
+
+func (h *buildGraphTestHelper) createChild(ownerRefs ...metav1.OwnerReference) {
+	h.child = &Resource{
+		Ref: corev1.ObjectReference{
+			APIVersion: "v1",
+			Kind:       "Pod",
+			Name:       "child",
+			Namespace:  "test-ns",
+			UID:        h.childUID,
+		},
+		OwnerRefs: ownerRefs,
+	}
+}
+
+func (h *buildGraphTestHelper) standardOwnerRef() metav1.OwnerReference {
+	return metav1.OwnerReference{
+		APIVersion: "v1",
+		Kind:       "ConfigMap",
+		Name:       "parent",
+		UID:        h.parentUID,
+	}
+}
+
+func (h *buildGraphTestHelper) ownerRefWithoutUID() metav1.OwnerReference {
+	return metav1.OwnerReference{
+		APIVersion: "v1",
+		Kind:       "ConfigMap",
+		Name:       "parent",
+	}
+}
+
+func (h *buildGraphTestHelper) invalidOwnerRef() metav1.OwnerReference {
+	return metav1.OwnerReference{
+		APIVersion: "invalid/api/version",
+		Kind:       "ConfigMap",
+		Name:       "parent",
+	}
+}
+
+func (h *buildGraphTestHelper) nsNodes() map[kube.ResourceKey]*Resource {
+	result := make(map[kube.ResourceKey]*Resource)
+	if h.parent != nil {
+		result[h.parent.ResourceKey()] = h.parent
+	}
+	if h.child != nil {
+		result[h.child.ResourceKey()] = h.child
+	}
+	return result
+}
+
+func (h *buildGraphTestHelper) allResources() map[kube.ResourceKey]*Resource {
+	return h.nsNodes() // For simplicity, use same as nsNodes unless overridden
+}
+
+// Common assertion helpers
+func (h *buildGraphTestHelper) assertParentChildRelationship(t *testing.T, graph map[kube.ResourceKey]map[types.UID]*Resource) {
+	t.Helper()
+	assert.Contains(t, graph, h.parent.ResourceKey())
+	assert.Contains(t, graph[h.parent.ResourceKey()], h.childUID)
+}
+
+func (h *buildGraphTestHelper) assertEmptyGraph(t *testing.T, graph map[kube.ResourceKey]map[types.UID]*Resource) {
+	t.Helper()
+	assert.Empty(t, graph)
+}
+
+func (h *buildGraphTestHelper) assertUIDBackfilled(t *testing.T) {
+	t.Helper()
+	assert.Equal(t, h.parentUID, h.child.OwnerRefs[0].UID)
+}
+
+// TestBuildGraph_NilAllResources tests buildGraph with nil allResources parameter
+func TestBuildGraph_NilAllResources(t *testing.T) {
+	h := newBuildGraphTestHelper()
+	h.createParent("test-ns")
+	h.createChild(h.standardOwnerRef())
+
+	graph := buildGraph(h.nsNodes(), nil)
+	h.assertParentChildRelationship(t, graph)
+}
+
+// TestBuildGraph_InvalidAPIVersion tests handling of invalid API versions in owner references
+func TestBuildGraph_InvalidAPIVersion(t *testing.T) {
+	h := newBuildGraphTestHelper()
+	h.createChild(h.invalidOwnerRef())
+
+	graph := buildGraph(h.nsNodes(), h.allResources())
+	h.assertEmptyGraph(t, graph)
+}
+
+// TestBuildGraph_CrossNamespaceMissingUID tests cross-namespace parent resolution with missing UID
+func TestBuildGraph_CrossNamespaceMissingUID(t *testing.T) {
+	h := newBuildGraphTestHelper()
+	h.createParent("") // Cluster-scoped
+	h.createChild(h.ownerRefWithoutUID())
+
+	allResources := map[kube.ResourceKey]*Resource{
+		h.parent.ResourceKey(): h.parent,
+		h.child.ResourceKey():  h.child,
+	}
+	nsNodes := map[kube.ResourceKey]*Resource{h.child.ResourceKey(): h.child}
+
+	graph := buildGraph(nsNodes, allResources)
+	h.assertParentChildRelationship(t, graph)
+	h.assertUIDBackfilled(t)
+}
+
+// TestBuildGraph_NonExistentParent tests cross-namespace child with non-existent parent
+func TestBuildGraph_NonExistentParent(t *testing.T) {
+	h := newBuildGraphTestHelper()
+	nonExistentOwnerRef := metav1.OwnerReference{
+		APIVersion: "v1", Kind: "ConfigMap", Name: "non-existent-parent",
+	}
+	h.createChild(nonExistentOwnerRef)
+
+	graph := buildGraph(h.nsNodes(), h.allResources())
+	h.assertEmptyGraph(t, graph)
+}
+
+// TestBuildGraph_CrossNamespaceUIDLookup tests cross-namespace parent lookup by UID
+func TestBuildGraph_CrossNamespaceUIDLookup(t *testing.T) {
+	h := newBuildGraphTestHelper()
+	h.createParent("") // Cluster-scoped
+	h.createChild(h.standardOwnerRef())
+
+	nsNodes := map[kube.ResourceKey]*Resource{h.child.ResourceKey(): h.child}
+	allResources := map[kube.ResourceKey]*Resource{
+		h.parent.ResourceKey(): h.parent,
+		h.child.ResourceKey():  h.child,
+	}
+
+	graph := buildGraph(nsNodes, allResources)
+	h.assertParentChildRelationship(t, graph)
+}
+
+// TestBuildGraph_DuplicateUIDs tests handling of resources with duplicate UIDs
+func TestBuildGraph_DuplicateUIDs(t *testing.T) {
+	duplicateUID := types.UID("duplicate-456")
+	h := newBuildGraphTestHelper()
+	h.createParent("test-ns")
+
+	// Create two children with same UID but different API versions (simulating replicasets from different API groups)
+	child1 := &Resource{
+		Ref: corev1.ObjectReference{
+			APIVersion: "apps/v1",
+			Kind:       "ReplicaSet",
+			Name:       "child-apps",
+			Namespace:  "test-ns",
+			UID:        duplicateUID,
+		},
+		OwnerRefs: []metav1.OwnerReference{h.standardOwnerRef()},
+	}
+
+	child2 := &Resource{
+		Ref: corev1.ObjectReference{
+			APIVersion: "extensions/v1beta1",
+			Kind:       "ReplicaSet",
+			Name:       "child-extensions",
+			Namespace:  "test-ns",
+			UID:        duplicateUID,
+		},
+		OwnerRefs: []metav1.OwnerReference{h.standardOwnerRef()},
+	}
+
+	nsNodes := map[kube.ResourceKey]*Resource{
+		h.parent.ResourceKey(): h.parent,
+		child1.ResourceKey():   child1,
+		child2.ResourceKey():   child2,
+	}
+
+	graph := buildGraph(nsNodes, nil)
+
+	assert.Contains(t, graph, h.parent.ResourceKey())
+	assert.Contains(t, graph[h.parent.ResourceKey()], duplicateUID)
+	assert.NotNil(t, graph[h.parent.ResourceKey()][duplicateUID])
+}
+
+// TestIterateHierarchyV2_EdgeCases tests additional edge cases for hierarchy iteration
+func TestIterateHierarchyV2_EdgeCases(t *testing.T) {
+	cluster := newCluster(t)
+
+	t.Run("EmptyKeysList", func(t *testing.T) {
+		var visited []kube.ResourceKey
+		cluster.IterateHierarchyV2([]kube.ResourceKey{}, func(resource *Resource, _ map[kube.ResourceKey]*Resource) bool {
+			visited = append(visited, resource.ResourceKey())
+			return true
+		})
+		assert.Empty(t, visited)
+	})
+
+	t.Run("NonExistentKeys", func(t *testing.T) {
+		var visited []kube.ResourceKey
+		nonExistentKey := kube.ResourceKey{Group: "fake", Kind: "Fake", Namespace: "fake", Name: "fake"}
+		cluster.IterateHierarchyV2([]kube.ResourceKey{nonExistentKey}, func(resource *Resource, _ map[kube.ResourceKey]*Resource) bool {
+			visited = append(visited, resource.ResourceKey())
+			return true
+		})
+		assert.Empty(t, visited)
+	})
+}
+
 // Test_watchEvents_Deadlock validates that starting watches will not create a deadlock
 // caused by using improper locking in various callback methods when there is a high load on the
 // system.
@@ -1189,7 +1573,7 @@ func Test_watchEvents_Deadlock(t *testing.T) {
 			// deadlock.RLock()
 			// defer deadlock.RUnlock()
 
-			return
+			return info, cacheManifest
 		}),
 	}, res1, res2, testDeploy())
 	defer func() {
@@ -1273,7 +1657,7 @@ func BenchmarkBuildGraph(b *testing.B) {
 	testResources := buildTestResourceMap()
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
-		buildGraph(testResources)
+		buildGraph(testResources, nil)
 	}
 }