diff --git a/pkg/tnf/operator/starter.go b/pkg/tnf/operator/starter.go
index ba053b7632..a64eafc3c9 100644
--- a/pkg/tnf/operator/starter.go
+++ b/pkg/tnf/operator/starter.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"sync"
 
 	operatorv1 "github.com/openshift/api/operator/v1"
 	configv1informers "github.com/openshift/client-go/config/informers/externalversions"
@@ -16,8 +17,10 @@ import (
 	"github.com/openshift/library-go/pkg/operator/v1helpers"
 	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/client-go/dynamic"
 	"k8s.io/client-go/kubernetes"
+	corev1listers "k8s.io/client-go/listers/core/v1"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/klog/v2"
 
@@ -55,16 +58,40 @@ func HandleDualReplicaClusters(
 	runExternalEtcdSupportController(ctx, controllerContext, operatorClient, envVarGetter, kubeInformersForNamespaces, configInformers, networkInformer, controlPlaneNodeInformer, kubeClient)
 	runTnfResourceController(ctx, controllerContext, kubeClient, dynamicClient, operatorClient, kubeInformersForNamespaces)
 
-	// we need node names for assigning auth jobs to specific nodes
+	controlPlaneNodeLister := corev1listers.NewNodeLister(controlPlaneNodeInformer.GetIndexer())
+
+	// we need node names for assigning auth and after-setup jobs to specific nodes
+	var once sync.Once
 	klog.Infof("watching for nodes...")
 	_, err := controlPlaneNodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj interface{}) {
 			node, ok := obj.(*corev1.Node)
 			if !ok {
-				klog.Warningf("failed to convert node to Node %+v", obj)
+				klog.Warningf("failed to convert added object to Node %+v", obj)
+				return
+			}
+			klog.Infof("node added: %s", node.GetName())
+
+			// ensure we have both control plane nodes before creating jobs
+			nodeList, err := controlPlaneNodeLister.List(labels.Everything())
+			if err != nil {
+				klog.Errorf("failed to list control plane nodes while waiting to create TNF jobs: %v", err)
+				return
 			}
-			runTnfAuthJobController(ctx, node.GetName(), controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
-			runTnfAfterSetupJobController(ctx, node.GetName(), controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
+			if len(nodeList) != 2 {
+				klog.Info("not starting TNF jobs yet, waiting for 2 control plane nodes to exist")
+				return
+			}
+			// we can have 2 nodes on the first call of AddFunc already, ensure we create job controllers once only
+			once.Do(func() {
+				klog.Infof("found 2 control plane nodes (%q, %q), creating TNF jobs", nodeList[0].GetName(), nodeList[1].GetName())
+				// the order of job creation does not matter, the jobs wait on each other as needed
+				for _, node := range nodeList {
+					runTnfAuthJobController(ctx, node.GetName(), controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
+					runTnfAfterSetupJobController(ctx, node.GetName(), controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
+				}
+				runTnfSetupJobController(ctx, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
+			})
 		},
 	})
 	if err != nil {
@@ -72,8 +99,6 @@ func HandleDualReplicaClusters(
 		return false, err
 	}
 
-	runTnfSetupJobController(ctx, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
-
 	return true, nil
 }
 
diff --git a/pkg/tnf/pkg/config/cluster.go b/pkg/tnf/pkg/config/cluster.go
index 01bfad0ad1..76e98f754b 100644
--- a/pkg/tnf/pkg/config/cluster.go
+++ b/pkg/tnf/pkg/config/cluster.go
@@ -2,6 +2,7 @@ package config
 
 import (
 	"context"
+	"fmt"
 	"sort"
 
 	corev1 "k8s.io/api/core/v1"
@@ -24,10 +25,15 @@ func GetClusterConfig(ctx context.Context, kubeClient kubernetes.Interface) (Clu
 	clusterCfg := ClusterConfig{}
 
 	// Get nodes
-	nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{
+		LabelSelector: "node-role.kubernetes.io/master",
+	})
 	if err != nil {
 		return clusterCfg, err
 	}
+	if len(nodes.Items) != 2 {
+		return clusterCfg, fmt.Errorf("expected 2 nodes, got %d", len(nodes.Items))
+	}
 
 	sort.Slice(nodes.Items, func(i, j int) bool {
 		return nodes.Items[i].Name < nodes.Items[j].Name
diff --git a/pkg/tnf/pkg/config/cluster_test.go b/pkg/tnf/pkg/config/cluster_test.go
index e15d5f04bf..2debc1a712 100644
--- a/pkg/tnf/pkg/config/cluster_test.go
+++ b/pkg/tnf/pkg/config/cluster_test.go
@@ -17,6 +17,7 @@ type args struct {
 }
 
 func TestGetClusterConfig(t *testing.T) {
+
 	tests := []struct {
 		name    string
 		args    args
@@ -29,6 +30,9 @@ func TestGetClusterConfig(t *testing.T) {
 				{
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test1",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/master": "",
+						},
 					},
 					Status: corev1.NodeStatus{
 						Addresses: []corev1.NodeAddress{
@@ -41,6 +45,9 @@ func TestGetClusterConfig(t *testing.T) {
 				{
 					ObjectMeta: metav1.ObjectMeta{
 						Name: "test2",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/master": "",
+						},
 					},
 					Status: corev1.NodeStatus{
 						Addresses: []corev1.NodeAddress{
@@ -58,6 +65,44 @@ func TestGetClusterConfig(t *testing.T) {
 			},
 			wantErr: false,
 		},
+		{
+			name: "one node only should fail",
+			args: getArgs(t, []*corev1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "test1",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/master": "",
+						},
+					},
+				},
+			}),
+			want:    ClusterConfig{},
+			wantErr: true,
+		},
+		{
+			name: "one control plane node only should fail",
+			args: getArgs(t, []*corev1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "test1",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/master": "",
+						},
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "test2",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/no-master": "",
+						},
+					},
+				},
+			}),
+			want:    ClusterConfig{},
+			wantErr: true,
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -69,6 +114,12 @@ func TestGetClusterConfig(t *testing.T) {
 			if !reflect.DeepEqual(got, tt.want) {
 				t.Errorf("GetClusterConfig() got = %v, want %v", got, tt.want)
 			}
+			// delete nodes
+			c := tt.args.kubeClient
+			nodes, _ := c.CoreV1().Nodes().List(tt.args.ctx, metav1.ListOptions{})
+			for _, node := range nodes.Items {
+				c.CoreV1().Nodes().Delete(tt.args.ctx, node.Name, metav1.DeleteOptions{})
+			}
 		})
 	}
 }