diff --git a/.gitignore b/.gitignore index 8e079f42e20db66cf04e209b488c0b64d27e279b..50dcd270689c255c487b6c52517242380c2927e8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ terraform providers master -worker \ No newline at end of file +worker +housekeeper/bin +app/apis/nkd/_const.go \ No newline at end of file diff --git a/data/ignition/controlplane/systemd/install-cni-plugin.service.template b/data/ignition/controlplane/systemd/install-cni-plugin.service.template index cebdd4f240f79735413ed2225fca4cfc17073e7e..412614880dc587728b58c5e6ee1ad4861d1b8076 100644 --- a/data/ignition/controlplane/systemd/install-cni-plugin.service.template +++ b/data/ignition/controlplane/systemd/install-cni-plugin.service.template @@ -10,7 +10,7 @@ Type=oneshot RemainAfterExit=yes ExecStart=bash -c "sed -i 's#usr/libexec/#opt/libexec/#g' /etc/nkd/calico.yaml" ExecStart=bash -c "sed -i 's/# - name: CALICO_IPV4POOL_CIDR/- name: CALICO_IPV4POOL_CIDR/g' /etc/nkd/calico.yaml" -ExecStart=bash -c 'sed -i "s/# value: \"192.168.0.0\/16\"/ value: \"{{.IpSegment}}\/16\"/" /etc/nkd/calico.yaml' +ExecStart=bash -c "sed -i 's/# value: \"192.168.0.0\/16\"/ value: \"{{.IpSegment}}\/16\"/g' /etc/nkd/calico.yaml" ExecStart=kubectl apply -f /etc/nkd/calico.yaml --kubeconfig=/etc/kubernetes/admin.conf ExecStart=/bin/touch /var/log/install-cni-plugin.stamp diff --git a/housekeeper/operator/housekeeper-operator/controllers/update_controller.go b/housekeeper/operator/housekeeper-operator/controllers/update_controller.go index d0d6469482ee7dc8dbc75ae97547459764e65eea..3b2f35439ec44770e828b60fa7f2929e25419320 100644 --- a/housekeeper/operator/housekeeper-operator/controllers/update_controller.go +++ b/housekeeper/operator/housekeeper-operator/controllers/update_controller.go @@ -100,7 +100,7 @@ func reconcile(ctx context.Context, r common.ReadWriterClient, req ctrl.Request) return common.RequeueNow, err } - return common.NoRequeue, nil + return common.RequeueNow, nil } func getMasterNodesItems(ctx context.Context, r common.ReadWriterClient) ( @@ -132,7 +132,7 @@ func getWorkerNodesItems(ctx context.Context, r common.ReadWriterClient) ( } reqWorker, err := labels.NewRequirement(constants.LabelMaster, selection.DoesNotExist, nil) if err != nil { - logrus.Errorf("unable to create requirement %s: %v"+constants.LabelMaster, err) + logrus.Errorf("unable to create requirement %s: %v", constants.LabelMaster, err) return } nodesItems, err = getNodes(ctx, r, *reqUpgrade, *reqWorker) @@ -155,20 +155,19 @@ func getNodes(ctx context.Context, r common.ReadWriterClient, reqs ...labels.Req // Add the label to nodes func assignUpdated(ctx context.Context, r common.ReadWriterClient, nodeList []corev1.Node, - maxUnavailable int, upInstance housekeeperiov1alpha1.Update) error { + max int, upInstance housekeeperiov1alpha1.Update) error { var ( kubeVersionSpec = upInstance.Spec.KubeVersion osVersionSpec = upInstance.Spec.OSVersion count = 0 - wg sync.WaitGroup ) // Create a channel to receive the task result - resultChan := make(chan error) for _, node := range nodeList { - if count >= maxUnavailable { + if count >= max { count = 0 - // Wait for a timeout or update to complete after each maxUnavailable node upgrade - wg.Wait() + if err := waitForUpgradeComplete(node, kubeVersionSpec, osVersionSpec); err != nil { + return err + } } if conditionMet(node, kubeVersionSpec, osVersionSpec) { node.Labels[constants.LabelUpgrading] = "" @@ -177,33 +176,21 @@ func assignUpdated(ctx context.Context, r common.ReadWriterClient, nodeList []co return err } count++ - wg.Add(1) // Increase WaitGroup counter - go func(node corev1.Node) { - waitForUpgradeComplete(node, kubeVersionSpec, osVersionSpec, resultChan, &wg) - }(node) - } - } - close(resultChan) - // Iterate over the results channel and process the results of each task - for err := range resultChan { - if err != nil { - return err } } return nil } -func waitForUpgradeComplete(node corev1.Node, kubeVersionSpec string, osVersionSpec string, - resultChan chan<- error, wg *sync.WaitGroup) { - defer wg.Done() // Reduce the number of waitgroups when the execution is complete - +func waitForUpgradeComplete(node corev1.Node, kubeVersionSpec string, osVersionSpec string) error { ctx, cancel := context.WithTimeout(context.Background(), constants.NodeTimeout) defer cancel() done := make(chan struct{}) + success := false go func() { wait.Until(func() { if !conditionMet(node, kubeVersionSpec, osVersionSpec) { + success = true close(done) } }, 10*time.Second, ctx.Done()) @@ -211,15 +198,18 @@ func waitForUpgradeComplete(node corev1.Node, kubeVersionSpec string, osVersionS select { case <-done: - logrus.Infof("successful upgrade node: %s", node.Name) - resultChan <- nil + if success { + logrus.Infof("successful upgrade node: %s", node.Name) + } else { + logrus.Infof("upgrade conditions not met for node: %s", node.Name) + } + return nil case <-ctx.Done(): if ctx.Err() == context.DeadlineExceeded { logrus.Errorf("failed to upgrade node: %s: %v", node.Name, ctx.Err()) - resultChan <- ctx.Err() } + return ctx.Err() } - close(done) } func conditionMet(node corev1.Node, kubeVersionSpec string, osVersionSpec string) bool { diff --git a/housekeeper/pkg/constants/constants.go b/housekeeper/pkg/constants/constants.go index 05c167e4de88fa142d26e389525ace51bcd91409..7bdee24836277dd63ea6900dabc61a47c5804abb 100644 --- a/housekeeper/pkg/constants/constants.go +++ b/housekeeper/pkg/constants/constants.go @@ -33,6 +33,4 @@ const ( const ( // node upgrade timeout NodeTimeout = 5 * time.Minute - // time to sleep after processing maxUnavailable nodes - NodeSleepTime = 2 * time.Minute )