From 28532c4c74caa36fc021f9f8b15a5df653e73aaa Mon Sep 17 00:00:00 2001 From: Ellis Tarn Date: Thu, 18 Jul 2024 11:15:24 -0700 Subject: [PATCH] test: Improved Disruption Test Suite duration from 527 seconds to 38 seconds (#1435) --- .../disruption/consolidation_test.go | 255 ++++++------------ pkg/controllers/disruption/drift_test.go | 53 +--- pkg/controllers/disruption/emptiness_test.go | 43 +-- pkg/controllers/disruption/suite_test.go | 69 ++--- 4 files changed, 95 insertions(+), 325 deletions(-) diff --git a/pkg/controllers/disruption/consolidation_test.go b/pkg/controllers/disruption/consolidation_test.go index 7a1519c55a..2def836fd1 100644 --- a/pkg/controllers/disruption/consolidation_test.go +++ b/pkg/controllers/disruption/consolidation_test.go @@ -105,11 +105,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - Expect(recorder.Calls("Unconsolidatable")).To(Equal(0)) }) It("should fire an event for ConsolidationDisabled when the NodePool has consolidateAfter set to 'Never'", func() { @@ -119,12 +115,7 @@ var _ = Describe("Consolidation", func() { ExpectManualBinding(ctx, env.Client, pod, node) ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - // We get six calls here because we have Nodes and NodeClaims that fired for this event // and each of the consolidation mechanisms specifies that this event should be fired Expect(recorder.Calls("Unconsolidatable")).To(Equal(6)) @@ -147,11 +138,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - wg := sync.WaitGroup{} - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - for _, ct := range consolidationTypes { ExpectMetricGaugeValue(disruption.EligibleNodesGauge, 0, map[string]string{ "method": "consolidation", @@ -165,7 +152,8 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - ExpectTriggerVerifyAction(&wg) + wg := sync.WaitGroup{} + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -214,7 +202,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -240,7 +228,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -264,13 +252,7 @@ var _ = Describe("Consolidation", func() { } // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) - ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ "nodepool": nodePool.Name, }) @@ -319,7 +301,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -364,9 +346,9 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) var wg sync.WaitGroup + ExpectToWait(&wg) // Reconcile 5 times, enqueuing 3 commands total. for i := 0; i < 5; i++ { - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) } wg.Wait() @@ -426,7 +408,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -490,7 +472,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -552,12 +534,7 @@ var _ = Describe("Consolidation", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - for _, np := range nps { metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ "nodepool": np.Name, @@ -587,13 +564,10 @@ var _ = Describe("Consolidation", func() { candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, emptyConsolidation.ShouldDisrupt, emptyConsolidation.Class(), queue) Expect(err).To(Succeed()) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) cmd, results, err := emptyConsolidation.ComputeCommand(ctx, budgets, candidates...) Expect(err).To(Succeed()) Expect(results).To(Equal(pscheduling.Results{})) Expect(cmd).To(Equal(disruption.Command{})) - wg.Wait() Expect(emptyConsolidation.IsConsolidated()).To(BeFalse()) }) @@ -651,13 +625,10 @@ var _ = Describe("Consolidation", func() { candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, emptyConsolidation.ShouldDisrupt, emptyConsolidation.Class(), queue) Expect(err).To(Succeed()) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) cmd, results, err := emptyConsolidation.ComputeCommand(ctx, budgets, candidates...) Expect(err).To(Succeed()) Expect(results).To(Equal(pscheduling.Results{})) Expect(cmd).To(Equal(disruption.Command{})) - wg.Wait() Expect(emptyConsolidation.IsConsolidated()).To(BeFalse()) }) @@ -678,13 +649,10 @@ var _ = Describe("Consolidation", func() { candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, multiConsolidation.ShouldDisrupt, multiConsolidation.Class(), queue) Expect(err).To(Succeed()) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) cmd, results, err := multiConsolidation.ComputeCommand(ctx, budgets, candidates...) Expect(err).To(Succeed()) Expect(results).To(Equal(pscheduling.Results{})) Expect(cmd).To(Equal(disruption.Command{})) - wg.Wait() Expect(multiConsolidation.IsConsolidated()).To(BeFalse()) }) @@ -742,13 +710,10 @@ var _ = Describe("Consolidation", func() { candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, multiConsolidation.ShouldDisrupt, multiConsolidation.Class(), queue) Expect(err).To(Succeed()) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) cmd, results, err := multiConsolidation.ComputeCommand(ctx, budgets, candidates...) Expect(err).To(Succeed()) Expect(results).To(Equal(pscheduling.Results{})) Expect(cmd).To(Equal(disruption.Command{})) - wg.Wait() Expect(multiConsolidation.IsConsolidated()).To(BeFalse()) }) @@ -769,13 +734,10 @@ var _ = Describe("Consolidation", func() { candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, singleConsolidation.ShouldDisrupt, singleConsolidation.Class(), queue) Expect(err).To(Succeed()) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) cmd, results, err := singleConsolidation.ComputeCommand(ctx, budgets, candidates...) Expect(err).To(Succeed()) Expect(results).To(Equal(pscheduling.Results{})) Expect(cmd).To(Equal(disruption.Command{})) - wg.Wait() Expect(singleConsolidation.IsConsolidated()).To(BeFalse()) }) @@ -833,13 +795,10 @@ var _ = Describe("Consolidation", func() { candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, singleConsolidation.ShouldDisrupt, singleConsolidation.Class(), queue) Expect(err).To(Succeed()) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) cmd, results, err := singleConsolidation.ComputeCommand(ctx, budgets, candidates...) Expect(err).To(Succeed()) Expect(results).To(Equal(pscheduling.Results{})) Expect(cmd).To(Equal(disruption.Command{})) - wg.Wait() Expect(singleConsolidation.IsConsolidated()).To(BeFalse()) }) @@ -875,7 +834,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -897,7 +856,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) wg := sync.WaitGroup{} - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -1012,7 +971,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -1077,7 +1036,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -1136,11 +1095,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - ExpectSingletonReconciled(ctx, queue) // Cascade any deletion of the nodeclaim to the node @@ -1187,7 +1142,7 @@ var _ = Describe("Consolidation", func() { // consolidation won't delete the old nodeclaim until the new nodeclaim is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -1278,11 +1233,8 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) // consolidation won't delete the old nodeclaim until the new nodeclaim is ready - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) ExpectSingletonReconciled(ctx, queue) - wg.Wait() // shouldn't delete the node Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1)) @@ -1324,11 +1276,8 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) // consolidation won't delete the old nodeclaim until the new nodeclaim is ready - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) ExpectSingletonReconciled(ctx, queue) - wg.Wait() // shouldn't delete the node Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1)) @@ -1402,10 +1351,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) // consolidation won't delete the old nodeclaim until the new nodeclaim is ready - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // we didn't create a new nodeclaim or delete the old one Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -1492,7 +1438,7 @@ var _ = Describe("Consolidation", func() { // consolidation won't delete the old nodeclaim until the new nodeclaim is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -1617,7 +1563,7 @@ var _ = Describe("Consolidation", func() { // consolidation won't delete the old nodeclaim until the new nodeclaim is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -1742,7 +1688,7 @@ var _ = Describe("Consolidation", func() { // consolidation won't delete the old nodeclaim until the new nodeclaim is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -1863,10 +1809,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) // consolidation won't delete the old nodeclaim until the new nodeclaim is ready - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // we didn't create a new nodeclaim or delete the old one Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -1913,7 +1856,7 @@ var _ = Describe("Consolidation", func() { // consolidation won't delete the old nodeclaim until the new nodeclaim is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2065,7 +2008,7 @@ var _ = Describe("Consolidation", func() { // consolidation won't delete the old nodeclaim until the new nodeclaim is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2139,7 +2082,7 @@ var _ = Describe("Consolidation", func() { // consolidation won't delete the old node until the new node is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2234,7 +2177,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2325,7 +2268,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2419,10 +2362,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -2523,10 +2463,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // Expect to not create or delete more nodeclaims Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) @@ -2586,7 +2523,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2633,7 +2570,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2687,7 +2624,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2749,7 +2686,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2801,7 +2738,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2849,7 +2786,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2901,12 +2838,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{nodes[0], nodes[1]}, []*v1.NodeClaim{nodeClaims[0], nodeClaims[1]}) fakeClock.Step(10 * time.Minute) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - ExpectSingletonReconciled(ctx, queue) // Cascade any deletion of the nodeclaim to the node @@ -2955,12 +2887,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{nodes[0], nodes[1]}, []*v1.NodeClaim{nodeClaims[0], nodeClaims[1]}) fakeClock.Step(10 * time.Minute) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - ExpectSingletonReconciled(ctx, queue) // Cascade any deletion of the nodeclaim to the node @@ -3005,7 +2932,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -3050,11 +2977,8 @@ var _ = Describe("Consolidation", func() { ExpectReconcileSucceeded(ctx, nodeClaimStateController, client.ObjectKeyFromObject(nodeClaims[0])) ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{nodes[1]}, []*v1.NodeClaim{nodeClaims[1]}) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) ExpectSingletonReconciled(ctx, queue) - wg.Wait() // shouldn't delete the node Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(2)) @@ -3196,7 +3120,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{consolidatableNode}, []*v1.NodeClaim{consolidatableNodeClaim}) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -3252,7 +3176,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -3362,7 +3286,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -3695,10 +3619,6 @@ var _ = Describe("Consolidation", func() { ExpectApplied(ctx, env.Client, doNotDisruptPod) ExpectManualBinding(ctx, env.Client, doNotDisruptPod, node) - // Step forward to satisfy the validation timeout and wait for the reconcile to finish - ExpectTriggerVerifyAction(&wg) - wg.Wait() - // we would normally be able to replace a node, but we are blocked by the do-not-disrupt pods during validation Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1)) @@ -3745,10 +3665,6 @@ var _ = Describe("Consolidation", func() { ExpectApplied(ctx, env.Client, blockingPDBPod, pdb) ExpectManualBinding(ctx, env.Client, blockingPDBPod, node) - // Step forward to satisfy the validation timeout and wait for the reconcile to finish - ExpectTriggerVerifyAction(&wg) - wg.Wait() - // we would normally be able to replace a node, but we are blocked by the PDB during validation Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1)) @@ -3795,10 +3711,6 @@ var _ = Describe("Consolidation", func() { ExpectManualBinding(ctx, env.Client, doNotDisruptPods[0], nodes[0]) ExpectManualBinding(ctx, env.Client, doNotDisruptPods[1], nodes[1]) - // Step forward to satisfy the validation timeout and wait for the reconcile to finish - ExpectTriggerVerifyAction(&wg) - wg.Wait() - // we would normally be able to consolidate down to a single node, but we are blocked by the do-not-disrupt pods during validation Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(2)) Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(2)) @@ -3848,10 +3760,6 @@ var _ = Describe("Consolidation", func() { ExpectManualBinding(ctx, env.Client, blockingPDBPods[0], nodes[0]) ExpectManualBinding(ctx, env.Client, blockingPDBPods[1], nodes[1]) - // Step forward to satisfy the validation timeout and wait for the reconcile to finish - ExpectTriggerVerifyAction(&wg) - wg.Wait() - // we would normally be able to consolidate down to a single node, but we are blocked by the PDB during validation Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(2)) Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(2)) @@ -3859,7 +3767,6 @@ var _ = Describe("Consolidation", func() { ExpectExists(ctx, env.Client, nodes[1]) }) }) - Context("Multi-NodeClaim", func() { var nodeClaims, spotNodeClaims []*v1.NodeClaim var nodes, spotNodes []*corev1.Node @@ -3902,56 +3809,55 @@ var _ = Describe("Consolidation", func() { }, }) }) - DescribeTable("can merge 3 nodes into 1", - func(spotToSpot bool) { - nodeClaims = lo.Ternary(spotToSpot, spotNodeClaims, nodeClaims) - nodes = lo.Ternary(spotToSpot, spotNodes, nodes) - // create our RS so we can link a pod to it - rs := test.ReplicaSet() - ExpectApplied(ctx, env.Client, rs) - pods := test.Pods(3, test.PodOptions{ - ObjectMeta: metav1.ObjectMeta{Labels: labels, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "apps/v1", - Kind: "ReplicaSet", - Name: rs.Name, - UID: rs.UID, - Controller: lo.ToPtr(true), - BlockOwnerDeletion: lo.ToPtr(true), - }, - }}}) + DescribeTable("can merge 3 nodes into 1", func(spotToSpot bool) { + nodeClaims = lo.Ternary(spotToSpot, spotNodeClaims, nodeClaims) + nodes = lo.Ternary(spotToSpot, spotNodes, nodes) + // create our RS so we can link a pod to it + rs := test.ReplicaSet() + ExpectApplied(ctx, env.Client, rs) + pods := test.Pods(3, test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: rs.Name, + UID: rs.UID, + Controller: lo.ToPtr(true), + BlockOwnerDeletion: lo.ToPtr(true), + }, + }}}) - ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], nodeClaims[0], nodes[0], nodeClaims[1], nodes[1], nodeClaims[2], nodes[2], nodePool) - ExpectMakeNodesInitialized(ctx, env.Client, nodes[0], nodes[1], nodes[2]) + ExpectApplied(ctx, env.Client, rs, pods[0], pods[1], pods[2], nodeClaims[0], nodes[0], nodeClaims[1], nodes[1], nodeClaims[2], nodes[2], nodePool) + ExpectMakeNodesInitialized(ctx, env.Client, nodes[0], nodes[1], nodes[2]) - // bind pods to nodes - ExpectManualBinding(ctx, env.Client, pods[0], nodes[0]) - ExpectManualBinding(ctx, env.Client, pods[1], nodes[1]) - ExpectManualBinding(ctx, env.Client, pods[2], nodes[2]) + // bind pods to nodes + ExpectManualBinding(ctx, env.Client, pods[0], nodes[0]) + ExpectManualBinding(ctx, env.Client, pods[1], nodes[1]) + ExpectManualBinding(ctx, env.Client, pods[2], nodes[2]) - // inform cluster state about nodes and nodeclaims - ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{nodes[0], nodes[1], nodes[2]}, []*v1.NodeClaim{nodeClaims[0], nodeClaims[1], nodeClaims[2]}) + // inform cluster state about nodes and nodeclaims + ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{nodes[0], nodes[1], nodes[2]}, []*v1.NodeClaim{nodeClaims[0], nodeClaims[1], nodeClaims[2]}) - fakeClock.Step(10 * time.Minute) + fakeClock.Step(10 * time.Minute) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) - ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() + var wg sync.WaitGroup + ExpectToWait(&wg) + ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) + ExpectSingletonReconciled(ctx, disruptionController) + wg.Wait() - // Process the item so that the nodes can be deleted. - ExpectSingletonReconciled(ctx, queue) + // Process the item so that the nodes can be deleted. + ExpectSingletonReconciled(ctx, queue) - // Cascade any deletion of the nodeclaim to the node - ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaims[0], nodeClaims[1], nodeClaims[2]) + // Cascade any deletion of the nodeclaim to the node + ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaims[0], nodeClaims[1], nodeClaims[2]) - // three nodeclaims should be replaced with a single nodeclaim - Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) - Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1)) - ExpectNotFound(ctx, env.Client, nodeClaims[0], nodes[0], nodeClaims[1], nodes[1], nodeClaims[2], nodes[2]) - }, + // three nodeclaims should be replaced with a single nodeclaim + Expect(ExpectNodeClaims(ctx, env.Client)).To(HaveLen(1)) + Expect(ExpectNodes(ctx, env.Client)).To(HaveLen(1)) + ExpectNotFound(ctx, env.Client, nodeClaims[0], nodes[0], nodeClaims[1], nodes[1], nodeClaims[2], nodes[2]) + }, Entry("if the candidate is on-demand node", false), Entry("if the candidate is spot node", true), ) @@ -4000,7 +3906,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -4073,7 +3979,7 @@ var _ = Describe("Consolidation", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -4404,7 +4310,7 @@ var _ = Describe("Consolidation", func() { fakeClock.SetTime(time.Now()) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -4513,7 +4419,7 @@ var _ = Describe("Consolidation", func() { // consolidation won't delete the old node until the new node is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -4596,11 +4502,7 @@ var _ = Describe("Consolidation", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{nodes[0], nodes[1], nodes[2]}, []*v1.NodeClaim{nodeClaims[0], nodeClaims[1], nodeClaims[2]}) fakeClock.Step(10 * time.Minute) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // our nodes are already the cheapest available, so we can't replace them. If we delete, it would // violate the anti-affinity rule, so we can't do anything. @@ -4649,7 +4551,7 @@ var _ = Describe("Consolidation", func() { // Run the processing loop in parallel in the background with environment context var wg sync.WaitGroup ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) go func() { defer GinkgoRecover() _, _ = disruptionController.Reconcile(ctx) @@ -4733,9 +4635,6 @@ var _ = Describe("Consolidation", func() { // Trigger a reconciliation run which should take into account the deleting node // consolidation shouldn't trigger additional actions fakeClock.Step(10 * time.Minute) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) - result, err := disruptionController.Reconcile(ctx) Expect(err).ToNot(HaveOccurred()) Expect(result.RequeueAfter).To(BeNumerically(">", 0)) diff --git a/pkg/controllers/disruption/drift_test.go b/pkg/controllers/disruption/drift_test.go index 5b5e4c987c..3f9375fb40 100644 --- a/pkg/controllers/disruption/drift_test.go +++ b/pkg/controllers/disruption/drift_test.go @@ -95,11 +95,7 @@ var _ = Describe("Drift", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - wg := sync.WaitGroup{} - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - ExpectMetricGaugeValue(disruption.EligibleNodesGauge, 0, eligibleNodesLabels) // remove the do-not-disrupt annotation to make the node eligible for drift and update cluster state @@ -108,10 +104,7 @@ var _ = Describe("Drift", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - ExpectMetricGaugeValue(disruption.EligibleNodesGauge, 1, eligibleNodesLabels) }) }) @@ -156,11 +149,7 @@ var _ = Describe("Drift", func() { } // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ "nodepool": nodePool.Name, @@ -199,11 +188,7 @@ var _ = Describe("Drift", func() { } // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ "nodepool": nodePool.Name, @@ -242,12 +227,7 @@ var _ = Describe("Drift", func() { } // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ "nodepool": nodePool.Name, }) @@ -315,13 +295,10 @@ var _ = Describe("Drift", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) // Reconcile 5 times, enqueuing 3 commands total. for i := 0; i < 5; i++ { ExpectSingletonReconciled(ctx, disruptionController) } - wg.Wait() nodes = ExpectNodes(ctx, env.Client) Expect(len(lo.Filter(nodes, func(nc *corev1.Node, _ int) bool { @@ -386,11 +363,7 @@ var _ = Describe("Drift", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() for _, np := range nps { metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ @@ -451,11 +424,7 @@ var _ = Describe("Drift", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() for _, np := range nps { metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ @@ -470,7 +439,6 @@ var _ = Describe("Drift", func() { Expect(len(ExpectNodeClaims(ctx, env.Client))).To(Equal(0)) }) }) - Context("Drift", func() { It("should continue to the next drifted node if the first cannot reschedule all pods", func() { pod := test.Pod(test.PodOptions{ @@ -519,7 +487,6 @@ var _ = Describe("Drift", func() { // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -652,12 +619,7 @@ var _ = Describe("Drift", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - // Process the item so that the nodes can be deleted. ExpectSingletonReconciled(ctx, queue) // Cascade any deletion of the nodeClaim to the node @@ -696,11 +658,7 @@ var _ = Describe("Drift", func() { // inform cluster state about nodes and nodeClaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // Process the item so that the nodes can be deleted. ExpectSingletonReconciled(ctx, queue) @@ -745,7 +703,6 @@ var _ = Describe("Drift", func() { // disruption won't delete the old nodeClaim until the new nodeClaim is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -799,11 +756,12 @@ var _ = Describe("Drift", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectNewNodeClaimsDeleted(ctx, env.Client, &wg, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() + // Wait > 5 seconds for eventual consistency hack in orchestration.Queue + fakeClock.Step(5*time.Second + time.Nanosecond*1) ExpectSingletonReconciled(ctx, queue) // We should have tried to create a new nodeClaim but failed to do so; therefore, we untainted the existing node node = ExpectExists(ctx, env.Client, node) @@ -889,7 +847,6 @@ var _ = Describe("Drift", func() { // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 3) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -965,7 +922,6 @@ var _ = Describe("Drift", func() { // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -989,12 +945,7 @@ var _ = Describe("Drift", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - // Process the item so that the nodes can be deleted. ExpectSingletonReconciled(ctx, queue) // Cascade any deletion of the nodeClaim to the node diff --git a/pkg/controllers/disruption/emptiness_test.go b/pkg/controllers/disruption/emptiness_test.go index 447958e960..e0ea86f93b 100644 --- a/pkg/controllers/disruption/emptiness_test.go +++ b/pkg/controllers/disruption/emptiness_test.go @@ -80,7 +80,7 @@ var _ = Describe("Emptiness", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -91,11 +91,7 @@ var _ = Describe("Emptiness", func() { ExpectApplied(ctx, env.Client, node, nodeClaim, nodePool) ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // We get six calls here because we have Nodes and NodeClaims that fired for this event // and each of the consolidation mechanisms specifies that this event should be fired @@ -116,22 +112,12 @@ var _ = Describe("Emptiness", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - wg := sync.WaitGroup{} - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - ExpectMetricGaugeValue(disruption.EligibleNodesGauge, 0, eligibleNodesEmptinessLabels) - // delete pod and update cluster state, node should now be disruptable ExpectDeleted(ctx, env.Client, pod) ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) - - fakeClock.Step(10 * time.Minute) - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - ExpectMetricGaugeValue(disruption.EligibleNodesGauge, 1, eligibleNodesEmptinessLabels) }) }) @@ -169,12 +155,7 @@ var _ = Describe("Emptiness", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ "nodepool": nodePool.Name, }) @@ -215,11 +196,7 @@ var _ = Describe("Emptiness", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ "nodepool": nodePool.Name, @@ -261,12 +238,7 @@ var _ = Describe("Emptiness", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ "nodepool": nodePool.Name, }) @@ -329,11 +301,7 @@ var _ = Describe("Emptiness", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() for _, np := range nps { metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ @@ -398,12 +366,7 @@ var _ = Describe("Emptiness", func() { // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - for _, np := range nps { metric, found := FindMetricWithLabelValues("karpenter_disruption_budgets_allowed_disruptions", map[string]string{ "nodepool": np.Name, @@ -425,11 +388,7 @@ var _ = Describe("Emptiness", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - wg := sync.WaitGroup{} - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() - ExpectSingletonReconciled(ctx, queue) // Cascade any deletion of the nodeClaim to the node ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim) diff --git a/pkg/controllers/disruption/suite_test.go b/pkg/controllers/disruption/suite_test.go index 8be0b58923..78f85efa7f 100644 --- a/pkg/controllers/disruption/suite_test.go +++ b/pkg/controllers/disruption/suite_test.go @@ -185,10 +185,8 @@ var _ = Describe("Simulate Scheduling", func() { for i := 0; i < numNodes; i++ { ExpectApplied(ctx, env.Client, nodeClaims[i], nodes[i]) } - // inform cluster state about nodes and nodeclaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims) - pod := test.Pod(test.PodOptions{ ResourceRequirements: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ @@ -202,9 +200,6 @@ var _ = Describe("Simulate Scheduling", func() { ExpectApplied(ctx, env.Client, pod) ExpectManualBinding(ctx, env.Client, pod, nodes[0]) - // nodePool.Spec.Disruption.Budgets = []v1.Budget{{Nodes: "100%"}} - // ExpectApplied(ctx, env.Client, nodePool) - nodePoolMap, nodePoolToInstanceTypesMap, err := disruption.BuildNodePoolMap(ctx, env.Client, cloudProvider) Expect(err).To(Succeed()) @@ -306,11 +301,7 @@ var _ = Describe("Simulate Scheduling", func() { nodeClaimNames := lo.SliceToMap(nodeClaims, func(nc *v1.NodeClaim) (string, struct{}) { return nc.Name, struct{}{} }) - - wg := sync.WaitGroup{} - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // Expect a replace action ExpectTaintedNodeCount(ctx, env.Client, 1) @@ -325,10 +316,7 @@ var _ = Describe("Simulate Scheduling", func() { // which needs to be deployed ExpectNodeClaimDeployedAndStateUpdated(ctx, env.Client, cluster, cloudProvider, nc) nodeClaimNames[nc.Name] = struct{}{} - - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // Another replacement disruption action ncs = ExpectNodeClaims(ctx, env.Client) @@ -341,9 +329,7 @@ var _ = Describe("Simulate Scheduling", func() { ExpectNodeClaimDeployedAndStateUpdated(ctx, env.Client, cluster, cloudProvider, nc) nodeClaimNames[nc.Name] = struct{}{} - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() // One more replacement disruption action ncs = ExpectNodeClaims(ctx, env.Client) @@ -357,9 +343,7 @@ var _ = Describe("Simulate Scheduling", func() { nodeClaimNames[nc.Name] = struct{}{} // Try one more time, but fail since the budgets only allow 3 disruptions. - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() ncs = ExpectNodeClaims(ctx, env.Client) Expect(len(ncs)).To(Equal(13)) @@ -438,7 +422,7 @@ var _ = Describe("Simulate Scheduling", func() { // disruption won't delete the old node until the new node is ready var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -535,16 +519,7 @@ var _ = Describe("Disruption Taints", func() { // inform cluster state about nodes and nodeClaims ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) - - // Trigger the reconcile loop to start but don't trigger the verify action - wg := sync.WaitGroup{} - wg.Add(1) - go func() { - defer wg.Done() - ExpectTriggerVerifyAction(&wg) - ExpectSingletonReconciled(ctx, disruptionController) - }() - wg.Wait() + ExpectSingletonReconciled(ctx, disruptionController) node = ExpectNodeExists(ctx, env.Client, node.Name) Expect(node.Spec.Taints).ToNot(ContainElement(v1.DisruptionNoScheduleTaint)) }) @@ -569,7 +544,7 @@ var _ = Describe("Disruption Taints", func() { wg.Add(1) go func() { defer wg.Done() - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) }() @@ -1796,11 +1771,7 @@ var _ = Describe("Metrics", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() ExpectMetricCounterValue(disruption.ActionsPerformedCounter, 1, map[string]string{ "action": "delete", @@ -1850,11 +1821,7 @@ var _ = Describe("Metrics", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{nodes[0], nodes[1]}, []*v1.NodeClaim{nodeClaims[0], nodeClaims[1]}) fakeClock.Step(10 * time.Minute) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() ExpectMetricCounterValue(disruption.ActionsPerformedCounter, 1, map[string]string{ "action": "delete", @@ -1904,11 +1871,7 @@ var _ = Describe("Metrics", func() { ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*corev1.Node{node}, []*v1.NodeClaim{nodeClaim}) fakeClock.Step(10 * time.Minute) - - var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) ExpectSingletonReconciled(ctx, disruptionController) - wg.Wait() ExpectMetricCounterValue(disruption.ActionsPerformedCounter, 1, map[string]string{ "action": "replace", @@ -1950,7 +1913,7 @@ var _ = Describe("Metrics", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2021,7 +1984,7 @@ var _ = Describe("Metrics", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2093,7 +2056,7 @@ var _ = Describe("Metrics", func() { fakeClock.Step(10 * time.Minute) var wg sync.WaitGroup - ExpectTriggerVerifyAction(&wg) + ExpectToWait(&wg) ExpectSingletonReconciled(ctx, disruptionController) wg.Wait() @@ -2144,20 +2107,18 @@ func fromInt(i int) *intstr.IntOrString { // This continually polls the wait group to see if there // is a timer waiting, incrementing the clock if not. -// If you're seeing goroutine timeouts on suite tests, it's possible -// another timer was added, or the computation required for a loop is taking more than -// 20 * 400 milliseconds = 8s to complete, potentially requiring an increase in the -// duration of the polling period. -func ExpectTriggerVerifyAction(wg *sync.WaitGroup) { +func ExpectToWait(wg *sync.WaitGroup) { wg.Add(1) + Expect(fakeClock.HasWaiters()).To(BeFalse()) go func() { + defer GinkgoRecover() defer wg.Done() - for i := 0; i < 20; i++ { - time.Sleep(400 * time.Millisecond) - if fakeClock.HasWaiters() { - break - } - } + Eventually(func() bool { return fakeClock.HasWaiters() }). + // Caution: if another go routine takes longer than this timeout to + // wait on the clock, we will deadlock until the test suite timeout + WithTimeout(10 * time.Second). + WithPolling(10 * time.Millisecond). + Should(BeTrue()) fakeClock.Step(45 * time.Second) }() }