From 296c2b66794f5990dbfeb00b030282efe9874b3a Mon Sep 17 00:00:00 2001
From: Jesus Carrillo
Date: Fri, 7 Jul 2023 10:34:30 -0700
Subject: [PATCH] Node Drain Logic: Allow users to force node drain

If a node runs pods that declare no controller, the drain helper refuses
to delete them, so the agent fails to drain the node and crashes:

F0706 23:17:17.114069 3022288 main.go:88] Error running agent: processing: getting pods for deletion: [cannot delete Pods declare no controller (use --force to override): test/test-iscsi]

This PR adds a --force-drain flag that lets users force the drain in this
situation (an illustrative sketch of the underlying drain behaviour follows
the diff).
---
 cmd/update-agent/main.go |  2 ++
 pkg/agent/agent.go       |  9 +++++---
 pkg/agent/agent_test.go  | 46 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/cmd/update-agent/main.go b/cmd/update-agent/main.go
index 1f49f6949..bb58390ed 100644
--- a/cmd/update-agent/main.go
+++ b/cmd/update-agent/main.go
@@ -27,6 +27,7 @@ var (
 
     reapTimeout = flag.Int("grace-period", defaultGracePeriodSeconds,
         "Period of time in seconds given to a pod to terminate when rebooting for an update")
+    forceNodeDrain = flag.Bool("force-drain", false, "Force removal of pods with custom or no owners while draining node")
 )
 
 func main() {
@@ -74,6 +75,7 @@ func main() {
         Clientset:      clientset,
         StatusReceiver: updateEngineClient,
         Rebooter:       rebooter,
+        ForceNodeDrain: *forceNodeDrain,
     }
 
     agent, err := agent.New(config)
diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go
index 2ca395ed1..75f7febc7 100644
--- a/pkg/agent/agent.go
+++ b/pkg/agent/agent.go
@@ -34,6 +34,7 @@ import (
 type Config struct {
     NodeName               string
     PodDeletionGracePeriod time.Duration
+    ForceNodeDrain         bool
     Clientset              kubernetes.Interface
     StatusReceiver         StatusReceiver
     Rebooter               Rebooter
@@ -65,6 +66,7 @@ type klocksmith struct {
     ue                      StatusReceiver
     lc                      Rebooter
     reapTimeout             time.Duration
+    forceNodeDrain          bool
     hostFilesPrefix         string
     pollInterval            time.Duration
     maxOperatorResponseTime time.Duration
@@ -114,6 +116,7 @@ func New(config *Config) (Klocksmith, error) {
         ue:                      config.StatusReceiver,
         lc:                      config.Rebooter,
         reapTimeout:             config.PodDeletionGracePeriod,
+        forceNodeDrain:          config.ForceNodeDrain,
         hostFilesPrefix:         config.HostFilesPrefix,
         pollInterval:            pollInterval,
         maxOperatorResponseTime: maxOperatorResponseTime,
@@ -269,7 +272,7 @@ func (k *klocksmith) process(ctx context.Context) error {
         klog.Info("Node already marked as unschedulable")
     }
 
-    drainer := newDrainer(ctx, k.clientset, k.reapTimeout)
+    drainer := newDrainer(ctx, k.clientset, k.reapTimeout, k.forceNodeDrain)
 
     klog.Info("Getting pod list for deletion")
 
@@ -461,11 +464,11 @@ type drainer interface {
     DeleteOrEvictPods([]corev1.Pod) error
 }
 
-func newDrainer(ctx context.Context, cs kubernetes.Interface, timeout time.Duration) drainer {
+func newDrainer(ctx context.Context, cs kubernetes.Interface, timeout time.Duration, forceNodeDrain bool) drainer {
     return &drain.Helper{
         Ctx:                ctx,
         Client:             cs,
-        Force:              false,
+        Force:              forceNodeDrain,
         GracePeriodSeconds: -1,
         Timeout:            timeout,
         // Explicitly don't terminate self? we'll probably just be a
diff --git a/pkg/agent/agent_test.go b/pkg/agent/agent_test.go
index 44560b2f6..2b9960421 100644
--- a/pkg/agent/agent_test.go
+++ b/pkg/agent/agent_test.go
@@ -931,6 +931,52 @@ func Test_Running_agent(t *testing.T) {
         })
     })
 
+    t.Run("removes_pod_without_owner_when_force_drain_is_configured", func(t *testing.T) {
+        t.Parallel()
+
+        rebootTriggerred := make(chan bool)
+
+        podsToCreate := []*corev1.Pod{
+            {
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:      "foo",
+                    Namespace: "default",
+                },
+                Spec: corev1.PodSpec{
+                    NodeName: testNode().Name,
+                },
+            },
+        }
+
+        fakeClient := fake.NewSimpleClientset(podsToCreate[0], testNode())
+        addEvictionSupport(t, fakeClient)
+
+        testConfig, node, _ := validTestConfig(t, testNode())
+        testConfig.ForceNodeDrain = true
+        testConfig.Clientset = fakeClient
+        testConfig.Rebooter = &mockRebooter{
+            rebootF: func(auth bool) {
+                rebootTriggerred <- auth
+            },
+        }
+
+        ctx := contextWithTimeout(t, agentRunTimeLimit)
+
+        assertNodeProperty(ctx, t, &assertNodePropertyContext{
+            done:   runAgent(ctx, t, testConfig),
+            config: testConfig,
+            testF:  assertNodeAnnotationValue(constants.AnnotationRebootNeeded, constants.True),
+        })
+
+        okToReboot(ctx, t, testConfig.Clientset.CoreV1().Nodes(), node.Name)
+
+        select {
+        case <-ctx.Done():
+            t.Fatal("Timed out waiting for reboot to be triggered")
+        case <-rebootTriggerred:
+        }
+    })
+
     t.Run("after_draining_node", func(t *testing.T) {
         t.Parallel()
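Note for reviewers (not part of the patch): the sketch below is an illustrative, standalone Go program showing the behaviour that ForceNodeDrain toggles in the agent's drainer, using the same k8s.io/kubectl/pkg/drain helper. The kubeconfig handling and the "my-node" node name are placeholders. With Force set to false, GetPodsForDeletion reports an error for pods that declare no controller; with Force set to true, such pods are included in the deletion list and only flagged with a warning.

// Illustrative sketch only; not part of this patch.
package main

import (
    "context"
    "fmt"
    "os"
    "time"

    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
    "k8s.io/kubectl/pkg/drain"
)

func main() {
    // Build a client from $KUBECONFIG (placeholder configuration handling).
    restCfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
    if err != nil {
        fmt.Fprintln(os.Stderr, "building config:", err)
        os.Exit(1)
    }

    helper := &drain.Helper{
        Ctx:                context.Background(),
        Client:             kubernetes.NewForConfigOrDie(restCfg),
        Force:              true, // the value the new --force-drain flag feeds into the agent's drainer
        GracePeriodSeconds: -1,
        Timeout:            5 * time.Minute,
        Out:                os.Stdout,
        ErrOut:             os.Stderr,
    }

    // "my-node" is a placeholder node name.
    pods, errs := helper.GetPodsForDeletion("my-node")
    if len(errs) > 0 {
        // With Force set to false, pods without a controller end up here
        // instead of in the deletion list.
        fmt.Fprintln(os.Stderr, "pods blocked from deletion:", errs)
        os.Exit(1)
    }

    fmt.Printf("%d pods selected for deletion\n", len(pods.Pods()))
}

Note that drain.Helper's Force only affects pods without a controller; evictions blocked by a PodDisruptionBudget are handled separately and retried until the drain Timeout expires.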