From 296c2b66794f5990dbfeb00b030282efe9874b3a Mon Sep 17 00:00:00 2001
From: Jesus Carrillo
Date: Fri, 7 Jul 2023 10:34:30 -0700
Subject: [PATCH] Node Drain Logic: Allow users to force node drain

If a node runs pods that declare no controller, the drain helper refuses
to delete them, so the agent fails to drain the node and crashes:

F0706 23:17:17.114069 3022288 main.go:88] Error running agent: processing: getting pods for deletion: [cannot delete Pods declare no controller (use --force to override): test/test-iscsi]

This PR adds a --force-drain flag that lets users force the drain in this
situation (an illustrative sketch of the underlying drain behaviour follows
the diff).
---
 cmd/update-agent/main.go |  2 ++
 pkg/agent/agent.go       |  9 +++++---
 pkg/agent/agent_test.go  | 46 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/cmd/update-agent/main.go b/cmd/update-agent/main.go
index 1f49f6949..bb58390ed 100644
--- a/cmd/update-agent/main.go
+++ b/cmd/update-agent/main.go
@@ -27,6 +27,7 @@ var (
 
     reapTimeout = flag.Int("grace-period", defaultGracePeriodSeconds,
         "Period of time in seconds given to a pod to terminate when rebooting for an update")
+    forceNodeDrain = flag.Bool("force-drain", false, "Force removal of pods with custom or no owners while draining node")
 )
 
 func main() {
@@ -74,6 +75,7 @@ func main() {
         Clientset:      clientset,
         StatusReceiver: updateEngineClient,
         Rebooter:       rebooter,
+        ForceNodeDrain: *forceNodeDrain,
     }
 
     agent, err := agent.New(config)
diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go
index 2ca395ed1..75f7febc7 100644
--- a/pkg/agent/agent.go
+++ b/pkg/agent/agent.go
@@ -34,6 +34,7 @@ import (
 type Config struct {
     NodeName               string
     PodDeletionGracePeriod time.Duration
+    ForceNodeDrain         bool
     Clientset              kubernetes.Interface
     StatusReceiver         StatusReceiver
     Rebooter               Rebooter
@@ -65,6 +66,7 @@ type klocksmith struct {
     ue                      StatusReceiver
     lc                      Rebooter
     reapTimeout             time.Duration
+    forceNodeDrain          bool
     hostFilesPrefix         string
     pollInterval            time.Duration
     maxOperatorResponseTime time.Duration
@@ -114,6 +116,7 @@ func New(config *Config) (Klocksmith, error) {
         ue:                      config.StatusReceiver,
         lc:                      config.Rebooter,
         reapTimeout:             config.PodDeletionGracePeriod,
+        forceNodeDrain:          config.ForceNodeDrain,
         hostFilesPrefix:         config.HostFilesPrefix,
         pollInterval:            pollInterval,
         maxOperatorResponseTime: maxOperatorResponseTime,
@@ -269,7 +272,7 @@ func (k *klocksmith) process(ctx context.Context) error {
         klog.Info("Node already marked as unschedulable")
     }
 
-    drainer := newDrainer(ctx, k.clientset, k.reapTimeout)
+    drainer := newDrainer(ctx, k.clientset, k.reapTimeout, k.forceNodeDrain)
 
     klog.Info("Getting pod list for deletion")
 
@@ -461,11 +464,11 @@ type drainer interface {
     DeleteOrEvictPods([]corev1.Pod) error
 }
 
-func newDrainer(ctx context.Context, cs kubernetes.Interface, timeout time.Duration) drainer {
+func newDrainer(ctx context.Context, cs kubernetes.Interface, timeout time.Duration, forceNodeDrain bool) drainer {
     return &drain.Helper{
         Ctx:                ctx,
         Client:             cs,
-        Force:              false,
+        Force:              forceNodeDrain,
         GracePeriodSeconds: -1,
         Timeout:            timeout,
         // Explicitly don't terminate self? we'll probably just be a
diff --git a/pkg/agent/agent_test.go b/pkg/agent/agent_test.go
index 44560b2f6..2b9960421 100644
--- a/pkg/agent/agent_test.go
+++ b/pkg/agent/agent_test.go
@@ -931,6 +931,52 @@ func Test_Running_agent(t *testing.T) {
         })
     })
 
+    t.Run("removes_pod_without_owner_when_force_drain_is_configured", func(t *testing.T) {
+        t.Parallel()
+
+        rebootTriggerred := make(chan bool)
+
+        podsToCreate := []*corev1.Pod{
+            {
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:      "foo",
+                    Namespace: "default",
+                },
+                Spec: corev1.PodSpec{
+                    NodeName: testNode().Name,
+                },
+            },
+        }
+
+        fakeClient := fake.NewSimpleClientset(podsToCreate[0], testNode())
+        addEvictionSupport(t, fakeClient)
+
+        testConfig, node, _ := validTestConfig(t, testNode())
+        testConfig.ForceNodeDrain = true
+        testConfig.Clientset = fakeClient
+        testConfig.Rebooter = &mockRebooter{
+            rebootF: func(auth bool) {
+                rebootTriggerred <- auth
+            },
+        }
+
+        ctx := contextWithTimeout(t, agentRunTimeLimit)
+
+        assertNodeProperty(ctx, t, &assertNodePropertyContext{
+            done:   runAgent(ctx, t, testConfig),
+            config: testConfig,
+            testF:  assertNodeAnnotationValue(constants.AnnotationRebootNeeded, constants.True),
+        })
+
+        okToReboot(ctx, t, testConfig.Clientset.CoreV1().Nodes(), node.Name)
+
+        select {
+        case <-ctx.Done():
+            t.Fatal("Timed out waiting for reboot to be triggered")
+        case <-rebootTriggerred:
+        }
+    })
+
     t.Run("after_draining_node", func(t *testing.T) {
         t.Parallel()
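Note for reviewers (not part of the patch): the sketch below is an illustrative, standalone Go program showing the behaviour that ForceNodeDrain toggles in the agent's drainer, using the same k8s.io/kubectl/pkg/drain helper. The kubeconfig handling and the "my-node" node name are placeholders. With Force set to false, GetPodsForDeletion reports an error for pods that declare no controller; with Force set to true, such pods are included in the deletion list and only flagged with a warning.

// Illustrative sketch only; not part of this patch.
package main

import (
    "context"
    "fmt"
    "os"
    "time"

    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
    "k8s.io/kubectl/pkg/drain"
)

func main() {
    // Build a client from $KUBECONFIG (placeholder configuration handling).
    restCfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
    if err != nil {
        fmt.Fprintln(os.Stderr, "building config:", err)
        os.Exit(1)
    }

    helper := &drain.Helper{
        Ctx:                context.Background(),
        Client:             kubernetes.NewForConfigOrDie(restCfg),
        Force:              true, // the value the new --force-drain flag feeds into the agent's drainer
        GracePeriodSeconds: -1,
        Timeout:            5 * time.Minute,
        Out:                os.Stdout,
        ErrOut:             os.Stderr,
    }

    // "my-node" is a placeholder node name.
    pods, errs := helper.GetPodsForDeletion("my-node")
    if len(errs) > 0 {
        // With Force set to false, pods without a controller end up here
        // instead of in the deletion list.
        fmt.Fprintln(os.Stderr, "pods blocked from deletion:", errs)
        os.Exit(1)
    }

    fmt.Printf("%d pods selected for deletion\n", len(pods.Pods()))
}

Note that drain.Helper's Force only affects pods without a controller; evictions blocked by a PodDisruptionBudget are handled separately and retried until the drain Timeout expires.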