Mirror of https://github.com/kubernetes-sigs/descheduler.git, synced 2026-01-26 05:14:13 +01:00
feat: Implement preferredDuringSchedulingIgnoredDuringExecution for RemovePodsViolatingNodeAffinity (#1210)
* feat: Implement preferredDuringSchedulingIgnoredDuringExecution for RemovePodsViolatingNodeAffinity

The descheduler can now detect and evict pods that are not optimally placed according to their "preferred..." node affinity. It only evicts a pod if another node where the pod could run scores higher for the pod's preferred node affinity than the node it currently runs on. This behavior is activated by enabling the RemovePodsViolatingNodeAffinity plugin and passing "preferredDuringSchedulingIgnoredDuringExecution" in its args.

For example, imagine a pod that prefers nodes labeled "key1: value1" with a weight of 10. If this pod is scheduled on a node that lacks the "key1: value1" label, but another node both has the label and can host the pod, the descheduler will evict the pod.

Another effect of this commit is that the RemovePodsViolatingNodeAffinity plugin no longer removes pods that do not fit on their current node for reasons other than violating node affinity. Previously, enabling this plugin could evict pods that were running on tainted nodes without the necessary tolerations.

This commit also fixes the wording of several tests in node_affinity_test.go, along with some test parameters and expectations that were incorrect.

* Optimization on RemovePodsViolatingNodeAffinity

Before checking whether a pod can be evicted or scheduled elsewhere, we first check whether it defines the corresponding nodeAffinity field; if not, the pod is immediately discarded as a candidate. In addition, the method that calculates the weight a pod gives to a node based on its preferred node affinity has been renamed to better reflect what it does.
Committed by GitHub
Parent: 1be0ab2bd1
Commit: 31704047c5
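To make the eviction rule in the commit message concrete, here is a minimal, hypothetical Go sketch (not part of this commit), written as if it lived in the same package as the helpers added in the diff below. The function name shouldEvictForPreferredAffinity and the otherFeasibleNodes parameter are illustrative, and filtering nodes for feasibility (taints, resources, required affinity) is assumed to happen elsewhere. The strict comparison mirrors the commit message: a pod is a candidate only when another node where it can run scores higher than its current node.

// Hypothetical sketch of the eviction decision; names are illustrative, not part of the commit.
func shouldEvictForPreferredAffinity(pod *v1.Pod, current *v1.Node, otherFeasibleNodes []*v1.Node) bool {
	if !PodHasNodeAffinity(pod, PreferredDuringSchedulingIgnoredDuringExecution) {
		return false
	}
	currentWeight, err := GetNodeWeightGivenPodPreferredAffinity(pod, current)
	if err != nil {
		return false
	}
	for _, node := range otherFeasibleNodes {
		weight, err := GetNodeWeightGivenPodPreferredAffinity(pod, node)
		if err != nil {
			continue
		}
		if weight > currentWeight {
			// Some other node the pod fits on is strictly preferred by its soft affinity.
			return true
		}
	}
	return false
}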
@@ -213,3 +213,28 @@ func PodToleratesTaints(pod *v1.Pod, taintsOfNodes map[string][]v1.Taint) bool {
	}
	return false
}

// PodHasNodeAffinity returns true if the pod has a node affinity of type
// `nodeAffinityType` defined. The nodeAffinityType param can take one of these two values:
// "requiredDuringSchedulingIgnoredDuringExecution" or "preferredDuringSchedulingIgnoredDuringExecution"
func PodHasNodeAffinity(pod *v1.Pod, nodeAffinityType NodeAffinityType) bool {
	if pod.Spec.Affinity == nil {
		return false
	}
	if pod.Spec.Affinity.NodeAffinity == nil {
		return false
	}
	if nodeAffinityType == RequiredDuringSchedulingIgnoredDuringExecution {
		return pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil
	} else if nodeAffinityType == PreferredDuringSchedulingIgnoredDuringExecution {
		return len(pod.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution) > 0
	}
	return false
}

type NodeAffinityType string

const (
	RequiredDuringSchedulingIgnoredDuringExecution  NodeAffinityType = "requiredDuringSchedulingIgnoredDuringExecution"
	PreferredDuringSchedulingIgnoredDuringExecution NodeAffinityType = "preferredDuringSchedulingIgnoredDuringExecution"
)

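As a hypothetical usage sketch of the early discard described in the optimization note above (not part of the commit), a caller could filter pods with PodHasNodeAffinity before scoring any nodes; the pods slice here is an illustrative placeholder for whatever pod list the caller iterates over.

// Hypothetical precheck: skip pods that define no soft node affinity before scoring nodes.
for _, pod := range pods {
	if !PodHasNodeAffinity(pod, PreferredDuringSchedulingIgnoredDuringExecution) {
		continue // nothing to evaluate for this pod
	}
	// ... score candidate nodes for this pod ...
}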
@@ -275,3 +275,27 @@ func TolerationsEqual(t1, t2 []v1.Toleration) bool {
	}
	return true
}

// GetNodeWeightGivenPodPreferredAffinity returns the weight that the pod gives to a node
// by analyzing the soft node affinity of that pod
// (nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution).
func GetNodeWeightGivenPodPreferredAffinity(pod *v1.Pod, node *v1.Node) (int32, error) {
	if !PodHasNodeAffinity(pod, PreferredDuringSchedulingIgnoredDuringExecution) {
		return 0, nil
	}
	// Iterate over each PreferredSchedulingTerm and check if it matches the current node labels.
	// If so, add the weight of the PreferredSchedulingTerm to the sum of weights. With that, we'll know
	// the weight that the nodeAffinity of this pod gives to this node.
	var sumWeights int32 = 0
	for _, prefSchedulTerm := range pod.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
		preferredNodeSelector := &v1.NodeSelector{NodeSelectorTerms: []v1.NodeSelectorTerm{prefSchedulTerm.Preference}}
		match, err := corev1.MatchNodeSelectorTerms(node, preferredNodeSelector)
		if err != nil {
			klog.ErrorS(err, "error parsing node selector", "selector", preferredNodeSelector)
		}
		if match {
			sumWeights += prefSchedulTerm.Weight
		}
	}
	return sumWeights, nil
}

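A quick worked example of the additive scoring (hypothetical, written as if it sat in the test file added below so it can reuse the createNodeSelectorTerm helper): a pod that prefers "key1: value1" with weight 10 and "key2: value2" with weight 5 scores 10 on a node carrying only the first label, and 15 on a node carrying both, which is what the tests below assert.

// Hypothetical worked example, not part of the commit. Every matching
// PreferredSchedulingTerm contributes its weight to the node's score.
pod := &v1.Pod{}
pod.Spec.Affinity = &v1.Affinity{
	NodeAffinity: &v1.NodeAffinity{
		PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{
			{Weight: 10, Preference: createNodeSelectorTerm("key1", "value1")},
			{Weight: 5, Preference: createNodeSelectorTerm("key2", "value2")},
		},
	},
}
node := &v1.Node{ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"key1": "value1"}}}
weight, _ := GetNodeWeightGivenPodPreferredAffinity(pod, node) // 10: only the first term matches
_ = weight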
@@ -5,6 +5,7 @@ import (
	"testing"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestUniqueSortTolerations(t *testing.T) {
@@ -938,3 +939,105 @@ func TestNodeSelectorTermsEqual(t *testing.T) {
		})
	}
}

func createNodeSelectorTerm(key, value string) v1.NodeSelectorTerm {
	return v1.NodeSelectorTerm{
		MatchExpressions: []v1.NodeSelectorRequirement{
			{
				Key:      key,
				Operator: "In",
				Values:   []string{value},
			},
		},
	}
}

func TestPodNodeAffinityWeight(t *testing.T) {
	defaultNode := v1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Labels: map[string]string{
				"key1": "value1",
				"key2": "value2",
				"key3": "value3",
			},
		},
	}
	tests := []struct {
		name           string
		affinity       *v1.Affinity
		expectedWeight int32
	}{
		{
			name:           "No affinity",
			affinity:       nil,
			expectedWeight: 0,
		},
		{
			name:           "No node affinity",
			affinity:       &v1.Affinity{},
			expectedWeight: 0,
		},
		{
			name: "Empty preferred node affinity, but matching required node affinity",
			affinity: &v1.Affinity{
				NodeAffinity: &v1.NodeAffinity{
					RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
						NodeSelectorTerms: []v1.NodeSelectorTerm{
							createNodeSelectorTerm("key1", "value1"),
						},
					},
				},
			},
			expectedWeight: 0,
		},
		{
			name: "Matching single key in preferred node affinity",
			affinity: &v1.Affinity{
				NodeAffinity: &v1.NodeAffinity{
					PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{
						{
							Weight:     10,
							Preference: createNodeSelectorTerm("key1", "value1"),
						},
						{
							Weight:     5,
							Preference: createNodeSelectorTerm("key1", "valueX"),
						},
					},
				},
			},
			expectedWeight: 10,
		},
		{
			name: "Matching two keys in preferred node affinity",
			affinity: &v1.Affinity{
				NodeAffinity: &v1.NodeAffinity{
					PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{
						{
							Weight:     10,
							Preference: createNodeSelectorTerm("key1", "value1"),
						},
						{
							Weight:     5,
							Preference: createNodeSelectorTerm("key2", "value2"),
						},
					},
				},
			},
			expectedWeight: 15,
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			pod := v1.Pod{}
			pod.Spec.Affinity = test.affinity
			totalWeight, err := GetNodeWeightGivenPodPreferredAffinity(&pod, &defaultNode)
			if err != nil {
				t.Error("Found non nil error")
			}
			if totalWeight != test.expectedWeight {
				t.Errorf("Expected total weight is %v but actual total weight is %v", test.expectedWeight, totalWeight)
			}
		})
	}
}