1
0
mirror of https://github.com/kubernetes-sigs/descheduler.git synced 2026-01-26 05:14:13 +01:00

Take node's taints into consideration when balancing domains

This commit is contained in:
Amir Alavi
2021-05-13 23:16:07 -04:00
parent 9b26abd538
commit 24c0ca2ef9
2 changed files with 178 additions and 20 deletions

View File

@@ -19,6 +19,7 @@ package strategies
import (
"context"
"fmt"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"math"
"sort"
@@ -283,20 +284,11 @@ func balanceDomains(
// also (just for tracking), add them to the list of pods in the lower topology
aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:]
for k := range aboveToEvict {
// if the pod has a hard nodeAffinity or nodeSelector that only matches this node,
// don't bother evicting it as it will just end up back on the same node
// however we still account for it "being evicted" so the algorithm can complete
// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
// but still try to get the required # of movePods (instead of just chopping that value off the slice above)
isRequiredDuringSchedulingIgnoredDuringExecution := aboveToEvict[k].Spec.Affinity != nil &&
aboveToEvict[k].Spec.Affinity.NodeAffinity != nil &&
aboveToEvict[k].Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil
if (aboveToEvict[k].Spec.NodeSelector != nil || isRequiredDuringSchedulingIgnoredDuringExecution) &&
nodesPodFitsOnBesidesCurrent(aboveToEvict[k], nodeMap) == 0 {
klog.V(2).InfoS("Ignoring pod for eviction due to node selector/affinity", "pod", klog.KObj(aboveToEvict[k]))
if err := validatePodFitsOnOtherNodes(aboveToEvict[k], nodeMap); err != nil {
klog.V(2).InfoS(fmt.Sprintf("ignoring pod for eviction due to: %s", err.Error()), "pod", klog.KObj(aboveToEvict[k]))
continue
}
podsForEviction[aboveToEvict[k]] = struct{}{}
}
sortedDomains[j].pods = sortedDomains[j].pods[:len(sortedDomains[j].pods)-movePods]
@@ -304,18 +296,54 @@ func balanceDomains(
}
}
// nodesPodFitsOnBesidesCurrent counts the number of nodes this pod could fit on based on its affinity
// validatePodFitsOnOtherNodes performs validation based on scheduling predicates for affinity and toleration.
// It excludes the current node because, for the sake of domain balancing only, we care about if there is any other
// place it could theoretically fit.
// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading
func nodesPodFitsOnBesidesCurrent(pod *v1.Pod, nodeMap map[string]*v1.Node) int {
count := 0
func validatePodFitsOnOtherNodes(pod *v1.Pod, nodeMap map[string]*v1.Node) error {
// if the pod has a hard nodeAffinity/nodeSelector/toleration that only matches this node,
// don't bother evicting it as it will just end up back on the same node
// however we still account for it "being evicted" so the algorithm can complete
// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
// but still try to get the required # of movePods (instead of just chopping that value off the slice above)
isRequiredDuringSchedulingIgnoredDuringExecution := pod.Spec.Affinity != nil &&
pod.Spec.Affinity.NodeAffinity != nil &&
pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil
hardTaintsFilter := func(taint *v1.Taint) bool {
return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
}
var eligibleNodesCount, ineligibleAffinityNodesCount, ineligibleTaintedNodesCount int
for _, node := range nodeMap {
if nodeutil.PodFitsCurrentNode(pod, node) && node != nodeMap[pod.Spec.NodeName] {
count++
if node == nodeMap[pod.Spec.NodeName] {
continue
}
if pod.Spec.NodeSelector != nil || isRequiredDuringSchedulingIgnoredDuringExecution {
if !nodeutil.PodFitsCurrentNode(pod, node) {
ineligibleAffinityNodesCount++
continue
}
}
return count
if !utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, hardTaintsFilter) {
ineligibleTaintedNodesCount++
continue
}
eligibleNodesCount++
}
if eligibleNodesCount == 0 {
var errs []error
if ineligibleAffinityNodesCount > 0 {
errs = append(errs, fmt.Errorf("%d nodes with ineligible selector/affinity", ineligibleAffinityNodesCount))
}
if ineligibleTaintedNodesCount > 0 {
errs = append(errs, fmt.Errorf("%d nodes with taints that are not tolerated", ineligibleTaintedNodesCount))
}
return utilerrors.NewAggregate(errs)
}
return nil
}
// sortDomains sorts and splits the list of topology domains based on their size

View File

@@ -574,6 +574,134 @@ func TestTopologySpreadConstraint(t *testing.T) {
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{IncludeSoftConstraints: true}},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 1 pod since pod tolerates the node with taint",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
n.Labels["zone"] = "zoneB"
n.Spec.Taints = []v1.Taint{
{
Key: "taint-test",
Value: "test",
Effect: v1.TaintEffectNoSchedule,
},
}
}),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
tolerations: []v1.Toleration{
{
Key: "taint-test",
Value: "test",
Operator: v1.TolerationOpEqual,
Effect: v1.TaintEffectNoSchedule,
},
},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
nodeSelector: map[string]string{"zone": "zoneA"},
},
}),
expectedEvictedCount: 1,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 0 pods since pod does not tolerate the tainted node",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
n.Labels["zone"] = "zoneB"
n.Spec.Taints = []v1.Taint{
{
Key: "taint-test",
Value: "test",
Effect: v1.TaintEffectNoSchedule,
},
}
}),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
nodeSelector: map[string]string{"zone": "zoneA"},
},
}),
expectedEvictedCount: 0,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 1 pod for node with PreferNoSchedule Taint",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
n.Labels["zone"] = "zoneB"
n.Spec.Taints = []v1.Taint{
{
Key: "taint-test",
Value: "test",
Effect: v1.TaintEffectPreferNoSchedule,
},
}
}),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
nodeSelector: map[string]string{"zone": "zoneA"},
},
}),
expectedEvictedCount: 1,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
}
for _, tc := range testCases {
@@ -617,6 +745,7 @@ type testPodList struct {
nodeSelector map[string]string
nodeAffinity *v1.Affinity
noOwners bool
tolerations []v1.Toleration
}
func createTestPods(testPods []testPodList) []*v1.Pod {
@@ -636,6 +765,7 @@ func createTestPods(testPods []testPodList) []*v1.Pod {
p.Spec.TopologySpreadConstraints = tp.constraints
p.Spec.NodeSelector = tp.nodeSelector
p.Spec.Affinity = tp.nodeAffinity
p.Spec.Tolerations = tp.tolerations
}))
podNum++
}