diff --git a/README.md b/README.md index e6ff22ff3..f0176ce33 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ Table of Contents * [RemovePodsViolatingInterPodAntiAffinity](#removepodsviolatinginterpodantiaffinity) * [RemovePodsViolatingNodeAffinity](#removepodsviolatingnodeaffinity) * [RemovePodsViolatingNodeTaints](#removepodsviolatingnodetaints) + * [RemovePodsViolatingTopologySpreadConstraint](#removepodsviolatingtopologyspreadconstraint) * [RemovePodsHavingTooManyRestarts](#removepodshavingtoomanyrestarts) * [PodLifeTime](#podlifetime) * [Filter Pods](#filter-pods) @@ -102,17 +103,17 @@ See the [user guide](docs/user-guide.md) in the `/docs` directory. ## Policy and Strategies Descheduler's policy is configurable and includes strategies that can be enabled or disabled. -Seven strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`, -`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsHavingTooManyRestarts`, and `PodLifeTime` -are currently implemented. As part of the policy, the parameters associated with the strategies can be configured too. -By default, all strategies are enabled. +Eight strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`, +`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsViolatingTopologySpreadConstraint`, +`RemovePodsHavingTooManyRestarts`, and `PodLifeTime` are currently implemented. As part of the policy, the +parameters associated with the strategies can be configured too. By default, all strategies are enabled. The policy also includes common configuration for all the strategies: - `nodeSelector` - limiting the nodes which are processed - `evictLocalStoragePods` - allowing to evict pods with local storage - `maxNoOfPodsToEvictPerNode` - maximum number of pods evicted from each node (summed through all strategies) -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" nodeSelector: prod=dev @@ -144,7 +145,7 @@ has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considere |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -186,7 +187,7 @@ These thresholds, `thresholds` and `targetThresholds`, could be tuned as per you **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -236,7 +237,7 @@ node. **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -273,7 +274,7 @@ podA gets evicted from nodeA. **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -301,7 +302,7 @@ and will be evicted. **Example:** -```` +````yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -309,6 +310,31 @@ strategies: enabled: true ```` +### RemovePodsViolatingTopologySpreadConstraint + +This strategy makes sure that pods violating [topology spread constraints](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/) +are evicted from nodes. Specifically, it tries to evict the minimum number of pods required to balance topology domains to within each constraint's `maxSkew`. +This strategy requires k8s version 1.18 at a minimum. 
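+
+As an illustration (a hypothetical workload, not part of this repo), a Deployment whose pod template declares a hard constraint like the one below is the kind of object this strategy acts on: if placement later drifts so that the zones differ by more than `maxSkew` matching pods, the descheduler evicts just enough pods from the larger zones to restore the balance.
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: demo-app          # hypothetical name
+spec:
+  replicas: 6
+  selector:
+    matchLabels:
+      app: demo-app
+  template:
+    metadata:
+      labels:
+        app: demo-app
+    spec:
+      topologySpreadConstraints:
+      - maxSkew: 1
+        topologyKey: topology.kubernetes.io/zone
+        whenUnsatisfiable: DoNotSchedule
+        labelSelector:
+          matchLabels:
+            app: demo-app
+      containers:
+      - name: app
+        image: k8s.gcr.io/pause:3.2
+```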
+ +**Parameters:** + +|Name|Type| +|---|---| +|`thresholdPriority`|int (see [priority filtering](#priority-filtering))| +|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| +|`namespaces`|(see [namespace filtering](#namespace-filtering))| + +**Example:** + +```yaml +apiVersion: "descheduler/v1alpha1" +kind: "DeschedulerPolicy" +strategies: + "RemovePodsViolatingTopologySpreadConstraint": + enabled: true +``` + + ### RemovePodsHavingTooManyRestarts This strategy makes sure that pods having too many restarts are removed from nodes. For example a pod with EBS/PD that @@ -328,7 +354,7 @@ which determines whether init container restarts should be factored into that ca **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -359,7 +385,7 @@ to `Running` and `Pending`. **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -383,10 +409,11 @@ The following strategies accept a `namespaces` parameter which allows to specify * `RemovePodsViolatingNodeAffinity` * `RemovePodsViolatingInterPodAntiAffinity` * `RemoveDuplicates` +* `RemovePodsViolatingTopologySpreadConstraint` For example: -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -404,7 +431,7 @@ strategies: In the examples `PodLifeTime` gets executed only over `namespace1` and `namespace2`. The similar holds for `exclude` field: -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -432,7 +459,7 @@ is set to the value of `system-cluster-critical` priority class. E.g. Setting `thresholdPriority` -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -445,7 +472,7 @@ strategies: ``` Setting `thresholdPriorityClassName` -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: diff --git a/go.sum b/go.sum index 80d64c897..3aff4c8d5 100644 --- a/go.sum +++ b/go.sum @@ -144,6 +144,7 @@ github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7a github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls= github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903 h1:LbsanbbD6LieFkXbj9YNNBupiGHJgFeLpO0j0Fza1h8= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= diff --git a/kubernetes/base/rbac.yaml b/kubernetes/base/rbac.yaml index f07af760a..7a9e31fec 100644 --- a/kubernetes/base/rbac.yaml +++ b/kubernetes/base/rbac.yaml @@ -10,6 +10,9 @@ rules: - apiGroups: [""] resources: ["nodes"] verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list"] - apiGroups: [""] resources: ["pods"] verbs: ["get", "watch", "list", "delete"] diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go index dff3e62a4..a6d86fd92 100644 --- a/pkg/descheduler/descheduler.go +++ b/pkg/descheduler/descheduler.go @@ -70,13 +70,14 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer 
sharedInformerFactory.WaitForCacheSync(stopChannel) strategyFuncs := map[string]strategyFunction{ - "RemoveDuplicates": strategies.RemoveDuplicatePods, - "LowNodeUtilization": strategies.LowNodeUtilization, - "RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity, - "RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity, - "RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints, - "RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts, - "PodLifeTime": strategies.PodLifeTime, + "RemoveDuplicates": strategies.RemoveDuplicatePods, + "LowNodeUtilization": strategies.LowNodeUtilization, + "RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity, + "RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity, + "RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints, + "RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts, + "PodLifeTime": strategies.PodLifeTime, + "RemovePodsViolatingTopologySpreadConstraint": strategies.RemovePodsViolatingTopologySpreadConstraint, } nodeSelector := rs.NodeSelector diff --git a/pkg/descheduler/strategies/topologyspreadconstraint.go b/pkg/descheduler/strategies/topologyspreadconstraint.go new file mode 100644 index 000000000..ce43c54b5 --- /dev/null +++ b/pkg/descheduler/strategies/topologyspreadconstraint.go @@ -0,0 +1,355 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package strategies
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"sort"
+
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/util/sets"
+	clientset "k8s.io/client-go/kubernetes"
+	"k8s.io/klog/v2"
+	"sigs.k8s.io/descheduler/pkg/api"
+	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
+	nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
+	"sigs.k8s.io/descheduler/pkg/utils"
+)
+
+// topologyPair is a single topology domain: a topology key and one of its values (e.g. zone=zoneA)
+type topologyPair struct {
+	key   string
+	value string
+}
+
+type topology struct {
+	pair topologyPair
+	pods []*v1.Pod
+}
+
+func validateAndParseTopologySpreadParams(ctx context.Context, client clientset.Interface, params *api.StrategyParameters) (int32, sets.String, sets.String, error) {
+	var includedNamespaces, excludedNamespaces sets.String
+	if params == nil {
+		return 0, includedNamespaces, excludedNamespaces, nil
+	}
+	// At most one of include/exclude can be set
+	if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
+		return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("only one of Include/Exclude namespaces can be set")
+	}
+	if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
+		return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
+	}
+	thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, params)
+	if err != nil {
+		return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("failed to get threshold priority from strategy's params: %+v", err)
+	}
+	if params.Namespaces != nil {
+		includedNamespaces = sets.NewString(params.Namespaces.Include...)
+		excludedNamespaces = sets.NewString(params.Namespaces.Exclude...)
+	}
+
+	return thresholdPriority, includedNamespaces, excludedNamespaces, nil
+}
+
+func RemovePodsViolatingTopologySpreadConstraint(
+	ctx context.Context,
+	client clientset.Interface,
+	strategy api.DeschedulerStrategy,
+	nodes []*v1.Node,
+	podEvictor *evictions.PodEvictor,
+) {
+	thresholdPriority, includedNamespaces, excludedNamespaces, err := validateAndParseTopologySpreadParams(ctx, client, strategy.Params)
+	if err != nil {
+		klog.ErrorS(err, "Invalid RemovePodsViolatingTopologySpreadConstraint parameters")
+		return
+	}
+
+	nodeMap := make(map[string]*v1.Node, len(nodes))
+	for _, node := range nodes {
+		nodeMap[node.Name] = node
+	}
+	evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority))
+
+	// 1. for each namespace for which there is a topology constraint
+	// 2. for each TopologySpreadConstraint in that namespace
+	//      find all evictable pods in that namespace
+	// 3. for each evictable pod in that namespace
+	// 4. if the pod matches this TopologySpreadConstraint's LabelSelector
+	// 5. if the pod's nodeName is present in the nodeMap
+	// 6. create a topoPair with key as this TopologySpreadConstraint.TopologyKey and value as this pod's node label value for this TopologyKey
+	// 7. add the pod with key as this topoPair
+	// 8. find the min number of pods in any topoPair for this topologyKey
+	//      iterate through all topoPairs for this topologyKey and diff currentPods - minPods <= maxSkew
+	//      if diff > maxSkew, add this pod in the current bucket for eviction
+
+	// First record all of the constraints by namespace
+	namespaces, err := client.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		klog.ErrorS(err, "Couldn't list namespaces")
+		return
+	}
+	podsForEviction := make(map[*v1.Pod]struct{})
+	// 1. for each namespace...
+	for _, namespace := range namespaces.Items {
+		if (!includedNamespaces.Has(namespace.Name) || excludedNamespaces.Has(namespace.Name)) && (includedNamespaces.Len()+excludedNamespaces.Len() > 0) {
+			continue
+		}
+		namespacePods, err := client.CoreV1().Pods(namespace.Name).List(ctx, metav1.ListOptions{})
+		if err != nil {
+			klog.ErrorS(err, "Couldn't list pods in namespace", "namespace", namespace)
+			continue
+		}
+
+		// ...where there is a topology constraint
+		namespaceTopologySpreadConstraints := make(map[v1.TopologySpreadConstraint]struct{})
+		for _, pod := range namespacePods.Items {
+			for _, constraint := range pod.Spec.TopologySpreadConstraints {
+				// Only deal with hard topology constraints
+				// TODO(@damemi): add support for soft constraints
+				if constraint.WhenUnsatisfiable != v1.DoNotSchedule {
+					continue
+				}
+				namespaceTopologySpreadConstraints[constraint] = struct{}{}
+			}
+		}
+		if len(namespaceTopologySpreadConstraints) == 0 {
+			continue
+		}
+
+		// 2. for each topologySpreadConstraint in that namespace
+		for constraint := range namespaceTopologySpreadConstraints {
+			constraintTopologies := make(map[topologyPair][]*v1.Pod)
+			// pre-populate the topologyPair map with all the topologies available from the nodeMap
+			// (we can't just build it from existing pods' nodes because a topology may have 0 pods)
+			for _, node := range nodeMap {
+				if val, ok := node.Labels[constraint.TopologyKey]; ok {
+					constraintTopologies[topologyPair{key: constraint.TopologyKey, value: val}] = make([]*v1.Pod, 0)
+				}
+			}
+
+			selector, err := metav1.LabelSelectorAsSelector(constraint.LabelSelector)
+			if err != nil {
+				klog.ErrorS(err, "Couldn't parse label selector as selector", "selector", constraint.LabelSelector)
+				continue
+			}
+
+			// 3. for each evictable pod in that namespace
+			// (this loop is where we count the number of pods per topologyValue that match this constraint's selector)
+			var sumPods float64
+			for i := range namespacePods.Items {
+				// 4. if the pod matches this TopologySpreadConstraint's LabelSelector
+				if !selector.Matches(labels.Set(namespacePods.Items[i].Labels)) {
+					continue
+				}
+
+				// 5. if the pod's node has this constraint's topologyKey, create a topoPair and add the pod
+				node, ok := nodeMap[namespacePods.Items[i].Spec.NodeName]
+				if !ok {
+					// The pod isn't scheduled onto one of the processed nodes yet (node would be nil and
+					// node.Labels would panic below), so it's safe to just continue here.
+					continue
+				}
+				nodeValue, ok := node.Labels[constraint.TopologyKey]
+				if !ok {
+					continue
+				}
+				// 6. create a topoPair with key as this TopologySpreadConstraint's TopologyKey
+				topoPair := topologyPair{key: constraint.TopologyKey, value: nodeValue}
+				// 7. 
add the pod with key as this topoPair + constraintTopologies[topoPair] = append(constraintTopologies[topoPair], &namespacePods.Items[i]) + sumPods++ + } + balanceDomains(podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodeMap) + } + } + + for pod := range podsForEviction { + if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[pod.Spec.NodeName], "PodTopologySpread"); err != nil && !evictable.IsEvictable(pod) { + klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod)) + break + } + } +} + +// balanceDomains determines how many pods (minimum) should be evicted from large domains to achieve an ideal balance within maxSkew +// To actually determine how many pods need to be moved, we sort the topology domains in ascending length +// [2, 5, 3, 8, 5, 7] +// +// Would end up like: +// [2, 3, 5, 5, 7, 8] +// +// We then start at i=[0] and j=[len(list)-1] and compare the 2 topology sizes. +// If the diff of the size of the domains is more than the maxSkew, we will move up to half that skew, +// or the available pods from the higher domain, or the number required to bring the smaller domain up to the average, +// whichever number is less. +// +// (Note, we will only move as many pods from a domain as possible without bringing it below the ideal average, +// and we will not bring any smaller domain above the average) +// If the diff is within the skew, we move to the next highest domain. +// If the higher domain can't give any more without falling below the average, we move to the next lowest "high" domain +// +// Following this, the above topology domains end up "sorted" as: +// [5, 5, 5, 5, 5, 5] +// (assuming even distribution by the scheduler of the evicted pods) +func balanceDomains( + podsForEviction map[*v1.Pod]struct{}, + constraint v1.TopologySpreadConstraint, + constraintTopologies map[topologyPair][]*v1.Pod, + sumPods float64, + isEvictable func(*v1.Pod) bool, + nodeMap map[string]*v1.Node) { + idealAvg := sumPods / float64(len(constraintTopologies)) + sortedDomains := sortDomains(constraintTopologies, isEvictable) + // i is the index for belowOrEqualAvg + // j is the index for aboveAvg + i := 0 + j := len(sortedDomains) - 1 + for i < j { + // if j has no more to give without falling below the ideal average, move to next aboveAvg + if float64(len(sortedDomains[j].pods)) < idealAvg { + j-- + } + + // skew = actual difference between the domains + skew := float64(len(sortedDomains[j].pods) - len(sortedDomains[i].pods)) + + // if k and j are within the maxSkew of each other, move to next belowOrEqualAvg + if int32(skew) <= constraint.MaxSkew { + i++ + continue + } + + // the most that can be given from aboveAvg is: + // 1. up to half the distance between them, minus MaxSkew, rounded up + // 2. how many it has remaining without falling below the average rounded up, or + // 3. 
how many can be added without bringing the smaller domain above the average rounded up, + // whichever is less + // (This is the basic principle of keeping all sizes within ~skew of the average) + aboveAvg := math.Ceil(float64(len(sortedDomains[j].pods)) - idealAvg) + belowAvg := math.Ceil(idealAvg - float64(len(sortedDomains[i].pods))) + smallestDiff := math.Min(aboveAvg, belowAvg) + halfSkew := math.Ceil((skew - float64(constraint.MaxSkew)) / 2) + movePods := int(math.Min(smallestDiff, halfSkew)) + if movePods <= 0 { + i++ + continue + } + + // remove pods from the higher topology and add them to the list of pods to be evicted + // also (just for tracking), add them to the list of pods in the lower topology + aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:] + for k := range aboveToEvict { + // if the pod has a hard nodeAffinity or nodeSelector that only matches this node, + // don't bother evicting it as it will just end up back on the same node + // however we still account for it "being evicted" so the algorithm can complete + // TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod + // but still try to get the required # of movePods (instead of just chopping that value off the slice above) + if aboveToEvict[k].Spec.NodeSelector != nil || + (aboveToEvict[k].Spec.Affinity != nil && + aboveToEvict[k].Spec.Affinity.NodeAffinity != nil && + aboveToEvict[k].Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil && + nodesPodFitsOnBesidesCurrent(aboveToEvict[k], nodeMap) == 0) { + continue + } + podsForEviction[aboveToEvict[k]] = struct{}{} + } + sortedDomains[j].pods = sortedDomains[j].pods[:len(sortedDomains[j].pods)-movePods] + sortedDomains[i].pods = append(sortedDomains[i].pods, aboveToEvict...) + } +} + +// nodesPodFitsOnBesidesCurrent counts the number of nodes this pod could fit on based on its affinity +// It excludes the current node because, for the sake of domain balancing only, we care about if there is any other +// place it could theoretically fit. +// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading +func nodesPodFitsOnBesidesCurrent(pod *v1.Pod, nodeMap map[string]*v1.Node) int { + count := 0 + for _, node := range nodeMap { + if nodeutil.PodFitsCurrentNode(pod, node) && node != nodeMap[pod.Spec.NodeName] { + count++ + } + } + return count +} + +// sortDomains sorts and splits the list of topology domains based on their size +// it also sorts the list of pods within the domains based on their node affinity/selector and priority in the following order: +// 1. non-evictable pods +// 2. pods with selectors or affinity +// 3. pods in descending priority +// 4. 
all other pods
+// We then pop pods off the back of the list for eviction
+func sortDomains(constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(*v1.Pod) bool) []topology {
+	sortedTopologies := make([]topology, 0, len(constraintTopologyPairs))
+	// sort the pods within each domain, then sort the domains themselves by size (ascending)
+	for pair, list := range constraintTopologyPairs {
+		// Sort the pods within the domain so that, evicting from the back of the list,
+		// pods without affinity or a nodeSelector are evicted before pods that have them,
+		// and lower-priority pods are evicted before higher-priority ones
+		sort.Slice(list, func(i, j int) bool {
+			// any non-evictable pods should be considered last for eviction (ie, first in the list)
+			if !isEvictable(list[i]) || !isEvictable(list[j]) {
+				// if only i is non-evictable, return true to put it first
+				// if only j is non-evictable, return false to put j before i
+				// if both are non-evictable, the order doesn't matter
+				return !(isEvictable(list[i]) && !isEvictable(list[j]))
+			}
+
+			// if both pods have (or both lack) selectors/affinity, compare them by their priority
+			if hasSelectorOrAffinity(*list[i]) == hasSelectorOrAffinity(*list[j]) {
+				return comparePodsByPriority(list[i], list[j])
+			}
+			return hasSelectorOrAffinity(*list[i]) && !hasSelectorOrAffinity(*list[j])
+		})
+		sortedTopologies = append(sortedTopologies, topology{pair: pair, pods: list})
+	}
+
+	// create an ascending slice of all key-value topology pairs
+	sort.Slice(sortedTopologies, func(i, j int) bool {
+		return len(sortedTopologies[i].pods) < len(sortedTopologies[j].pods)
+	})
+
+	return sortedTopologies
+}
+
+func hasSelectorOrAffinity(pod v1.Pod) bool {
+	return pod.Spec.NodeSelector != nil || (pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil)
+}
+
+// comparePodsByPriority is a helper to the sort function to compare 2 pods based on their priority values.
+// It will sort the pods in DESCENDING order of priority, since in our logic we evict pods from the back
+// of the list first.
+func comparePodsByPriority(iPod, jPod *v1.Pod) bool {
+	if iPod.Spec.Priority != nil && jPod.Spec.Priority != nil {
+		// a LOWER priority value should be evicted FIRST
+		return *iPod.Spec.Priority > *jPod.Spec.Priority
+	} else if iPod.Spec.Priority != nil && jPod.Spec.Priority == nil {
+		// i should come before j
+		return true
+	} else if iPod.Spec.Priority == nil && jPod.Spec.Priority != nil {
+		// j should come before i
+		return false
+	} else {
+		// it doesn't matter. 
just return true + return true + } +} diff --git a/pkg/descheduler/strategies/topologyspreadconstraint_test.go b/pkg/descheduler/strategies/topologyspreadconstraint_test.go new file mode 100644 index 000000000..93057a230 --- /dev/null +++ b/pkg/descheduler/strategies/topologyspreadconstraint_test.go @@ -0,0 +1,340 @@ +package strategies + +import ( + "context" + "fmt" + "sigs.k8s.io/descheduler/pkg/api" + "testing" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + core "k8s.io/client-go/testing" + "sigs.k8s.io/descheduler/pkg/descheduler/evictions" + "sigs.k8s.io/descheduler/test" +) + +func TestTopologySpreadConstraint(t *testing.T) { + ctx := context.Background() + testCases := []struct { + name string + pods []*v1.Pod + expectedEvictedCount int + nodes []*v1.Node + strategy api.DeschedulerStrategy + namespaces []string + }{ + { + name: "2 domains, sizes [3,1], maxSkew=1, move 1 pod to achieve [2,2]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 2, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + { + count: 1, + node: "n2", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 1, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "2 domains, sizes [5,2], maxSkew=1, move 1 pod to achieve [4,3]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 4, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + { + count: 2, + node: "n2", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 1, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "2 domains, sizes [4,0], maxSkew=1, move 2 pods to achieve [2,2]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 3, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 2, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "2 
domains, sizes [4,0], maxSkew=1, only move 1 pod since pods with nodeSelector and nodeAffinity aren't evicted", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + nodeSelector: map[string]string{"zone": "zoneA"}, + }, + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + nodeSelector: map[string]string{"zone": "zoneA"}, + }, + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + nodeAffinity: &v1.Affinity{NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{NodeSelectorTerms: []v1.NodeSelectorTerm{ + {MatchExpressions: []v1.NodeSelectorRequirement{{Key: "foo", Values: []string{"bar"}, Operator: v1.NodeSelectorOpIn}}}, + }}, + }}, + }, + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 1, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "3 domains, sizes [0, 1, 100], maxSkew=1, move 66 pods to get [34, 33, 34]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n2", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 100, + node: "n3", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 66, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "4 domains, sizes [0, 1, 3, 5], should move 3 to get [2, 2, 3, 2]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }), + test.BuildTestNode("n4", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneD" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n2", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 3, + node: "n3", + labels: map[string]string{"foo": "bar"}, + }, + { + count: 5, + node: "n4", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 3, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "2 domains size [2 6], maxSkew=2, should move 1 to get [3 5]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, 
func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 2, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + { + count: 6, + node: "n2", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 1, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + fakeClient := &fake.Clientset{} + fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { + podList := make([]v1.Pod, 0, len(tc.pods)) + for _, pod := range tc.pods { + podList = append(podList, *pod) + } + return true, &v1.PodList{Items: podList}, nil + }) + fakeClient.Fake.AddReactor("list", "namespaces", func(action core.Action) (bool, runtime.Object, error) { + return true, &v1.NamespaceList{Items: []v1.Namespace{{ObjectMeta: metav1.ObjectMeta{Name: "ns1", Namespace: "ns1"}}}}, nil + }) + + podEvictor := evictions.NewPodEvictor( + fakeClient, + "v1", + false, + 100, + tc.nodes, + false, + ) + RemovePodsViolatingTopologySpreadConstraint(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor) + podsEvicted := podEvictor.TotalEvicted() + if podsEvicted != tc.expectedEvictedCount { + t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", tc.name, tc.expectedEvictedCount, podsEvicted) + } + }) + } +} + +type testPodList struct { + count int + node string + labels map[string]string + constraints []v1.TopologySpreadConstraint + nodeSelector map[string]string + nodeAffinity *v1.Affinity +} + +func createTestPods(testPods []testPodList) []*v1.Pod { + ownerRef1 := test.GetReplicaSetOwnerRefList() + pods := make([]*v1.Pod, 0) + podNum := 0 + for _, tp := range testPods { + for i := 0; i < tp.count; i++ { + pods = append(pods, + test.BuildTestPod(fmt.Sprintf("pod-%d", podNum), 100, 0, tp.node, func(p *v1.Pod) { + p.Labels = make(map[string]string) + p.Labels = tp.labels + p.Namespace = "ns1" + p.ObjectMeta.OwnerReferences = ownerRef1 + p.Spec.TopologySpreadConstraints = tp.constraints + p.Spec.NodeSelector = tp.nodeSelector + p.Spec.Affinity = tp.nodeAffinity + })) + podNum++ + } + } + return pods +}
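
The doc comment on `balanceDomains` above bounds how many pods may move from a large domain to a small one in a single step: the smallest of half the excess skew, the large domain's surplus over the ideal average, and the small domain's deficit below it, each rounded up. A minimal standalone sketch of that arithmetic (hypothetical names, not part of this change):

```go
package main

import (
	"fmt"
	"math"
)

// movablePods mirrors the per-pair calculation in balanceDomains: move at most half the skew
// beyond maxSkew (rounded up), without taking the large domain below the ideal average or
// pushing the small domain above it.
func movablePods(small, large, idealAvg float64, maxSkew int32) int {
	skew := large - small
	if int32(skew) <= maxSkew {
		return 0
	}
	aboveAvg := math.Ceil(large - idealAvg)
	belowAvg := math.Ceil(idealAvg - small)
	halfSkew := math.Ceil((skew - float64(maxSkew)) / 2)
	return int(math.Min(math.Min(aboveAvg, belowAvg), halfSkew))
}

func main() {
	// Two zones holding 8 and 2 matching pods, ideal average 5, maxSkew 1:
	// aboveAvg=3, belowAvg=3, halfSkew=ceil((6-1)/2)=3, so 3 pods are candidates to move.
	fmt.Println(movablePods(2, 8, 5, 1))
}
```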