1
0
mirror of https://github.com/kubernetes-sigs/descheduler.git synced 2026-01-26 05:14:13 +01:00

Add RemovePodsViolatingTopologySpreadConstraint strategy

This adds a strategy to balance pod topology domains based on the scheduler's
PodTopologySpread constraints. It attempts to find the minimum number of pods
that should be sent for eviction by comparing the largest domains in a topology
with the smallest domains in that topology.
This commit is contained in:
Mayank Kumar
2019-05-23 13:46:09 -07:00
committed by Mike Dame
parent 6240aa68f7
commit 4108362158
6 changed files with 750 additions and 23 deletions

View File

@@ -37,6 +37,7 @@ Table of Contents
* [RemovePodsViolatingInterPodAntiAffinity](#removepodsviolatinginterpodantiaffinity)
* [RemovePodsViolatingNodeAffinity](#removepodsviolatingnodeaffinity)
* [RemovePodsViolatingNodeTaints](#removepodsviolatingnodetaints)
* [RemovePodsViolatingTopologySpreadConstraint](#removepodsviolatingtopologyspreadconstraint)
* [RemovePodsHavingTooManyRestarts](#removepodshavingtoomanyrestarts)
* [PodLifeTime](#podlifetime)
* [Filter Pods](#filter-pods)
@@ -102,17 +103,17 @@ See the [user guide](docs/user-guide.md) in the `/docs` directory.
## Policy and Strategies
Descheduler's policy is configurable and includes strategies that can be enabled or disabled.
Seven strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`,
`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsHavingTooManyRestarts`, and `PodLifeTime`
are currently implemented. As part of the policy, the parameters associated with the strategies can be configured too.
By default, all strategies are enabled.
Eight strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`,
`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsViolatingTopologySpreadConstraint`,
`RemovePodsHavingTooManyRestarts`, and `PodLifeTime` are currently implemented. As part of the policy, the
parameters associated with the strategies can be configured too. By default, all strategies are enabled.
The policy also includes common configuration for all the strategies:
- `nodeSelector` - limiting the nodes which are processed
- `evictLocalStoragePods` - allowing to evict pods with local storage
- `maxNoOfPodsToEvictPerNode` - maximum number of pods evicted from each node (summed through all strategies)
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
nodeSelector: prod=dev
@@ -144,7 +145,7 @@ has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considere
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
**Example:**
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -186,7 +187,7 @@ These thresholds, `thresholds` and `targetThresholds`, could be tuned as per you
**Example:**
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -236,7 +237,7 @@ node.
**Example:**
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -273,7 +274,7 @@ podA gets evicted from nodeA.
**Example:**
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -301,7 +302,7 @@ and will be evicted.
**Example:**
````
````yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -309,6 +310,31 @@ strategies:
enabled: true
````
### RemovePodsViolatingTopologySpreadConstraint
This strategy makes sure that pods violating [topology spread constraints](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/)
are evicted from nodes. Specifically, it tries to evict the minimum number of pods required to balance topology domains to within each constraint's `maxSkew`.
This strategy requires k8s version 1.18 at a minimum.
**Parameters:**
|Name|Type|
|---|---|
|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
|`namespaces`|(see [namespace filtering](#namespace-filtering))|
**Example:**
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemovePodsViolatingTopologySpreadConstraint":
enabled: true
```
### RemovePodsHavingTooManyRestarts
This strategy makes sure that pods having too many restarts are removed from nodes. For example a pod with EBS/PD that
@@ -328,7 +354,7 @@ which determines whether init container restarts should be factored into that ca
**Example:**
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -359,7 +385,7 @@ to `Running` and `Pending`.
**Example:**
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -383,10 +409,11 @@ The following strategies accept a `namespaces` parameter which allows to specify
* `RemovePodsViolatingNodeAffinity`
* `RemovePodsViolatingInterPodAntiAffinity`
* `RemoveDuplicates`
* `RemovePodsViolatingTopologySpreadConstraint`
For example:
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -404,7 +431,7 @@ strategies:
In the examples `PodLifeTime` gets executed only over `namespace1` and `namespace2`.
The similar holds for `exclude` field:
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -432,7 +459,7 @@ is set to the value of `system-cluster-critical` priority class.
E.g.
Setting `thresholdPriority`
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
@@ -445,7 +472,7 @@ strategies:
```
Setting `thresholdPriorityClassName`
```
```yaml
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:

1
go.sum
View File

@@ -126,6 +126,7 @@ github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7a
github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls=
github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903 h1:LbsanbbD6LieFkXbj9YNNBupiGHJgFeLpO0j0Fza1h8=
github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=

View File

@@ -10,6 +10,9 @@ rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "watch", "list"]
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "watch", "list", "delete"]

View File

@@ -70,13 +70,14 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
sharedInformerFactory.WaitForCacheSync(stopChannel)
strategyFuncs := map[string]strategyFunction{
"RemoveDuplicates": strategies.RemoveDuplicatePods,
"LowNodeUtilization": strategies.LowNodeUtilization,
"RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity,
"RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity,
"RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints,
"RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts,
"PodLifeTime": strategies.PodLifeTime,
"RemoveDuplicates": strategies.RemoveDuplicatePods,
"LowNodeUtilization": strategies.LowNodeUtilization,
"RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity,
"RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity,
"RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints,
"RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts,
"PodLifeTime": strategies.PodLifeTime,
"RemovePodsViolatingTopologySpreadConstraint": strategies.RemovePodsViolatingTopologySpreadConstraint,
}
nodeSelector := rs.NodeSelector

View File

@@ -0,0 +1,355 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package strategies
import (
"context"
"fmt"
"math"
"sort"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
"sigs.k8s.io/descheduler/pkg/api"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
"sigs.k8s.io/descheduler/pkg/utils"
)
// AntiAffinityTerm's topology key value used in predicate metadata
type topologyPair struct {
key string
value string
}
type topology struct {
pair topologyPair
pods []*v1.Pod
}
func validateAndParseTopologySpreadParams(ctx context.Context, client clientset.Interface, params *api.StrategyParameters) (int32, sets.String, sets.String, error) {
var includedNamespaces, excludedNamespaces sets.String
if params == nil {
return 0, includedNamespaces, excludedNamespaces, nil
}
// At most one of include/exclude can be set
if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("only one of Include/Exclude namespaces can be set")
}
if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
}
thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, params)
if err != nil {
return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("failed to get threshold priority from strategy's params: %+v", err)
}
if params.Namespaces != nil {
includedNamespaces = sets.NewString(params.Namespaces.Include...)
excludedNamespaces = sets.NewString(params.Namespaces.Exclude...)
}
return thresholdPriority, includedNamespaces, excludedNamespaces, nil
}
func RemovePodsViolatingTopologySpreadConstraint(
ctx context.Context,
client clientset.Interface,
strategy api.DeschedulerStrategy,
nodes []*v1.Node,
podEvictor *evictions.PodEvictor,
) {
thresholdPriority, includedNamespaces, excludedNamespaces, err := validateAndParseTopologySpreadParams(ctx, client, strategy.Params)
if err != nil {
klog.ErrorS(err, "Invalid PodLifeTime parameters")
return
}
nodeMap := make(map[string]*v1.Node, len(nodes))
for _, node := range nodes {
nodeMap[node.Name] = node
}
evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority))
// 1. for each namespace for which there is Topology Constraint
// 2. for each TopologySpreadyConstraint in that namespace
// { find all evictable pods in that namespace
// { 3. for each evictable pod in that namespace
// 4. If the pod matches this TopologySpreadConstraint LabelSelector
// 5. If the pod nodeName is present in the nodeMap
// 6. create a topoPair with key as this TopologySpreadConstraint.TopologyKey and value as this pod's Node Label Value for this TopologyKey
// 7. add the pod with key as this topoPair
// 8. find the min number of pods in any topoPair for this topologyKey
// iterate through all topoPairs for this topologyKey and diff currentPods -minPods <=maxSkew
// if diff > maxSkew, add this pod in the current bucket for eviction
// First record all of the constraints by namespace
namespaces, err := client.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
if err != nil {
klog.ErrorS(err, "Couldn't list namespaces")
return
}
podsForEviction := make(map[*v1.Pod]struct{})
// 1. for each namespace...
for _, namespace := range namespaces.Items {
if (!includedNamespaces.Has(namespace.Name) || excludedNamespaces.Has(namespace.Name)) && (includedNamespaces.Len()+excludedNamespaces.Len() > 0) {
continue
}
namespacePods, err := client.CoreV1().Pods(namespace.Name).List(ctx, metav1.ListOptions{})
if err != nil {
klog.ErrorS(err, "Couldn't list pods in namespace", "namespace", namespace)
continue
}
// ...where there is a topology constraint
//namespaceTopologySpreadConstrainPods := make([]v1.Pod, 0, len(namespacePods.Items))
namespaceTopologySpreadConstraints := make(map[v1.TopologySpreadConstraint]struct{})
for _, pod := range namespacePods.Items {
for _, constraint := range pod.Spec.TopologySpreadConstraints {
// Only deal with hard topology constraints
// TODO(@damemi): add support for soft constraints
if constraint.WhenUnsatisfiable != v1.DoNotSchedule {
continue
}
namespaceTopologySpreadConstraints[constraint] = struct{}{}
}
}
if len(namespaceTopologySpreadConstraints) == 0 {
continue
}
// 2. for each topologySpreadConstraint in that namespace
for constraint := range namespaceTopologySpreadConstraints {
constraintTopologies := make(map[topologyPair][]*v1.Pod)
// pre-populate the topologyPair map with all the topologies available from the nodeMap
// (we can't just build it from existing pods' nodes because a topology may have 0 pods)
for _, node := range nodeMap {
if val, ok := node.Labels[constraint.TopologyKey]; ok {
constraintTopologies[topologyPair{key: constraint.TopologyKey, value: val}] = make([]*v1.Pod, 0)
}
}
selector, err := metav1.LabelSelectorAsSelector(constraint.LabelSelector)
if err != nil {
klog.ErrorS(err, "Couldn't parse label selector as selector", "selector", constraint.LabelSelector)
continue
}
// 3. for each evictable pod in that namespace
// (this loop is where we count the number of pods per topologyValue that match this constraint'selector selector)
var sumPods float64
for i := range namespacePods.Items {
// 4. if the pod matches this TopologySpreadConstraint LabelSelector
if !selector.Matches(labels.Set(namespacePods.Items[i].Labels)) {
continue
}
// 5. If the pod'selector node matches this constraint'selector topologyKey, create a topoPair and add the pod
node, ok := nodeMap[namespacePods.Items[i].Spec.NodeName]
if !ok {
// If ok is false, node is nil in which case node.Labels will panic. In which case a pod is yet to be scheduled. So it's safe to just continue here.
continue
}
nodeValue, ok := node.Labels[constraint.TopologyKey]
if !ok {
continue
}
// 6. create a topoPair with key as this TopologySpreadConstraint
topoPair := topologyPair{key: constraint.TopologyKey, value: nodeValue}
// 7. add the pod with key as this topoPair
constraintTopologies[topoPair] = append(constraintTopologies[topoPair], &namespacePods.Items[i])
sumPods++
}
balanceDomains(podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodeMap)
}
}
for pod := range podsForEviction {
if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[pod.Spec.NodeName], "PodTopologySpread"); err != nil && !evictable.IsEvictable(pod) {
klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
break
}
}
}
// balanceDomains determines how many pods (minimum) should be evicted from large domains to achieve an ideal balance within maxSkew
// To actually determine how many pods need to be moved, we sort the topology domains in ascending length
// [2, 5, 3, 8, 5, 7]
//
// Would end up like:
// [2, 3, 5, 5, 7, 8]
//
// We then start at i=[0] and j=[len(list)-1] and compare the 2 topology sizes.
// If the diff of the size of the domains is more than the maxSkew, we will move up to half that skew,
// or the available pods from the higher domain, or the number required to bring the smaller domain up to the average,
// whichever number is less.
//
// (Note, we will only move as many pods from a domain as possible without bringing it below the ideal average,
// and we will not bring any smaller domain above the average)
// If the diff is within the skew, we move to the next highest domain.
// If the higher domain can't give any more without falling below the average, we move to the next lowest "high" domain
//
// Following this, the above topology domains end up "sorted" as:
// [5, 5, 5, 5, 5, 5]
// (assuming even distribution by the scheduler of the evicted pods)
func balanceDomains(
podsForEviction map[*v1.Pod]struct{},
constraint v1.TopologySpreadConstraint,
constraintTopologies map[topologyPair][]*v1.Pod,
sumPods float64,
isEvictable func(*v1.Pod) bool,
nodeMap map[string]*v1.Node) {
idealAvg := sumPods / float64(len(constraintTopologies))
sortedDomains := sortDomains(constraintTopologies, isEvictable)
// i is the index for belowOrEqualAvg
// j is the index for aboveAvg
i := 0
j := len(sortedDomains) - 1
for i < j {
// if j has no more to give without falling below the ideal average, move to next aboveAvg
if float64(len(sortedDomains[j].pods)) < idealAvg {
j--
}
// skew = actual difference between the domains
skew := float64(len(sortedDomains[j].pods) - len(sortedDomains[i].pods))
// if k and j are within the maxSkew of each other, move to next belowOrEqualAvg
if int32(skew) <= constraint.MaxSkew {
i++
continue
}
// the most that can be given from aboveAvg is:
// 1. up to half the distance between them, minus MaxSkew, rounded up
// 2. how many it has remaining without falling below the average rounded up, or
// 3. how many can be added without bringing the smaller domain above the average rounded up,
// whichever is less
// (This is the basic principle of keeping all sizes within ~skew of the average)
aboveAvg := math.Ceil(float64(len(sortedDomains[j].pods)) - idealAvg)
belowAvg := math.Ceil(idealAvg - float64(len(sortedDomains[i].pods)))
smallestDiff := math.Min(aboveAvg, belowAvg)
halfSkew := math.Ceil((skew - float64(constraint.MaxSkew)) / 2)
movePods := int(math.Min(smallestDiff, halfSkew))
if movePods <= 0 {
i++
continue
}
// remove pods from the higher topology and add them to the list of pods to be evicted
// also (just for tracking), add them to the list of pods in the lower topology
aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:]
for k := range aboveToEvict {
// if the pod has a hard nodeAffinity or nodeSelector that only matches this node,
// don't bother evicting it as it will just end up back on the same node
// however we still account for it "being evicted" so the algorithm can complete
// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
// but still try to get the required # of movePods (instead of just chopping that value off the slice above)
if aboveToEvict[k].Spec.NodeSelector != nil ||
(aboveToEvict[k].Spec.Affinity != nil &&
aboveToEvict[k].Spec.Affinity.NodeAffinity != nil &&
aboveToEvict[k].Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil &&
nodesPodFitsOnBesidesCurrent(aboveToEvict[k], nodeMap) == 0) {
continue
}
podsForEviction[aboveToEvict[k]] = struct{}{}
}
sortedDomains[j].pods = sortedDomains[j].pods[:len(sortedDomains[j].pods)-movePods]
sortedDomains[i].pods = append(sortedDomains[i].pods, aboveToEvict...)
}
}
// nodesPodFitsOnBesidesCurrent counts the number of nodes this pod could fit on based on its affinity
// It excludes the current node because, for the sake of domain balancing only, we care about if there is any other
// place it could theoretically fit.
// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading
func nodesPodFitsOnBesidesCurrent(pod *v1.Pod, nodeMap map[string]*v1.Node) int {
count := 0
for _, node := range nodeMap {
if nodeutil.PodFitsCurrentNode(pod, node) && node != nodeMap[pod.Spec.NodeName] {
count++
}
}
return count
}
// sortDomains sorts and splits the list of topology domains based on their size
// it also sorts the list of pods within the domains based on their node affinity/selector and priority in the following order:
// 1. non-evictable pods
// 2. pods with selectors or affinity
// 3. pods in descending priority
// 4. all other pods
// We then pop pods off the back of the list for eviction
func sortDomains(constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(*v1.Pod) bool) []topology {
sortedTopologies := make([]topology, 0, len(constraintTopologyPairs))
// sort the topologies and return 2 lists: those <= the average and those > the average (> list inverted)
for pair, list := range constraintTopologyPairs {
// Sort the pods within the domain so that the lowest priority pods are considered first for eviction,
// followed by the highest priority,
// followed by the lowest priority pods with affinity or nodeSelector,
// followed by the highest priority pods with affinity or nodeSelector
sort.Slice(list, func(i, j int) bool {
// any non-evictable pods should be considered last (ie, first in the list)
if !isEvictable(list[i]) || !isEvictable(list[j]) {
// false - i is the only non-evictable, so return true to put it first
// true - j is non-evictable, so return false to put j before i
// if true and both and non-evictable, order doesn't matter
return !(isEvictable(list[i]) && !isEvictable(list[j]))
}
// if both pods have selectors/affinity, compare them by their priority
if hasSelectorOrAffinity(*list[i]) == hasSelectorOrAffinity(*list[j]) {
comparePodsByPriority(list[i], list[j])
}
return hasSelectorOrAffinity(*list[i]) && !hasSelectorOrAffinity(*list[j])
})
sortedTopologies = append(sortedTopologies, topology{pair: pair, pods: list})
}
// create an ascending slice of all key-value toplogy pairs
sort.Slice(sortedTopologies, func(i, j int) bool {
return len(sortedTopologies[i].pods) < len(sortedTopologies[j].pods)
})
return sortedTopologies
}
func hasSelectorOrAffinity(pod v1.Pod) bool {
return pod.Spec.NodeSelector != nil || (pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil)
}
// comparePodsByPriority is a helper to the sort function to compare 2 pods based on their priority values
// It will sort the pods in DESCENDING order of priority, since in our logic we evict pods from the back
// of the list first.
func comparePodsByPriority(iPod, jPod *v1.Pod) bool {
if iPod.Spec.Priority != nil && jPod.Spec.Priority != nil {
// a LOWER priority value should be evicted FIRST
return *iPod.Spec.Priority > *jPod.Spec.Priority
} else if iPod.Spec.Priority != nil && jPod.Spec.Priority == nil {
// i should come before j
return true
} else if iPod.Spec.Priority == nil && jPod.Spec.Priority != nil {
// j should come before i
return false
} else {
// it doesn't matter. just return true
return true
}
}

View File

@@ -0,0 +1,340 @@
package strategies
import (
"context"
"fmt"
"sigs.k8s.io/descheduler/pkg/api"
"testing"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
"sigs.k8s.io/descheduler/test"
)
func TestTopologySpreadConstraint(t *testing.T) {
ctx := context.Background()
testCases := []struct {
name string
pods []*v1.Pod
expectedEvictedCount int
nodes []*v1.Node
strategy api.DeschedulerStrategy
namespaces []string
}{
{
name: "2 domains, sizes [3,1], maxSkew=1, move 1 pod to achieve [2,2]",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 2,
node: "n1",
labels: map[string]string{"foo": "bar"},
},
{
count: 1,
node: "n2",
labels: map[string]string{"foo": "bar"},
},
}),
expectedEvictedCount: 1,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [5,2], maxSkew=1, move 1 pod to achieve [4,3]",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 4,
node: "n1",
labels: map[string]string{"foo": "bar"},
},
{
count: 2,
node: "n2",
labels: map[string]string{"foo": "bar"},
},
}),
expectedEvictedCount: 1,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [4,0], maxSkew=1, move 2 pods to achieve [2,2]",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 3,
node: "n1",
labels: map[string]string{"foo": "bar"},
},
}),
expectedEvictedCount: 2,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [4,0], maxSkew=1, only move 1 pod since pods with nodeSelector and nodeAffinity aren't evicted",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
nodeSelector: map[string]string{"zone": "zoneA"},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
nodeSelector: map[string]string{"zone": "zoneA"},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
nodeAffinity: &v1.Affinity{NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{NodeSelectorTerms: []v1.NodeSelectorTerm{
{MatchExpressions: []v1.NodeSelectorRequirement{{Key: "foo", Values: []string{"bar"}, Operator: v1.NodeSelectorOpIn}}},
}},
}},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
},
}),
expectedEvictedCount: 1,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "3 domains, sizes [0, 1, 100], maxSkew=1, move 66 pods to get [34, 33, 34]",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n2",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 100,
node: "n3",
labels: map[string]string{"foo": "bar"},
},
}),
expectedEvictedCount: 66,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "4 domains, sizes [0, 1, 3, 5], should move 3 to get [2, 2, 3, 2]",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }),
test.BuildTestNode("n4", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneD" }),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n2",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 3,
node: "n3",
labels: map[string]string{"foo": "bar"},
},
{
count: 5,
node: "n4",
labels: map[string]string{"foo": "bar"},
},
}),
expectedEvictedCount: 3,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains size [2 6], maxSkew=2, should move 1 to get [3 5]",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 2,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
},
{
count: 6,
node: "n2",
labels: map[string]string{"foo": "bar"},
},
}),
expectedEvictedCount: 1,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
podList := make([]v1.Pod, 0, len(tc.pods))
for _, pod := range tc.pods {
podList = append(podList, *pod)
}
return true, &v1.PodList{Items: podList}, nil
})
fakeClient.Fake.AddReactor("list", "namespaces", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.NamespaceList{Items: []v1.Namespace{{ObjectMeta: metav1.ObjectMeta{Name: "ns1", Namespace: "ns1"}}}}, nil
})
podEvictor := evictions.NewPodEvictor(
fakeClient,
"v1",
false,
100,
tc.nodes,
false,
)
RemovePodsViolatingTopologySpreadConstraint(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)
podsEvicted := podEvictor.TotalEvicted()
if podsEvicted != tc.expectedEvictedCount {
t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", tc.name, tc.expectedEvictedCount, podsEvicted)
}
})
}
}
type testPodList struct {
count int
node string
labels map[string]string
constraints []v1.TopologySpreadConstraint
nodeSelector map[string]string
nodeAffinity *v1.Affinity
}
func createTestPods(testPods []testPodList) []*v1.Pod {
ownerRef1 := test.GetReplicaSetOwnerRefList()
pods := make([]*v1.Pod, 0)
podNum := 0
for _, tp := range testPods {
for i := 0; i < tp.count; i++ {
pods = append(pods,
test.BuildTestPod(fmt.Sprintf("pod-%d", podNum), 100, 0, tp.node, func(p *v1.Pod) {
p.Labels = make(map[string]string)
p.Labels = tp.labels
p.Namespace = "ns1"
p.ObjectMeta.OwnerReferences = ownerRef1
p.Spec.TopologySpreadConstraints = tp.constraints
p.Spec.NodeSelector = tp.nodeSelector
p.Spec.Affinity = tp.nodeAffinity
}))
podNum++
}
}
return pods
}