From 41083621581501a68a71d9a5baf239cae53a23ab Mon Sep 17 00:00:00 2001 From: Mayank Kumar Date: Thu, 23 May 2019 13:46:09 -0700 Subject: [PATCH] Add RemovePodsViolatingTopologySpreadConstraint strategy This adds a strategy to balance pod topology domains based on the scheduler's PodTopologySpread constraints. It attempts to find the minimum number of pods that should be sent for eviction by comparing the largest domains in a topology with the smallest domains in that topology. --- README.md | 59 ++- go.sum | 1 + kubernetes/base/rbac.yaml | 3 + pkg/descheduler/descheduler.go | 15 +- .../strategies/topologyspreadconstraint.go | 355 ++++++++++++++++++ .../topologyspreadconstraint_test.go | 340 +++++++++++++++++ 6 files changed, 750 insertions(+), 23 deletions(-) create mode 100644 pkg/descheduler/strategies/topologyspreadconstraint.go create mode 100644 pkg/descheduler/strategies/topologyspreadconstraint_test.go diff --git a/README.md b/README.md index e6ff22ff3..f0176ce33 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ Table of Contents * [RemovePodsViolatingInterPodAntiAffinity](#removepodsviolatinginterpodantiaffinity) * [RemovePodsViolatingNodeAffinity](#removepodsviolatingnodeaffinity) * [RemovePodsViolatingNodeTaints](#removepodsviolatingnodetaints) + * [RemovePodsViolatingTopologySpreadConstraint](#removepodsviolatingtopologyspreadconstraint) * [RemovePodsHavingTooManyRestarts](#removepodshavingtoomanyrestarts) * [PodLifeTime](#podlifetime) * [Filter Pods](#filter-pods) @@ -102,17 +103,17 @@ See the [user guide](docs/user-guide.md) in the `/docs` directory. ## Policy and Strategies Descheduler's policy is configurable and includes strategies that can be enabled or disabled. -Seven strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`, -`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsHavingTooManyRestarts`, and `PodLifeTime` -are currently implemented. As part of the policy, the parameters associated with the strategies can be configured too. -By default, all strategies are enabled. +Eight strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`, +`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsViolatingTopologySpreadConstraint`, +`RemovePodsHavingTooManyRestarts`, and `PodLifeTime` are currently implemented. As part of the policy, the +parameters associated with the strategies can be configured too. By default, all strategies are enabled. The policy also includes common configuration for all the strategies: - `nodeSelector` - limiting the nodes which are processed - `evictLocalStoragePods` - allowing to evict pods with local storage - `maxNoOfPodsToEvictPerNode` - maximum number of pods evicted from each node (summed through all strategies) -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" nodeSelector: prod=dev @@ -144,7 +145,7 @@ has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considere |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -186,7 +187,7 @@ These thresholds, `thresholds` and `targetThresholds`, could be tuned as per you **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -236,7 +237,7 @@ node. 
**Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -273,7 +274,7 @@ podA gets evicted from nodeA. **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -301,7 +302,7 @@ and will be evicted. **Example:** -```` +````yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -309,6 +310,31 @@ strategies: enabled: true ```` +### RemovePodsViolatingTopologySpreadConstraint + +This strategy makes sure that pods violating [topology spread constraints](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/) +are evicted from nodes. Specifically, it tries to evict the minimum number of pods required to balance topology domains to within each constraint's `maxSkew`. +This strategy requires k8s version 1.18 at a minimum. + +**Parameters:** + +|Name|Type| +|---|---| +|`thresholdPriority`|int (see [priority filtering](#priority-filtering))| +|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| +|`namespaces`|(see [namespace filtering](#namespace-filtering))| + +**Example:** + +```yaml +apiVersion: "descheduler/v1alpha1" +kind: "DeschedulerPolicy" +strategies: + "RemovePodsViolatingTopologySpreadConstraint": + enabled: true +``` + + ### RemovePodsHavingTooManyRestarts This strategy makes sure that pods having too many restarts are removed from nodes. For example a pod with EBS/PD that @@ -328,7 +354,7 @@ which determines whether init container restarts should be factored into that ca **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -359,7 +385,7 @@ to `Running` and `Pending`. **Example:** -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -383,10 +409,11 @@ The following strategies accept a `namespaces` parameter which allows to specify * `RemovePodsViolatingNodeAffinity` * `RemovePodsViolatingInterPodAntiAffinity` * `RemoveDuplicates` +* `RemovePodsViolatingTopologySpreadConstraint` For example: -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -404,7 +431,7 @@ strategies: In the examples `PodLifeTime` gets executed only over `namespace1` and `namespace2`. The similar holds for `exclude` field: -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -432,7 +459,7 @@ is set to the value of `system-cluster-critical` priority class. E.g. 
Setting `thresholdPriority` -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: @@ -445,7 +472,7 @@ strategies: ``` Setting `thresholdPriorityClassName` -``` +```yaml apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: diff --git a/go.sum b/go.sum index f2139bf14..e5b3d28df 100644 --- a/go.sum +++ b/go.sum @@ -126,6 +126,7 @@ github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7a github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls= github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903 h1:LbsanbbD6LieFkXbj9YNNBupiGHJgFeLpO0j0Fza1h8= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= diff --git a/kubernetes/base/rbac.yaml b/kubernetes/base/rbac.yaml index f07af760a..7a9e31fec 100644 --- a/kubernetes/base/rbac.yaml +++ b/kubernetes/base/rbac.yaml @@ -10,6 +10,9 @@ rules: - apiGroups: [""] resources: ["nodes"] verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list"] - apiGroups: [""] resources: ["pods"] verbs: ["get", "watch", "list", "delete"] diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go index dff3e62a4..a6d86fd92 100644 --- a/pkg/descheduler/descheduler.go +++ b/pkg/descheduler/descheduler.go @@ -70,13 +70,14 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer sharedInformerFactory.WaitForCacheSync(stopChannel) strategyFuncs := map[string]strategyFunction{ - "RemoveDuplicates": strategies.RemoveDuplicatePods, - "LowNodeUtilization": strategies.LowNodeUtilization, - "RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity, - "RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity, - "RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints, - "RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts, - "PodLifeTime": strategies.PodLifeTime, + "RemoveDuplicates": strategies.RemoveDuplicatePods, + "LowNodeUtilization": strategies.LowNodeUtilization, + "RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity, + "RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity, + "RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints, + "RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts, + "PodLifeTime": strategies.PodLifeTime, + "RemovePodsViolatingTopologySpreadConstraint": strategies.RemovePodsViolatingTopologySpreadConstraint, } nodeSelector := rs.NodeSelector diff --git a/pkg/descheduler/strategies/topologyspreadconstraint.go b/pkg/descheduler/strategies/topologyspreadconstraint.go new file mode 100644 index 000000000..ce43c54b5 --- /dev/null +++ b/pkg/descheduler/strategies/topologyspreadconstraint.go @@ -0,0 +1,355 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package strategies
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"sort"
+
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/util/sets"
+	clientset "k8s.io/client-go/kubernetes"
+	"k8s.io/klog/v2"
+	"sigs.k8s.io/descheduler/pkg/api"
+	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
+	nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
+	"sigs.k8s.io/descheduler/pkg/utils"
+)
+
+// topologyPair is a topology key/value pair identifying one domain of a topology (e.g. zone=zoneA)
+type topologyPair struct {
+	key   string
+	value string
+}
+
+// topology is one domain together with the pods currently counted against it
+type topology struct {
+	pair topologyPair
+	pods []*v1.Pod
+}
+
+func validateAndParseTopologySpreadParams(ctx context.Context, client clientset.Interface, params *api.StrategyParameters) (int32, sets.String, sets.String, error) {
+	var includedNamespaces, excludedNamespaces sets.String
+	if params == nil {
+		return 0, includedNamespaces, excludedNamespaces, nil
+	}
+	// At most one of include/exclude can be set
+	if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
+		return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("only one of Include/Exclude namespaces can be set")
+	}
+	if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
+		return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
+	}
+	thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, params)
+	if err != nil {
+		return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("failed to get threshold priority from strategy's params: %+v", err)
+	}
+	if params.Namespaces != nil {
+		includedNamespaces = sets.NewString(params.Namespaces.Include...)
+		excludedNamespaces = sets.NewString(params.Namespaces.Exclude...)
+	}
+
+	return thresholdPriority, includedNamespaces, excludedNamespaces, nil
+}
+
+func RemovePodsViolatingTopologySpreadConstraint(
+	ctx context.Context,
+	client clientset.Interface,
+	strategy api.DeschedulerStrategy,
+	nodes []*v1.Node,
+	podEvictor *evictions.PodEvictor,
+) {
+	thresholdPriority, includedNamespaces, excludedNamespaces, err := validateAndParseTopologySpreadParams(ctx, client, strategy.Params)
+	if err != nil {
+		klog.ErrorS(err, "Invalid RemovePodsViolatingTopologySpreadConstraint parameters")
+		return
+	}
+
+	nodeMap := make(map[string]*v1.Node, len(nodes))
+	for _, node := range nodes {
+		nodeMap[node.Name] = node
+	}
+	evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority))
+
+	// 1. for each namespace for which there is a topology constraint
+	// 2. for each TopologySpreadConstraint in that namespace
+	//    find all evictable pods in that namespace
+	// 3. for each evictable pod in that namespace
+	// 4. if the pod matches this TopologySpreadConstraint's LabelSelector
+	// 5. if the pod's nodeName is present in the nodeMap
+	// 6. create a topoPair with key as this TopologySpreadConstraint.TopologyKey and value as this pod's node label value for that key
+	// 7. add the pod under that topoPair
+	// 8. find the min number of pods in any topoPair for this topologyKey,
+	//    iterate through all topoPairs for this topologyKey and check currentPods - minPods <= maxSkew;
+	//    if the diff exceeds maxSkew, add pods from the larger domains to the eviction bucket
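+	//
+	// For example, with maxSkew=1 and a "zone" topology whose two domains hold [3, 1]
+	// matching pods, the ideal average is 2 and the skew is 2; evicting one pod from the
+	// larger domain (to be rescheduled by kube-scheduler) brings the spread to [2, 2].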
+
+	// First record all of the constraints by namespace
+	namespaces, err := client.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		klog.ErrorS(err, "Couldn't list namespaces")
+		return
+	}
+	podsForEviction := make(map[*v1.Pod]struct{})
+	// 1. for each namespace...
+	for _, namespace := range namespaces.Items {
+		// skip namespaces filtered out by the include/exclude lists
+		if (includedNamespaces.Len() > 0 && !includedNamespaces.Has(namespace.Name)) || excludedNamespaces.Has(namespace.Name) {
+			continue
+		}
+		namespacePods, err := client.CoreV1().Pods(namespace.Name).List(ctx, metav1.ListOptions{})
+		if err != nil {
+			klog.ErrorS(err, "Couldn't list pods in namespace", "namespace", namespace)
+			continue
+		}
+
+		// ...where there is a topology constraint
+		namespaceTopologySpreadConstraints := make(map[v1.TopologySpreadConstraint]struct{})
+		for _, pod := range namespacePods.Items {
+			for _, constraint := range pod.Spec.TopologySpreadConstraints {
+				// Only deal with hard topology constraints
+				// TODO(@damemi): add support for soft constraints
+				if constraint.WhenUnsatisfiable != v1.DoNotSchedule {
+					continue
+				}
+				namespaceTopologySpreadConstraints[constraint] = struct{}{}
+			}
+		}
+		if len(namespaceTopologySpreadConstraints) == 0 {
+			continue
+		}
+
+		// 2. for each topologySpreadConstraint in that namespace
+		for constraint := range namespaceTopologySpreadConstraints {
+			constraintTopologies := make(map[topologyPair][]*v1.Pod)
+			// pre-populate the topologyPair map with all the topologies available from the nodeMap
+			// (we can't just build it from existing pods' nodes because a topology may have 0 pods)
+			for _, node := range nodeMap {
+				if val, ok := node.Labels[constraint.TopologyKey]; ok {
+					constraintTopologies[topologyPair{key: constraint.TopologyKey, value: val}] = make([]*v1.Pod, 0)
+				}
+			}
+
+			selector, err := metav1.LabelSelectorAsSelector(constraint.LabelSelector)
+			if err != nil {
+				klog.ErrorS(err, "Couldn't parse label selector as selector", "selector", constraint.LabelSelector)
+				continue
+			}
+
+			// 3. for each evictable pod in that namespace
+			// (this loop is where we count the number of pods per topology value that match this constraint's selector)
+			var sumPods float64
+			for i := range namespacePods.Items {
+				// 4. if the pod matches this TopologySpreadConstraint's LabelSelector
+				if !selector.Matches(labels.Set(namespacePods.Items[i].Labels)) {
+					continue
+				}
+
+				// 5. if the pod's node carries this constraint's topologyKey, create a topoPair and add the pod
+				node, ok := nodeMap[namespacePods.Items[i].Spec.NodeName]
+				if !ok {
+					// The pod is not scheduled onto any node in nodeMap (node would be nil and
+					// node.Labels would panic), so it is safe to skip it here.
+					continue
+				}
+				nodeValue, ok := node.Labels[constraint.TopologyKey]
+				if !ok {
+					continue
+				}
+				// 6. create a topoPair keyed by this TopologySpreadConstraint's topologyKey
+				topoPair := topologyPair{key: constraint.TopologyKey, value: nodeValue}
+				// 7. add the pod under this topoPair
+				constraintTopologies[topoPair] = append(constraintTopologies[topoPair], &namespacePods.Items[i])
+				sumPods++
+			}
+			balanceDomains(podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodeMap)
+		}
+	}
+
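+	// Finally, evict the pods collected above; nodeMap gives the node each pod currently runs on.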
+	for pod := range podsForEviction {
+		if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[pod.Spec.NodeName], "PodTopologySpread"); err != nil && !evictable.IsEvictable(pod) {
+			klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
+			break
+		}
+	}
+}
+
+// balanceDomains determines the minimum number of pods that should be evicted from the large domains
+// to achieve an ideal balance within maxSkew.
+// To determine how many pods need to be moved, we first sort the topology domains in ascending size, e.g.
+// [2, 5, 3, 8, 5, 7]
+//
+// would end up as:
+// [2, 3, 5, 5, 7, 8]
+//
+// We then start at i=[0] and j=[len(list)-1] and compare the 2 domain sizes.
+// If the difference between the two domains is more than maxSkew, we will move up to half of that excess,
+// or the pods the larger domain can spare without falling below the average, or the number required to bring
+// the smaller domain up to the average, whichever is least.
+//
+// (Note, we will only move as many pods from a domain as possible without bringing it below the ideal average,
+// and we will not bring any smaller domain above the average.)
+// If the difference is within maxSkew, we advance i to the next larger of the small domains.
+// If the larger domain can't give any more without falling below the average, we step j down to the next smaller of the large domains.
+//
+// Following this, the above topology domains end up "sorted" as:
+// [5, 5, 5, 5, 5, 5]
+// (assuming the scheduler spreads the evicted pods evenly when they are recreated)
+func balanceDomains(
+	podsForEviction map[*v1.Pod]struct{},
+	constraint v1.TopologySpreadConstraint,
+	constraintTopologies map[topologyPair][]*v1.Pod,
+	sumPods float64,
+	isEvictable func(*v1.Pod) bool,
+	nodeMap map[string]*v1.Node) {
+	idealAvg := sumPods / float64(len(constraintTopologies))
+	sortedDomains := sortDomains(constraintTopologies, isEvictable)
+	// i is the index for belowOrEqualAvg
+	// j is the index for aboveAvg
+	i := 0
+	j := len(sortedDomains) - 1
+	for i < j {
+		// if j has no more to give without falling below the ideal average, move to the next aboveAvg
+		if float64(len(sortedDomains[j].pods)) < idealAvg {
+			j--
+		}
+
+		// skew = actual difference between the two domains
+		skew := float64(len(sortedDomains[j].pods) - len(sortedDomains[i].pods))
+
+		// if i and j are within maxSkew of each other, move to the next belowOrEqualAvg
+		if int32(skew) <= constraint.MaxSkew {
+			i++
+			continue
+		}
+
+		// the most that can be given from aboveAvg is:
+		// 1. half of the skew beyond maxSkew, rounded up,
+		// 2. how many pods the larger domain has remaining without falling below the average, rounded up, or
+		// 3. how many pods can be added without bringing the smaller domain above the average, rounded up,
+		// whichever is least.
+		// (This is the basic principle of keeping all sizes within ~skew of the average.)
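+		// For example, domains of [1, 4] with maxSkew=1 give idealAvg=2.5 and skew=3, so
+		// aboveAvg=ceil(4-2.5)=2, belowAvg=ceil(2.5-1)=2, halfSkew=ceil((3-1)/2)=1 and movePods=1.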
+		aboveAvg := math.Ceil(float64(len(sortedDomains[j].pods)) - idealAvg)
+		belowAvg := math.Ceil(idealAvg - float64(len(sortedDomains[i].pods)))
+		smallestDiff := math.Min(aboveAvg, belowAvg)
+		halfSkew := math.Ceil((skew - float64(constraint.MaxSkew)) / 2)
+		movePods := int(math.Min(smallestDiff, halfSkew))
+		if movePods <= 0 {
+			i++
+			continue
+		}
+
+		// remove pods from the higher topology and add them to the list of pods to be evicted
+		// also (just for tracking), add them to the list of pods in the lower topology
+		aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:]
+		for k := range aboveToEvict {
+			// if the pod has a hard nodeAffinity or a nodeSelector that only matches this node,
+			// don't bother evicting it as it will just end up back on the same node;
+			// however we still account for it "being evicted" so the algorithm can complete
+			// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
+			// but still try to get the required # of movePods (instead of just chopping that value off the slice above)
+			if aboveToEvict[k].Spec.NodeSelector != nil ||
+				(aboveToEvict[k].Spec.Affinity != nil &&
+					aboveToEvict[k].Spec.Affinity.NodeAffinity != nil &&
+					aboveToEvict[k].Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil &&
+					nodesPodFitsOnBesidesCurrent(aboveToEvict[k], nodeMap) == 0) {
+				continue
+			}
+			podsForEviction[aboveToEvict[k]] = struct{}{}
+		}
+		sortedDomains[j].pods = sortedDomains[j].pods[:len(sortedDomains[j].pods)-movePods]
+		sortedDomains[i].pods = append(sortedDomains[i].pods, aboveToEvict...)
+	}
+}
+
+// nodesPodFitsOnBesidesCurrent counts the number of nodes this pod could fit on based on its affinity.
+// It excludes the current node because, for the sake of domain balancing, we only care whether there is any other
+// place the pod could theoretically fit.
+// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity and irrelevant to topology spreading.
+func nodesPodFitsOnBesidesCurrent(pod *v1.Pod, nodeMap map[string]*v1.Node) int {
+	count := 0
+	for _, node := range nodeMap {
+		if nodeutil.PodFitsCurrentNode(pod, node) && node != nodeMap[pod.Spec.NodeName] {
+			count++
+		}
+	}
+	return count
+}
+
+// sortDomains sorts the list of topology domains in ascending order of size.
+// It also sorts the list of pods within each domain so that, reading from the front of the list, we have:
+// 1. non-evictable pods
+// 2. pods with node selectors or affinity, in descending priority order
+// 3. all other pods, in descending priority order
+// We then pop pods off the back of the list for eviction
+func sortDomains(constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(*v1.Pod) bool) []topology {
+	sortedTopologies := make([]topology, 0, len(constraintTopologyPairs))
+	for pair, list := range constraintTopologyPairs {
+		// Sort the pods within the domain so that the lowest priority pods without selectors or affinity
+		// are considered first for eviction, followed by their higher priority peers,
+		// followed by the lowest priority pods with affinity or a nodeSelector,
+		// followed by the highest priority pods with affinity or a nodeSelector
+		sort.Slice(list, func(i, j int) bool {
+			// any non-evictable pods should be considered last for eviction (ie, first in the list)
+			if !isEvictable(list[i]) || !isEvictable(list[j]) {
+				// if i is non-evictable, return true to keep it in front of j;
+				// if only j is non-evictable, return false so that j sorts before i
+				// (if both are non-evictable, their relative order doesn't matter)
+				return !(isEvictable(list[i]) && !isEvictable(list[j]))
+			}
+
+			// if both pods have (or both lack) selectors/affinity, compare them by their priority
+			if hasSelectorOrAffinity(*list[i]) == hasSelectorOrAffinity(*list[j]) {
+				return comparePodsByPriority(list[i], list[j])
+			}
+			return hasSelectorOrAffinity(*list[i]) && !hasSelectorOrAffinity(*list[j])
+		})
+		sortedTopologies = append(sortedTopologies, topology{pair: pair, pods: list})
+	}
+
+	// create an ascending slice of all key-value topology pairs
+	sort.Slice(sortedTopologies, func(i, j int) bool {
+		return len(sortedTopologies[i].pods) < len(sortedTopologies[j].pods)
+	})
+
+	return sortedTopologies
+}
+
+func hasSelectorOrAffinity(pod v1.Pod) bool {
+	return pod.Spec.NodeSelector != nil || (pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil)
+}
+
+// comparePodsByPriority is a helper to the sort function to compare 2 pods based on their priority values.
+// It sorts the pods in DESCENDING order of priority, since in our logic we evict pods from the back
+// of the list first.
+func comparePodsByPriority(iPod, jPod *v1.Pod) bool {
+	if iPod.Spec.Priority != nil && jPod.Spec.Priority != nil {
+		// a LOWER priority value should be evicted FIRST
+		return *iPod.Spec.Priority > *jPod.Spec.Priority
+	} else if iPod.Spec.Priority != nil && jPod.Spec.Priority == nil {
+		// i should come before j
+		return true
+	} else if iPod.Spec.Priority == nil && jPod.Spec.Priority != nil {
+		// j should come before i
+		return false
+	} else {
+		// it doesn't matter. 
just return true + return true + } +} diff --git a/pkg/descheduler/strategies/topologyspreadconstraint_test.go b/pkg/descheduler/strategies/topologyspreadconstraint_test.go new file mode 100644 index 000000000..93057a230 --- /dev/null +++ b/pkg/descheduler/strategies/topologyspreadconstraint_test.go @@ -0,0 +1,340 @@ +package strategies + +import ( + "context" + "fmt" + "sigs.k8s.io/descheduler/pkg/api" + "testing" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" + core "k8s.io/client-go/testing" + "sigs.k8s.io/descheduler/pkg/descheduler/evictions" + "sigs.k8s.io/descheduler/test" +) + +func TestTopologySpreadConstraint(t *testing.T) { + ctx := context.Background() + testCases := []struct { + name string + pods []*v1.Pod + expectedEvictedCount int + nodes []*v1.Node + strategy api.DeschedulerStrategy + namespaces []string + }{ + { + name: "2 domains, sizes [3,1], maxSkew=1, move 1 pod to achieve [2,2]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 2, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + { + count: 1, + node: "n2", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 1, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "2 domains, sizes [5,2], maxSkew=1, move 1 pod to achieve [4,3]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 4, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + { + count: 2, + node: "n2", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 1, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "2 domains, sizes [4,0], maxSkew=1, move 2 pods to achieve [2,2]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 3, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 2, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "2 
domains, sizes [4,0], maxSkew=1, only move 1 pod since pods with nodeSelector and nodeAffinity aren't evicted", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + nodeSelector: map[string]string{"zone": "zoneA"}, + }, + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + nodeSelector: map[string]string{"zone": "zoneA"}, + }, + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + nodeAffinity: &v1.Affinity{NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{NodeSelectorTerms: []v1.NodeSelectorTerm{ + {MatchExpressions: []v1.NodeSelectorRequirement{{Key: "foo", Values: []string{"bar"}, Operator: v1.NodeSelectorOpIn}}}, + }}, + }}, + }, + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 1, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "3 domains, sizes [0, 1, 100], maxSkew=1, move 66 pods to get [34, 33, 34]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n2", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 100, + node: "n3", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 66, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "4 domains, sizes [0, 1, 3, 5], should move 3 to get [2, 2, 3, 2]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }), + test.BuildTestNode("n4", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneD" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n2", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 3, + node: "n3", + labels: map[string]string{"foo": "bar"}, + }, + { + count: 5, + node: "n4", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 3, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + { + name: "2 domains size [2 6], maxSkew=2, should move 1 to get [3 5]", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, 
func(n *v1.Node) { n.Labels["zone"] = "zoneA" }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 2, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + { + count: 6, + node: "n2", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 1, + strategy: api.DeschedulerStrategy{}, + namespaces: []string{"ns1"}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + fakeClient := &fake.Clientset{} + fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { + podList := make([]v1.Pod, 0, len(tc.pods)) + for _, pod := range tc.pods { + podList = append(podList, *pod) + } + return true, &v1.PodList{Items: podList}, nil + }) + fakeClient.Fake.AddReactor("list", "namespaces", func(action core.Action) (bool, runtime.Object, error) { + return true, &v1.NamespaceList{Items: []v1.Namespace{{ObjectMeta: metav1.ObjectMeta{Name: "ns1", Namespace: "ns1"}}}}, nil + }) + + podEvictor := evictions.NewPodEvictor( + fakeClient, + "v1", + false, + 100, + tc.nodes, + false, + ) + RemovePodsViolatingTopologySpreadConstraint(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor) + podsEvicted := podEvictor.TotalEvicted() + if podsEvicted != tc.expectedEvictedCount { + t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", tc.name, tc.expectedEvictedCount, podsEvicted) + } + }) + } +} + +type testPodList struct { + count int + node string + labels map[string]string + constraints []v1.TopologySpreadConstraint + nodeSelector map[string]string + nodeAffinity *v1.Affinity +} + +func createTestPods(testPods []testPodList) []*v1.Pod { + ownerRef1 := test.GetReplicaSetOwnerRefList() + pods := make([]*v1.Pod, 0) + podNum := 0 + for _, tp := range testPods { + for i := 0; i < tp.count; i++ { + pods = append(pods, + test.BuildTestPod(fmt.Sprintf("pod-%d", podNum), 100, 0, tp.node, func(p *v1.Pod) { + p.Labels = make(map[string]string) + p.Labels = tp.labels + p.Namespace = "ns1" + p.ObjectMeta.OwnerReferences = ownerRef1 + p.Spec.TopologySpreadConstraints = tp.constraints + p.Spec.NodeSelector = tp.nodeSelector + p.Spec.Affinity = tp.nodeAffinity + })) + podNum++ + } + } + return pods +}