mirror of https://github.com/kubernetes-sigs/descheduler.git synced 2026-01-26 05:14:13 +01:00

Addressing review comments

Both the LowNode and HighNode utilization strategies evict only as many pods
as the free resources on other nodes can accommodate. Thus, the resource fit
test is always true by definition.
Jan Chaloupka
2022-04-28 12:00:28 +02:00
parent 16eb9063b6
commit 3eca2782d4
7 changed files with 70 additions and 176 deletions
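
To make the rationale concrete, here is a hedged, self-contained Go sketch (not code from this repository; the capacity and request values are invented) showing why a per-pod resource-fit check cannot fail once the strategy caps evictions by the free capacity available on the other nodes:

```go
package main

import "fmt"

func main() {
	// Assumed free CPU (in millicores) on the only other node in the cluster.
	freeMilliCPU := int64(500)
	// Assumed CPU requests of the eviction candidates, in the order the
	// strategy considers them.
	candidates := []int64{200, 150, 100, 300}

	var planned []int64
	var used int64
	for _, req := range candidates {
		if used+req > freeMilliCPU {
			// Stop selecting pods once the free capacity on the
			// other node is exhausted.
			break
		}
		planned = append(planned, req)
		used += req
	}

	// Every pod in `planned` fits on the other node by construction, so
	// re-checking resource fit for each of them always succeeds.
	fmt.Println("pods planned for eviction (mCPU):", planned, "total:", used)
}
```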


@@ -737,9 +737,9 @@ The following strategies accept a `nodeFit` boolean parameter which can optimize
If set to `true` the descheduler will consider whether or not the pods that meet eviction criteria will fit on other nodes before evicting them. If a pod cannot be rescheduled to another node, it will not be evicted. Currently the following criteria are considered when setting `nodeFit` to `true`:
- A `nodeSelector` on the pod
- Any `Tolerations` on the pod and any `Taints` on the other nodes
- Any `tolerations` on the pod and any `taints` on the other nodes
- `nodeAffinity` on the pod
- Resource `Requests` made by the pod and the resources available on other nodes
- Resource `requests` made by the pod and the resources available on other nodes
- Whether any of the other nodes are marked as `unschedulable`
E.g.
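
The hunk ends just before the README's own example policy, which is unchanged by this commit. As a separate, hedged illustration of the criteria listed above, the following self-contained Go sketch mimics three of the checks (nodeSelector, taints, unschedulable) against the core/v1 types; it is not the descheduler's NodeFit implementation, which appears in the next diff:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// fitsSketch is a hypothetical, simplified stand-in for the checks the README
// describes; resource requests and nodeAffinity are omitted to keep it short.
func fitsSketch(pod *v1.Pod, node *v1.Node) []string {
	var reasons []string

	// A `nodeSelector` on the pod must be satisfied by the node's labels.
	for k, v := range pod.Spec.NodeSelector {
		if node.Labels[k] != v {
			reasons = append(reasons, fmt.Sprintf("nodeSelector %s=%s not satisfied", k, v))
		}
	}

	// NoSchedule/NoExecute taints on the node must be tolerated by the pod.
	for i := range node.Spec.Taints {
		taint := node.Spec.Taints[i]
		if taint.Effect != v1.TaintEffectNoSchedule && taint.Effect != v1.TaintEffectNoExecute {
			continue
		}
		tolerated := false
		for j := range pod.Spec.Tolerations {
			if pod.Spec.Tolerations[j].ToleratesTaint(&taint) {
				tolerated = true
				break
			}
		}
		if !tolerated {
			reasons = append(reasons, fmt.Sprintf("taint %q not tolerated", taint.Key))
		}
	}

	// The node must not be cordoned.
	if node.Spec.Unschedulable {
		reasons = append(reasons, "node is marked unschedulable")
	}

	return reasons
}

func main() {
	pod := &v1.Pod{}
	pod.Spec.NodeSelector = map[string]string{"disktype": "ssd"}

	node := &v1.Node{}
	node.Labels = map[string]string{"disktype": "hdd"}
	node.Spec.Unschedulable = true

	fmt.Println(fitsSketch(pod, node))
	// Expected: [nodeSelector disktype=ssd not satisfied node is marked unschedulable]
}
```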


@@ -99,28 +99,27 @@ func IsReady(node *v1.Node) bool {
return true
}
// NodeFit returns true if the provided pod can probably be scheduled onto the provided node.
// NodeFit returns true if the provided pod can be scheduled onto the provided node.
// This function is used when the NodeFit pod filtering feature of the Descheduler is enabled.
// This function currently considers a subset of the Kubernetes Scheduler's predicates when
// deciding if a pod would fit on a node, but more predicates may be added in the future.
func NodeFit(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) []error {
// Check node selector and required affinity
var errors []error
ok, err := utils.PodMatchNodeSelector(pod, node)
if err != nil {
if ok, err := utils.PodMatchNodeSelector(pod, node); err != nil {
errors = append(errors, err)
} else if !ok {
errors = append(errors, fmt.Errorf("pod node selector does not match the node label"))
}
// Check taints (we only care about NoSchedule and NoExecute taints)
ok = utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool {
ok := utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool {
return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
})
if !ok {
errors = append(errors, fmt.Errorf("pod does not tolerate taints on the node"))
}
// Check if the pod can fit on a node based off it's requests
ok, reqErrors := FitsRequest(nodeIndexer, pod, node)
ok, reqErrors := fitsRequest(nodeIndexer, pod, node)
if !ok {
errors = append(errors, reqErrors...)
}
@@ -132,7 +131,7 @@ func NodeFit(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v
return errors
}
// PodFitsAnyOtherNode checks if the given pod will probably fit any of the given nodes, besides the node
// PodFitsAnyOtherNode checks if the given pod will fit any of the given nodes, besides the node
// the pod is already running on. The predicates used to determine if the pod will fit can be found in the NodeFit function.
func PodFitsAnyOtherNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) bool {
for _, node := range nodes {
@@ -155,7 +154,7 @@ func PodFitsAnyOtherNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.
return false
}
// PodFitsAnyNode checks if the given pod will probably fit any of the given nodes. The predicates used
// PodFitsAnyNode checks if the given pod will fit any of the given nodes. The predicates used
// to determine if the pod will fit can be found in the NodeFit function.
func PodFitsAnyNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nodes []*v1.Node) bool {
for _, node := range nodes {
@@ -173,7 +172,7 @@ func PodFitsAnyNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod,
return false
}
// PodFitsCurrentNode checks if the given pod will probably fit onto the given node. The predicates used
// PodFitsCurrentNode checks if the given pod will fit onto the given node. The predicates used
// to determine if the pod will fit can be found in the NodeFit function.
func PodFitsCurrentNode(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) bool {
errors := NodeFit(nodeIndexer, pod, node)
@@ -195,9 +194,9 @@ func IsNodeUnschedulable(node *v1.Node) bool {
return node.Spec.Unschedulable
}
// FitsRequest determines if a pod can fit on a node based on that pod's requests. It returns true if
// fitsRequest determines if a pod can fit on a node based on its resource requests. It returns true if
// the pod will fit.
func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) (bool, []error) {
func fitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) (bool, []error) {
var insufficientResources []error
// Get pod requests
@@ -207,7 +206,7 @@ func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nod
resourceNames = append(resourceNames, name)
}
availableResources, err := NodeAvailableResources(nodeIndexer, node, resourceNames)
availableResources, err := nodeAvailableResources(nodeIndexer, node, resourceNames)
if err != nil {
return false, []error{err}
}
@@ -215,15 +214,8 @@ func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nod
podFitsOnNode := true
for _, resource := range resourceNames {
podResourceRequest := podRequests[resource]
var requestTooLarge bool
switch resource {
case v1.ResourceCPU:
requestTooLarge = podResourceRequest.MilliValue() > availableResources[resource].MilliValue()
default:
requestTooLarge = podResourceRequest.Value() > availableResources[resource].Value()
}
if requestTooLarge {
availableResource, ok := availableResources[resource]
if !ok || podResourceRequest.MilliValue() > availableResource.MilliValue() {
insufficientResources = append(insufficientResources, fmt.Errorf("insufficient %v", resource))
podFitsOnNode = false
}
@@ -231,18 +223,34 @@ func FitsRequest(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, nod
return podFitsOnNode, insufficientResources
}
// NodeAvailableResources returns resources mapped to the quanitity available on the node.
func NodeAvailableResources(nodeIndexer podutil.GetPodsAssignedToNodeFunc, node *v1.Node, resourceNames []v1.ResourceName) (map[v1.ResourceName]*resource.Quantity, error) {
// nodeAvailableResources returns resources mapped to the quanitity available on the node.
func nodeAvailableResources(nodeIndexer podutil.GetPodsAssignedToNodeFunc, node *v1.Node, resourceNames []v1.ResourceName) (map[v1.ResourceName]*resource.Quantity, error) {
podsOnNode, err := podutil.ListPodsOnANode(node.Name, nodeIndexer, nil)
if err != nil {
return nil, err
}
aggregatePodRequests := AggregatePodRequests(podsOnNode, resourceNames)
return nodeRemainingResources(node, aggregatePodRequests, resourceNames), nil
nodeUtilization := NodeUtilization(podsOnNode, resourceNames)
remainingResources := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(node.Status.Allocatable.Cpu().MilliValue()-nodeUtilization[v1.ResourceCPU].MilliValue(), resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(node.Status.Allocatable.Memory().Value()-nodeUtilization[v1.ResourceMemory].Value(), resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(node.Status.Allocatable.Pods().Value()-nodeUtilization[v1.ResourcePods].Value(), resource.DecimalSI),
}
for _, name := range resourceNames {
if !IsBasicResource(name) {
if _, exists := node.Status.Allocatable[name]; exists {
allocatableResource := node.Status.Allocatable[name]
remainingResources[name] = resource.NewQuantity(allocatableResource.Value()-nodeUtilization[name].Value(), resource.DecimalSI)
} else {
remainingResources[name] = resource.NewQuantity(0, resource.DecimalSI)
}
}
}
return remainingResources, nil
}
// AggregatePodRequests returns the resources requested by the given pods. Only resources supplied in the resourceNames parameter are calculated.
func AggregatePodRequests(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
// NodeUtilization returns the resources requested by the given pods. Only resources supplied in the resourceNames parameter are calculated.
func NodeUtilization(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
totalReqs := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(0, resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(0, resource.BinarySI),
@@ -269,22 +277,6 @@ func AggregatePodRequests(pods []*v1.Pod, resourceNames []v1.ResourceName) map[v
return totalReqs
}
func nodeRemainingResources(node *v1.Node, aggregatePodRequests map[v1.ResourceName]*resource.Quantity, resourceNames []v1.ResourceName) map[v1.ResourceName]*resource.Quantity {
remainingResources := map[v1.ResourceName]*resource.Quantity{
v1.ResourceCPU: resource.NewMilliQuantity(node.Status.Allocatable.Cpu().MilliValue()-aggregatePodRequests[v1.ResourceCPU].MilliValue(), resource.DecimalSI),
v1.ResourceMemory: resource.NewQuantity(node.Status.Allocatable.Memory().Value()-aggregatePodRequests[v1.ResourceMemory].Value(), resource.BinarySI),
v1.ResourcePods: resource.NewQuantity(node.Status.Allocatable.Pods().Value()-aggregatePodRequests[v1.ResourcePods].Value(), resource.DecimalSI),
}
for _, name := range resourceNames {
if !IsBasicResource(name) {
allocatableResource := node.Status.Allocatable[name]
remainingResources[name] = resource.NewQuantity(allocatableResource.Value()-aggregatePodRequests[name].Value(), resource.DecimalSI)
}
}
return remainingResources
}
// IsBasicResource checks if resource is basic native.
func IsBasicResource(name v1.ResourceName) bool {
switch name {


@@ -26,7 +26,6 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
core "k8s.io/client-go/testing"
podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
"sigs.k8s.io/descheduler/test"
)
@@ -196,11 +195,6 @@ func TestPodFitsCurrentNode(t *testing.T) {
},
}
fakeClient := &fake.Clientset{}
fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
return true, &v1.PodList{Items: nil}, nil
})
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())


@@ -70,13 +70,17 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
Unschedulable: true,
}
})
node5 := test.BuildTestNode("n5", 1, 1, 1, nil)
node5 := test.BuildTestNode("n5", 2000, 3000, 10, nil)
node5.Spec.Taints = []v1.Taint{
createPreferNoScheduleTaint("testTaint", "test", 1),
}
node6 := test.BuildTestNode("n6", 1, 1, 1, nil)
node6.Spec.Taints = []v1.Taint{
createPreferNoScheduleTaint("testTaint", "test", 1),
}
p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil)
p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil)
p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil)
@@ -293,7 +297,7 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
{
description: "Critical and non critical pods, pods not tolerating node taint can't be evicted because the only available node does not have enough resources.",
pods: []*v1.Pod{p2, p7, p9, p10},
nodes: []*v1.Node{node1, node5},
nodes: []*v1.Node{node1, node6},
evictLocalStoragePods: false,
evictSystemCriticalPods: true,
expectedEvictedPodCount: 0, //p2 and p7 can't be evicted


@@ -695,112 +695,6 @@ func TestLowNodeUtilization(t *testing.T) {
expectedPodsEvicted: 2,
evictedPods: []string{},
},
{
name: "without priorities, but only other node doesn't match pod node affinity for p4 and p5",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 50,
v1.ResourcePods: 50,
},
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 500, 200, 9, nil),
test.BuildTestNode(n2NodeName, 200, 200, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeSelectorKey: notMatchingNodeSelectorValue,
}
}),
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 10, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p2", 10, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p3", 10, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
test.BuildTestPod("p4", 400, 100, n1NodeName, func(pod *v1.Pod) {
// A pod that requests too much CPU
test.SetNormalOwnerRef(pod)
}),
test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
test.SetNormalOwnerRef(pod)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
test.BuildTestPod("p6", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
test.BuildTestPod("p9", 0, 0, n2NodeName, test.SetRSOwnerRef),
},
expectedPodsEvicted: 3,
},
{
name: "without priorities, but only other node doesn't match pod node affinity for p4 and p5",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
targetThresholds: api.ResourceThresholds{
v1.ResourceCPU: 50,
v1.ResourcePods: 50,
},
nodes: []*v1.Node{
test.BuildTestNode(n1NodeName, 500, 200, 9, nil),
test.BuildTestNode(n2NodeName, 200, 200, 10, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
nodeSelectorKey: notMatchingNodeSelectorValue,
}
}),
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 10, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p2", 10, 0, n1NodeName, test.SetRSOwnerRef),
test.BuildTestPod("p3", 10, 0, n1NodeName, test.SetRSOwnerRef),
// These won't be evicted.
test.BuildTestPod("p4", 400, 100, n1NodeName, func(pod *v1.Pod) {
// A pod that requests too much CPU
test.SetNormalOwnerRef(pod)
}),
test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A pod with local storage.
test.SetNormalOwnerRef(pod)
pod.Spec.Volumes = []v1.Volume{
{
Name: "sample",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
},
},
}
// A Mirror Pod.
pod.Annotations = test.GetMirrorPodAnnotation()
}),
test.BuildTestPod("p6", 400, 0, n1NodeName, func(pod *v1.Pod) {
// A Critical Pod.
pod.Namespace = "kube-system"
priority := utils.SystemCriticalPriority
pod.Spec.Priority = &priority
}),
test.BuildTestPod("p9", 0, 0, n2NodeName, test.SetRSOwnerRef),
},
expectedPodsEvicted: 3,
},
}
for _, test := range testCases {


@@ -157,7 +157,7 @@ func getNodeUsage(
nodeUsageList = append(nodeUsageList, NodeUsage{
node: node,
usage: nodeUtilization(node, pods, resourceNames),
usage: nodeutil.NodeUtilization(pods, resourceNames),
allPods: pods,
})
}
@@ -279,7 +279,7 @@ func evictPodsFromSourceNodes(
for _, node := range sourceNodes {
klog.V(3).InfoS("Evicting pods from node", "node", klog.KObj(node.node), "usage", node.usage)
nonRemovablePods, removablePods := classifyPods(ctx, node.allPods, podFilter)
nonRemovablePods, removablePods := classifyPods(node.allPods, podFilter)
klog.V(2).InfoS("Pods on node", "node", klog.KObj(node.node), "allPods", len(node.allPods), "nonRemovablePods", len(nonRemovablePods), "removablePods", len(removablePods))
if len(removablePods) == 0 {
@@ -413,7 +413,7 @@ func getResourceNames(thresholds api.ResourceThresholds) []v1.ResourceName {
return resourceNames
}
func classifyPods(ctx context.Context, pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) {
func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) {
var nonRemovablePods, removablePods []*v1.Pod
for _, pod := range pods {
@@ -437,7 +437,7 @@ func averageNodeBasicresources(nodes []*v1.Node, getPodsAssignedToNode podutil.G
numberOfNodes--
continue
}
usage := nodeUtilization(node, pods, resourceNames)
usage := nodeutil.NodeUtilization(pods, resourceNames)
nodeCapacity := node.Status.Capacity
if len(node.Status.Allocatable) > 0 {
nodeCapacity = node.Status.Allocatable


@@ -168,7 +168,7 @@ func RemovePodsViolatingTopologySpreadConstraint(
klog.V(2).InfoS("Skipping topology constraint because it is already balanced", "constraint", constraint)
continue
}
balanceDomains(ctx, client, getPodsAssignedToNode, podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodes)
balanceDomains(client, getPodsAssignedToNode, podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodes)
}
}
@@ -223,7 +223,6 @@ func topologyIsBalanced(topology map[topologyPair][]*v1.Pod, constraint v1.Topol
// [5, 5, 5, 5, 5, 5]
// (assuming even distribution by the scheduler of the evicted pods)
func balanceDomains(
ctx context.Context,
client clientset.Interface,
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
podsForEviction map[*v1.Pod]struct{},
@@ -234,7 +233,7 @@ func balanceDomains(
nodes []*v1.Node) {
idealAvg := sumPods / float64(len(constraintTopologies))
sortedDomains := sortDomains(ctx, constraintTopologies, isEvictable)
sortedDomains := sortDomains(constraintTopologies, isEvictable)
// i is the index for belowOrEqualAvg
// j is the index for aboveAvg
i := 0
@@ -274,6 +273,17 @@ func balanceDomains(
// also (just for tracking), add them to the list of pods in the lower topology
aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:]
for k := range aboveToEvict {
// PodFitsAnyOtherNode excludes the current node because, for the sake of domain balancing only, we care about if there is any other
// place it could theoretically fit.
// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading
// Also, if the pod has a hard nodeAffinity/nodeSelector/toleration that only matches this node,
// don't bother evicting it as it will just end up back on the same node
// however we still account for it "being evicted" so the algorithm can complete
// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
// but still try to get the required # of movePods (instead of just chopping that value off the slice above).
// In other words, PTS can perform suboptimally if some of its chosen pods don't fit on other nodes.
// This is because the chosen pods aren't sorted, but immovable pods still count as "evicted" toward the PTS algorithm.
// So, a better selection heuristic could improve performance.
if !node.PodFitsAnyOtherNode(getPodsAssignedToNode, aboveToEvict[k], nodes) {
klog.V(2).InfoS("ignoring pod for eviction as it does not fit on any other node", "pod", klog.KObj(aboveToEvict[k]))
continue
@@ -293,7 +303,7 @@ func balanceDomains(
// 3. pods in descending priority
// 4. all other pods
// We then pop pods off the back of the list for eviction
func sortDomains(ctx context.Context, constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(pod *v1.Pod) bool) []topology {
func sortDomains(constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(pod *v1.Pod) bool) []topology {
sortedTopologies := make([]topology, 0, len(constraintTopologyPairs))
// sort the topologies and return 2 lists: those <= the average and those > the average (> list inverted)
for pair, list := range constraintTopologyPairs {