diff --git a/pkg/descheduler/strategies/lownodeutilization.go b/pkg/descheduler/strategies/lownodeutilization.go
index 10aa2dd36..a840171c8 100644
--- a/pkg/descheduler/strategies/lownodeutilization.go
+++ b/pkg/descheduler/strategies/lownodeutilization.go
@@ -41,6 +41,7 @@ type NodeUsageMap struct {
 	bPods []*v1.Pod
 	gPods []*v1.Pod
 }
+
 type NodePodsMap map[*v1.Node][]*v1.Pod
 
 func LowNodeUtilization(ds *options.DeschedulerServer, strategy api.DeschedulerStrategy, evictionPolicyGroupVersion string, nodes []*v1.Node, nodepodCount nodePodEvictedCount) {
@@ -59,7 +60,7 @@ func LowNodeUtilization(ds *options.DeschedulerServer, strategy api.DeschedulerS
 		return
 	}
 
-	npm := CreateNodePodsMap(ds.Client, nodes)
+	npm := createNodePodsMap(ds.Client, nodes)
 	lowNodes, targetNodes := classifyNodes(npm, thresholds, targetThresholds)
 
 	glog.V(1).Infof("Criteria for a node under utilization: CPU: %v, Mem: %v, Pods: %v",
@@ -151,6 +152,9 @@ func classifyNodes(npm NodePodsMap, thresholds api.ResourceThresholds, targetThr
 	return lowNodes, targetNodes
 }
 
+// evictPodsFromTargetNodes evicts pods based on priority if all pods on the node have a priority set;
+// otherwise it falls back to evicting them based on QoS class.
+// TODO: @ravig Break this function into smaller functions.
 func evictPodsFromTargetNodes(client clientset.Interface, evictionPolicyGroupVersion string, targetNodes, lowNodes []NodeUsageMap, targetThresholds api.ResourceThresholds, dryRun bool, maxPodsToEvict int, nodepodCount nodePodEvictedCount) int {
 	podsEvicted := 0
 
@@ -191,12 +195,27 @@ func evictPodsFromTargetNodes(client clientset.Interface, evictionPolicyGroupVer
 		glog.V(3).Infof("evicting pods from node %#v with usage: %#v", node.node.Name, node.usage)
 
 		currentPodsEvicted := nodepodCount[node.node]
-		// evict best effort pods
-		evictPods(node.bePods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCpu, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict)
-		// evict burstable pods
-		evictPods(node.bPods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCpu, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict)
-		// evict guaranteed pods
-		evictPods(node.gPods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCpu, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict)
+		// If one pod on the node has a priority set, assume all pods do and evict based on priority.
+		if node.allPods[0].Spec.Priority != nil {
+			glog.V(1).Infof("All pods have priority associated with them. Evicting pods based on priority")
+			evictablePods := make([]*v1.Pod, 0)
+			evictablePods = append(append(node.bPods, node.bePods...), node.gPods...)
+
+			// Sort the evictable pods based on priority; pods with equal priority are ordered by QoS tier.
+			sortPodsBasedOnPriority(evictablePods)
+			evictPods(evictablePods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCpu, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict)
+		} else {
+			// TODO: Remove this when we support only priority.
+			// Falling back to evicting pods based on QoS class.
+ glog.V(1).Infof("Evicting pods based on QoS") + glog.V(1).Infof("There are %v non-evictable pods on the node", len(node.nonRemovablePods)) + // evict best effort pods + evictPods(node.bePods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCpu, &totalMem, ¤tPodsEvicted, dryRun, maxPodsToEvict) + // evict burstable pods + evictPods(node.bPods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCpu, &totalMem, ¤tPodsEvicted, dryRun, maxPodsToEvict) + // evict guaranteed pods + evictPods(node.gPods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCpu, &totalMem, ¤tPodsEvicted, dryRun, maxPodsToEvict) + } nodepodCount[node.node] = currentPodsEvicted podsEvicted = podsEvicted + nodepodCount[node.node] glog.V(1).Infof("%v pods evicted from node %#v with usage %v", nodepodCount[node.node], node.node.Name, node.usage) @@ -269,7 +288,30 @@ func SortNodesByUsage(nodes []NodeUsageMap) { }) } -func CreateNodePodsMap(client clientset.Interface, nodes []*v1.Node) NodePodsMap { +// sortPodsBasedOnPriority sorts pods based on priority and if their priorities are equal, they are sorted based on QoS tiers. +func sortPodsBasedOnPriority(evictablePods []*v1.Pod) { + sort.Slice(evictablePods, func(i, j int) bool { + if evictablePods[i].Spec.Priority == nil && evictablePods[j].Spec.Priority != nil { + return true + } + if evictablePods[j].Spec.Priority == nil && evictablePods[i].Spec.Priority != nil { + return false + } + if (evictablePods[j].Spec.Priority == nil && evictablePods[i].Spec.Priority == nil) || (*evictablePods[i].Spec.Priority == *evictablePods[j].Spec.Priority) { + if podutil.IsBestEffortPod(evictablePods[i]) { + return true + } + if podutil.IsBurstablePod(evictablePods[i]) && podutil.IsGuaranteedPod(evictablePods[j]) { + return true + } + return false + } + return *evictablePods[i].Spec.Priority < *evictablePods[j].Spec.Priority + }) +} + +// createNodePodsMap returns nodepodsmap with evictable pods on node. +func createNodePodsMap(client clientset.Interface, nodes []*v1.Node) NodePodsMap { npm := NodePodsMap{} for _, node := range nodes { pods, err := podutil.ListPodsOnANode(client, node) @@ -308,6 +350,7 @@ func IsNodeWithLowUtilization(nodeThresholds api.ResourceThresholds, thresholds return true } +// Nodeutilization returns the current usage of node. func NodeUtilization(node *v1.Node, pods []*v1.Pod) (api.ResourceThresholds, []*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod) { bePods := []*v1.Pod{} nonRemovablePods := []*v1.Pod{} diff --git a/pkg/descheduler/strategies/lownodeutilization_test.go b/pkg/descheduler/strategies/lownodeutilization_test.go index 098f671b2..c33a0653e 100644 --- a/pkg/descheduler/strategies/lownodeutilization_test.go +++ b/pkg/descheduler/strategies/lownodeutilization_test.go @@ -28,10 +28,11 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/fake" core "k8s.io/client-go/testing" + "reflect" ) // TODO: Make this table driven. 
-func TestLowNodeUtilization(t *testing.T) {
+func TestLowNodeUtilizationWithoutPriority(t *testing.T) {
 	var thresholds = make(api.ResourceThresholds)
 	var targetThresholds = make(api.ResourceThresholds)
 	thresholds[v1.ResourceCPU] = 30
@@ -110,7 +111,7 @@ func TestLowNodeUtilization(t *testing.T) {
 		return true, nil, fmt.Errorf("Wrong node: %v", getAction.GetName())
 	})
 	expectedPodsEvicted := 3
-	npm := CreateNodePodsMap(fakeClient, []*v1.Node{n1, n2, n3})
+	npm := createNodePodsMap(fakeClient, []*v1.Node{n1, n2, n3})
 	lowNodes, targetNodes := classifyNodes(npm, thresholds, targetThresholds)
 	if len(lowNodes) != 1 {
 		t.Errorf("After ignoring unschedulable nodes, expected only one node to be under utilized.")
@@ -126,6 +127,154 @@ func TestLowNodeUtilization(t *testing.T) {
 
 }
 
+// TODO: Make this table driven.
+func TestLowNodeUtilizationWithPriorities(t *testing.T) {
+	var thresholds = make(api.ResourceThresholds)
+	var targetThresholds = make(api.ResourceThresholds)
+	thresholds[v1.ResourceCPU] = 30
+	thresholds[v1.ResourcePods] = 30
+	targetThresholds[v1.ResourceCPU] = 50
+	targetThresholds[v1.ResourcePods] = 50
+	lowPriority := int32(0)
+	highPriority := int32(10000)
+	n1 := test.BuildTestNode("n1", 4000, 3000, 9)
+	n2 := test.BuildTestNode("n2", 4000, 3000, 10)
+	n3 := test.BuildTestNode("n3", 4000, 3000, 10)
+	// Make node n3 unschedulable so that it won't be counted in the list of underutilized nodes.
+	n3.Spec.Unschedulable = true
+	p1 := test.BuildTestPod("p1", 400, 0, n1.Name)
+	p1.Spec.Priority = &highPriority
+	p2 := test.BuildTestPod("p2", 400, 0, n1.Name)
+	p2.Spec.Priority = &highPriority
+	p3 := test.BuildTestPod("p3", 400, 0, n1.Name)
+	p3.Spec.Priority = &highPriority
+	p4 := test.BuildTestPod("p4", 400, 0, n1.Name)
+	p4.Spec.Priority = &highPriority
+	p5 := test.BuildTestPod("p5", 400, 0, n1.Name)
+	p5.Spec.Priority = &lowPriority
+
+	// These won't be evicted.
+	p6 := test.BuildTestPod("p6", 400, 0, n1.Name)
+	p6.Spec.Priority = &highPriority
+	p7 := test.BuildTestPod("p7", 400, 0, n1.Name)
+	p7.Spec.Priority = &lowPriority
+	p8 := test.BuildTestPod("p8", 400, 0, n1.Name)
+	p8.Spec.Priority = &lowPriority
+
+	p1.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
+	p2.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
+	p3.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
+	p4.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
+	p5.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
+	// The following 4 pods won't get evicted.
+	// A daemonset.
+	p6.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList()
+	// A pod with local storage.
+	p7.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList()
+	p7.Spec.Volumes = []v1.Volume{
+		{
+			Name: "sample",
+			VolumeSource: v1.VolumeSource{
+				HostPath: &v1.HostPathVolumeSource{Path: "somePath"},
+				EmptyDir: &v1.EmptyDirVolumeSource{
+					SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)},
+			},
+		},
+	}
+	// A Mirror Pod.
+	p7.Annotations = test.GetMirrorPodAnnotation()
+	// A Critical Pod.
+ p8.Namespace = "kube-system" + p8.Annotations = test.GetCriticalPodAnnotation() + p9 := test.BuildTestPod("p9", 400, 0, n1.Name) + p9.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList() + fakeClient := &fake.Clientset{} + fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { + list := action.(core.ListAction) + fieldString := list.GetListRestrictions().Fields.String() + if strings.Contains(fieldString, "n1") { + return true, &v1.PodList{Items: []v1.Pod{*p1, *p2, *p3, *p4, *p5, *p6, *p7, *p8}}, nil + } + if strings.Contains(fieldString, "n2") { + return true, &v1.PodList{Items: []v1.Pod{*p9}}, nil + } + if strings.Contains(fieldString, "n3") { + return true, &v1.PodList{Items: []v1.Pod{}}, nil + } + return true, nil, fmt.Errorf("Failed to list: %v", list) + }) + fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) { + getAction := action.(core.GetAction) + switch getAction.GetName() { + case n1.Name: + return true, n1, nil + case n2.Name: + return true, n2, nil + case n3.Name: + return true, n3, nil + } + return true, nil, fmt.Errorf("Wrong node: %v", getAction.GetName()) + }) + expectedPodsEvicted := 3 + npm := createNodePodsMap(fakeClient, []*v1.Node{n1, n2, n3}) + lowNodes, targetNodes := classifyNodes(npm, thresholds, targetThresholds) + if len(lowNodes) != 1 { + t.Errorf("After ignoring unschedulable nodes, expected only one node to be under utilized.") + } + npe := nodePodEvictedCount{} + npe[n1] = 0 + npe[n2] = 0 + npe[n3] = 0 + podsEvicted := evictPodsFromTargetNodes(fakeClient, "v1", targetNodes, lowNodes, targetThresholds, false, 3, npe) + if expectedPodsEvicted != podsEvicted { + t.Errorf("Expected %#v pods to be evicted but %#v got evicted", expectedPodsEvicted, podsEvicted) + } + +} + +func TestSortPodsByPriority(t *testing.T) { + n1 := test.BuildTestNode("n1", 4000, 3000, 9) + lowPriority := int32(0) + highPriority := int32(10000) + p1 := test.BuildTestPod("p1", 400, 0, n1.Name) + p1.Spec.Priority = &lowPriority + + // BestEffort + p2 := test.BuildTestPod("p2", 400, 0, n1.Name) + p2.Spec.Priority = &highPriority + + p2.Spec.Containers[0].Resources.Requests = nil + p2.Spec.Containers[0].Resources.Limits = nil + + // Burstable + p3 := test.BuildTestPod("p3", 400, 0, n1.Name) + p3.Spec.Priority = &highPriority + + // Guaranteed + p4 := test.BuildTestPod("p4", 400, 100, n1.Name) + p4.Spec.Priority = &highPriority + p4.Spec.Containers[0].Resources.Limits[v1.ResourceCPU] = *resource.NewMilliQuantity(400, resource.DecimalSI) + p4.Spec.Containers[0].Resources.Limits[v1.ResourceMemory] = *resource.NewQuantity(100, resource.DecimalSI) + + // Best effort with nil priorities. 
+ p5 := test.BuildTestPod("p5", 400, 100, n1.Name) + p5.Spec.Priority = nil + p6 := test.BuildTestPod("p6", 400, 100, n1.Name) + p6.Spec.Containers[0].Resources.Limits[v1.ResourceCPU] = *resource.NewMilliQuantity(400, resource.DecimalSI) + p6.Spec.Containers[0].Resources.Limits[v1.ResourceMemory] = *resource.NewQuantity(100, resource.DecimalSI) + p6.Spec.Priority = nil + + podList := []*v1.Pod{p4, p3, p2, p1, p6, p5} + + sortPodsBasedOnPriority(podList) + for _, pod := range podList { + fmt.Println(pod) + } + if !reflect.DeepEqual(podList[len(podList)-1], p4) { + t.Errorf("Expected last pod in sorted list to be %v which of highest priority and guaranteed but got %v", p4, podList[len(podList)-1]) + } +} + func TestValidateThresholds(t *testing.T) { tests := []struct { name string diff --git a/test/test_utils.go b/test/test_utils.go index 8ed64a792..136213aa3 100644 --- a/test/test_utils.go +++ b/test/test_utils.go @@ -37,6 +37,7 @@ func BuildTestPod(name string, cpu int64, memory int64, nodeName string) *v1.Pod { Resources: v1.ResourceRequirements{ Requests: v1.ResourceList{}, + Limits: v1.ResourceList{}, }, }, },