diff --git a/go.mod b/go.mod index 564998eb4..de23a7fcf 100644 --- a/go.mod +++ b/go.mod @@ -32,6 +32,8 @@ require ( sigs.k8s.io/yaml v1.4.0 ) +require golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 + require ( cel.dev/expr v0.18.0 // indirect github.com/BurntSushi/toml v0.3.1 // indirect @@ -98,7 +100,6 @@ require ( go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect golang.org/x/crypto v0.31.0 // indirect - golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect golang.org/x/mod v0.21.0 // indirect golang.org/x/net v0.30.0 // indirect golang.org/x/oauth2 v0.23.0 // indirect diff --git a/pkg/framework/plugins/nodeutilization/highnodeutilization.go b/pkg/framework/plugins/nodeutilization/highnodeutilization.go index e9e412150..f030a15ed 100644 --- a/pkg/framework/plugins/nodeutilization/highnodeutilization.go +++ b/pkg/framework/plugins/nodeutilization/highnodeutilization.go @@ -28,72 +28,100 @@ import ( nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" + "sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization/normalizer" frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types" ) const HighNodeUtilizationPluginName = "HighNodeUtilization" -// HighNodeUtilization evicts pods from under utilized nodes so that scheduler can schedule according to its plugin. -// Note that CPU/Memory requests are used to calculate nodes' utilization and not the actual resource usage. - -type HighNodeUtilization struct { - handle frameworktypes.Handle - args *HighNodeUtilizationArgs - podFilter func(pod *v1.Pod) bool - underutilizationCriteria []interface{} - resourceNames []v1.ResourceName - targetThresholds api.ResourceThresholds - usageClient usageClient -} - +// this lines makes sure that HighNodeUtilization implements the BalancePlugin +// interface. var _ frameworktypes.BalancePlugin = &HighNodeUtilization{} -// NewHighNodeUtilization builds plugin from its arguments while passing a handle -func NewHighNodeUtilization(args runtime.Object, handle frameworktypes.Handle) (frameworktypes.Plugin, error) { - highNodeUtilizatioArgs, ok := args.(*HighNodeUtilizationArgs) +// HighNodeUtilization evicts pods from under utilized nodes so that scheduler +// can schedule according to its plugin. Note that CPU/Memory requests are used +// to calculate nodes' utilization and not the actual resource usage. +type HighNodeUtilization struct { + handle frameworktypes.Handle + args *HighNodeUtilizationArgs + podFilter func(pod *v1.Pod) bool + criteria []any + resourceNames []v1.ResourceName + highThresholds api.ResourceThresholds + usageClient usageClient +} + +// NewHighNodeUtilization builds plugin from its arguments while passing a handle. 
+func NewHighNodeUtilization(
+	genericArgs runtime.Object, handle frameworktypes.Handle,
+) (frameworktypes.Plugin, error) {
+	args, ok := genericArgs.(*HighNodeUtilizationArgs)
 	if !ok {
-		return nil, fmt.Errorf("want args to be of type HighNodeUtilizationArgs, got %T", args)
+		return nil, fmt.Errorf(
+			"want args to be of type HighNodeUtilizationArgs, got %T",
+			genericArgs,
+		)
 	}
 
-	targetThresholds := make(api.ResourceThresholds)
-	setDefaultForThresholds(highNodeUtilizatioArgs.Thresholds, targetThresholds)
-	resourceNames := getResourceNames(targetThresholds)
-
-	underutilizationCriteria := []interface{}{
-		"CPU", highNodeUtilizatioArgs.Thresholds[v1.ResourceCPU],
-		"Mem", highNodeUtilizatioArgs.Thresholds[v1.ResourceMemory],
-		"Pods", highNodeUtilizatioArgs.Thresholds[v1.ResourcePods],
-	}
-	for name := range highNodeUtilizatioArgs.Thresholds {
-		if !nodeutil.IsBasicResource(name) {
-			underutilizationCriteria = append(underutilizationCriteria, string(name), int64(highNodeUtilizatioArgs.Thresholds[name]))
-		}
+	// this plugin worries only about a single set of thresholds, but the
+	// nodeutilization package was made to take two thresholds into
+	// account, one for low and another for high usage. here we make sure
+	// we set the high threshold to the maximum value for all resources
+	// for which we have a threshold.
+	highThresholds := make(api.ResourceThresholds)
+	for rname := range args.Thresholds {
+		highThresholds[rname] = MaxResourcePercentage
 	}
 
-	podFilter, err := podutil.NewOptions().
+	// criteria is a list of thresholds used to determine whether a node
+	// is underutilized. it is used only for logging purposes.
+	criteria := []any{}
+	for rname, rvalue := range args.Thresholds {
+		criteria = append(criteria, rname, rvalue)
+	}
+
+	podFilter, err := podutil.
+		NewOptions().
 		WithFilter(handle.Evictor().Filter).
 		BuildFilterFunc()
 	if err != nil {
 		return nil, fmt.Errorf("error initializing pod filter function: %v", err)
 	}
 
+	// resourceNames is a list of all resource names this plugin cares
+	// about: the resources for which we have a threshold plus all the
+	// resources we consider basic (cpu, memory, pods).
+	resourceNames := uniquifyResourceNames(
+		append(
+			getResourceNames(args.Thresholds),
+			v1.ResourceCPU,
+			v1.ResourceMemory,
+			v1.ResourcePods,
+		),
+	)
+
 	return &HighNodeUtilization{
-		handle:                   handle,
-		args:                     highNodeUtilizatioArgs,
-		resourceNames:            resourceNames,
-		targetThresholds:         targetThresholds,
-		underutilizationCriteria: underutilizationCriteria,
-		podFilter:                podFilter,
-		usageClient:              newRequestedUsageClient(resourceNames, handle.GetPodsAssignedToNodeFunc()),
+		handle:         handle,
+		args:           args,
+		resourceNames:  resourceNames,
+		highThresholds: highThresholds,
+		criteria:       criteria,
+		podFilter:      podFilter,
+		usageClient: newRequestedUsageClient(
+			resourceNames,
+			handle.GetPodsAssignedToNodeFunc(),
+		),
 	}, nil
 }
 
-// Name retrieves the plugin name
+// Name retrieves the plugin name.
 func (h *HighNodeUtilization) Name() string {
 	return HighNodeUtilizationPluginName
 }
 
-// Balance extension point implementation for the plugin
+// Balance holds the main logic of the plugin. It evicts pods from under
+// utilized nodes. The goal here is to concentrate pods on as few nodes as
+// possible.
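// Editor's illustrative aside, not part of the patch: a minimal sketch of the
// defaulting performed in the constructor above, using a hypothetical local
// percentage type instead of api.ResourceThresholds. For user thresholds such
// as {cpu: 20, memory: 20} the derived high thresholds come out as
// {cpu: 100, memory: 100}, i.e. a destination node only stops receiving
// evicted pods once one of its thresholded resources is fully requested.
type percentage float64

const maxPercentage percentage = 100

func deriveHighThresholds(user map[string]percentage) map[string]percentage {
	high := make(map[string]percentage, len(user))
	for name := range user {
		// every thresholded resource gets the maximum high threshold.
		high[name] = maxPercentage
	}
	return high
}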
func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *frameworktypes.Status { if err := h.usageClient.sync(ctx, nodes); err != nil { return &frameworktypes.Status{ @@ -101,21 +129,35 @@ func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fr } } + // take a picture of the current state of the nodes, everything else + // here is based on this snapshot. nodesMap, nodesUsageMap, podListMap := getNodeUsageSnapshot(nodes, h.usageClient) - nodeThresholdsMap := getStaticNodeThresholds(nodes, h.args.Thresholds, h.targetThresholds) - nodesUsageAsNodeThresholdsMap := nodeUsageToResourceThresholds(nodesUsageMap, nodesMap) + capacities := referencedResourceListForNodesCapacity(nodes) + + // node usages are not presented as percentages over the capacity. + // we need to normalize them to be able to compare them with the + // thresholds. thresholds are already provided by the user in + // percentage. + usage, thresholds := assessNodesUsagesAndStaticThresholds( + nodesUsageMap, capacities, h.args.Thresholds, h.highThresholds, + ) + + // classify nodes in two groups: underutilized and schedulable. we will + // later try to move pods from the first group to the second. nodeGroups := classifyNodeUsage( - nodesUsageAsNodeThresholdsMap, - nodeThresholdsMap, + usage, thresholds, []classifierFnc{ - // underutilized nodes + // underutilized nodes. func(nodeName string, usage, threshold api.ResourceThresholds) bool { return isNodeBelowThreshold(usage, threshold) }, - // every other node that is schedulable + // schedulable nodes. func(nodeName string, usage, threshold api.ResourceThresholds) bool { if nodeutil.IsNodeUnschedulable(nodesMap[nodeName]) { - klog.V(2).InfoS("Node is unschedulable", "node", klog.KObj(nodesMap[nodeName])) + klog.V(2).InfoS( + "Node is unschedulable", + "node", klog.KObj(nodesMap[nodeName]), + ) return false } return true @@ -123,69 +165,88 @@ func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fr }, ) - // convert groups node []NodeInfo + // the nodeplugin package works by means of NodeInfo structures. these + // structures hold a series of information about the nodes. now that + // we have classified the nodes, we can build the NodeInfo structures + // for each group. NodeInfo structs carry usage and available resources + // for each node. 
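// Editor's illustrative aside, not part of the patch: a rough stand-alone
// sketch of the classification step above. Nodes are tested against one
// predicate per group and land in the first group whose predicate accepts
// them. The names here (pct, classifier, classify) are invented for
// illustration and do not exist in the package.
type pct map[string]float64

type classifier func(node string, usage, threshold pct) bool

func classify(usage map[string]pct, thresholds map[string][]pct, fns []classifier) []map[string]pct {
	groups := make([]map[string]pct, len(fns))
	for i := range groups {
		groups[i] = map[string]pct{}
	}
	for node, u := range usage {
		for i, fn := range fns {
			if fn(node, u, thresholds[node][i]) {
				groups[i][node] = u
				break // each node ends up in at most one group
			}
		}
	}
	return groups
}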
 	nodeInfos := make([][]NodeInfo, 2)
 	category := []string{"underutilized", "overutilized"}
 	for i := range nodeGroups {
 		for nodeName := range nodeGroups[i] {
-			klog.InfoS("Node is "+category[i], "node", klog.KObj(nodesMap[nodeName]), "usage", nodesUsageMap[nodeName], "usagePercentage", resourceUsagePercentages(nodesUsageMap[nodeName], nodesMap[nodeName], true))
+			klog.InfoS(
+				"Node has been classified",
+				"category", category[i],
+				"node", klog.KObj(nodesMap[nodeName]),
+				"usage", nodesUsageMap[nodeName],
+				"usagePercentage", normalizer.Round(usage[nodeName]),
+			)
 			nodeInfos[i] = append(nodeInfos[i], NodeInfo{
 				NodeUsage: NodeUsage{
 					node:    nodesMap[nodeName],
-					usage:   nodesUsageMap[nodeName], // get back the original node usage
+					usage:   nodesUsageMap[nodeName],
 					allPods: podListMap[nodeName],
 				},
-				thresholds: NodeThresholds{
-					lowResourceThreshold:  resourceThresholdsToNodeUsage(nodeThresholdsMap[nodeName][0], nodesMap[nodeName]),
-					highResourceThreshold: resourceThresholdsToNodeUsage(nodeThresholdsMap[nodeName][1], nodesMap[nodeName]),
-				},
+				available: capNodeCapacitiesToThreshold(
+					nodesMap[nodeName],
+					thresholds[nodeName][1],
+					h.resourceNames,
+				),
 			})
 		}
 	}
 
-	sourceNodes := nodeInfos[0]
-	highNodes := nodeInfos[1]
+	lowNodes, schedulableNodes := nodeInfos[0], nodeInfos[1]
 
-	// log message in one line
-	klog.V(1).InfoS("Criteria for a node below target utilization", h.underutilizationCriteria...)
-	klog.V(1).InfoS("Number of underutilized nodes", "totalNumber", len(sourceNodes))
+	klog.V(1).InfoS("Criteria for a node below target utilization", h.criteria...)
+	klog.V(1).InfoS("Number of underutilized nodes", "totalNumber", len(lowNodes))
 
-	if len(sourceNodes) == 0 {
-		klog.V(1).InfoS("No node is underutilized, nothing to do here, you might tune your thresholds further")
+	if len(lowNodes) == 0 {
+		klog.V(1).InfoS(
+			"No node is underutilized, nothing to do here, you might tune your thresholds further",
+		)
 		return nil
 	}
-	if len(sourceNodes) <= h.args.NumberOfNodes {
-		klog.V(1).InfoS("Number of nodes underutilized is less or equal than NumberOfNodes, nothing to do here", "underutilizedNodes", len(sourceNodes), "numberOfNodes", h.args.NumberOfNodes)
+
+	if len(lowNodes) <= h.args.NumberOfNodes {
+		klog.V(1).InfoS(
+			"Number of nodes underutilized is less or equal than NumberOfNodes, nothing to do here",
+			"underutilizedNodes", len(lowNodes),
+			"numberOfNodes", h.args.NumberOfNodes,
+		)
 		return nil
 	}
-	if len(sourceNodes) == len(nodes) {
+
+	if len(lowNodes) == len(nodes) {
 		klog.V(1).InfoS("All nodes are underutilized, nothing to do here")
 		return nil
 	}
-	if len(highNodes) == 0 {
+
+	if len(schedulableNodes) == 0 {
 		klog.V(1).InfoS("No node is available to schedule the pods, nothing to do here")
 		return nil
 	}
 
-	// stop if the total available usage has dropped to zero - no more pods can be scheduled
-	continueEvictionCond := func(nodeInfo NodeInfo, totalAvailableUsage api.ReferencedResourceList) bool {
-		for name := range totalAvailableUsage {
-			if totalAvailableUsage[name].CmpInt64(0) < 1 {
+	// stops the eviction process once the total available capacity has
+	// dropped to zero - no more pods can be scheduled. eviction stops as
+	// soon as any of the available resources is exhausted.
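// Editor's illustrative aside, not part of the patch: conceptually, the
// "available" list built earlier in this function (capNodeCapacitiesToThreshold)
// caps each resource's capacity at the high threshold. That helper is not
// shown in this hunk, so the sketch below is only an assumed reading of its
// name, using plain integers in place of resource.Quantity values.
func capCapacityToThreshold(capacity map[string]int64, thresholdPct map[string]float64) map[string]int64 {
	capped := make(map[string]int64, len(capacity))
	for name, quantity := range capacity {
		p, ok := thresholdPct[name]
		if !ok {
			p = 100 // no threshold for this resource: the whole capacity counts
		}
		capped[name] = int64(float64(quantity) * p / 100.0)
	}
	return capped
}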
+	continueEvictionCond := func(_ NodeInfo, avail api.ReferencedResourceList) bool {
+		for name := range avail {
+			if avail[name].CmpInt64(0) < 1 {
 				return false
 			}
 		}
-
 		return true
 	}
 
-	// Sort the nodes by the usage in ascending order
-	sortNodesByUsage(sourceNodes, true)
+	// sorts the nodes by the usage in ascending order.
+	sortNodesByUsage(lowNodes, true)
 
 	evictPodsFromSourceNodes(
 		ctx,
 		h.args.EvictableNamespaces,
-		sourceNodes,
-		highNodes,
+		lowNodes,
+		schedulableNodes,
 		h.handle.Evictor(),
 		evictions.EvictOptions{StrategyName: HighNodeUtilizationPluginName},
 		h.podFilter,
@@ -197,27 +258,3 @@ func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fr
 
 	return nil
 }
-
-func setDefaultForThresholds(thresholds, targetThresholds api.ResourceThresholds) {
-	// check if Pods/CPU/Mem are set, if not, set them to 100
-	if _, ok := thresholds[v1.ResourcePods]; !ok {
-		thresholds[v1.ResourcePods] = MaxResourcePercentage
-	}
-	if _, ok := thresholds[v1.ResourceCPU]; !ok {
-		thresholds[v1.ResourceCPU] = MaxResourcePercentage
-	}
-	if _, ok := thresholds[v1.ResourceMemory]; !ok {
-		thresholds[v1.ResourceMemory] = MaxResourcePercentage
-	}
-
-	// Default targetThreshold resource values to 100
-	targetThresholds[v1.ResourcePods] = MaxResourcePercentage
-	targetThresholds[v1.ResourceCPU] = MaxResourcePercentage
-	targetThresholds[v1.ResourceMemory] = MaxResourcePercentage
-
-	for name := range thresholds {
-		if !nodeutil.IsBasicResource(name) {
-			targetThresholds[name] = MaxResourcePercentage
-		}
-	}
-}
diff --git a/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go b/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go
index 27a8f26a2..f436f4516 100644
--- a/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go
+++ b/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go
@@ -244,7 +244,7 @@ func TestHighNodeUtilization(t *testing.T) {
 			},
 			// All pods are assumed to be burstable (test.BuildTestNode always sets both cpu/memory resource requests to some value)
 			pods: []*v1.Pod{
-				test.BuildTestPod("p1", 400, 0, n1NodeName, func(pod *v1.Pod) {
+				test.BuildTestPod("p1", 0, 0, n1NodeName, func(pod *v1.Pod) {
 					test.SetRSOwnerRef(pod)
 					test.MakeBestEffortPod(pod)
 				}),
diff --git a/pkg/framework/plugins/nodeutilization/lownodeutilization.go b/pkg/framework/plugins/nodeutilization/lownodeutilization.go
index 970aea3ca..fcbbdf56c 100644
--- a/pkg/framework/plugins/nodeutilization/lownodeutilization.go
+++ b/pkg/framework/plugins/nodeutilization/lownodeutilization.go
@@ -28,122 +28,121 @@ import (
 	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
 	nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
 	podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod"
+	"sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization/normalizer"
 	frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types"
 )
 
 const LowNodeUtilizationPluginName = "LowNodeUtilization"
 
-// LowNodeUtilization evicts pods from overutilized nodes to underutilized nodes. Note that CPU/Memory requests are used
-// to calculate nodes' utilization and not the actual resource usage.
-
-type LowNodeUtilization struct {
-	handle                   frameworktypes.Handle
-	args                     *LowNodeUtilizationArgs
-	podFilter                func(pod *v1.Pod) bool
-	underutilizationCriteria []interface{}
-	overutilizationCriteria  []interface{}
-	resourceNames            []v1.ResourceName
-	usageClient              usageClient
-}
-
+// this line ensures that LowNodeUtilization implements the BalancePlugin
+// interface.
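// Editor's illustrative aside, not part of the patch: the continueEvictionCond
// defined above (in HighNodeUtilization.Balance) halts the eviction loop as
// soon as any tracked resource has no headroom left. A simplified stand-alone
// version over plain integers, mirroring the CmpInt64(0) < 1 check:
func canContinueEvicting(available map[string]int64) bool {
	for name := range available {
		if available[name] <= 0 {
			return false // one exhausted resource is enough to stop
		}
	}
	return true
}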
var _ frameworktypes.BalancePlugin = &LowNodeUtilization{} -// NewLowNodeUtilization builds plugin from its arguments while passing a handle -func NewLowNodeUtilization(args runtime.Object, handle frameworktypes.Handle) (frameworktypes.Plugin, error) { - lowNodeUtilizationArgsArgs, ok := args.(*LowNodeUtilizationArgs) +// LowNodeUtilization evicts pods from overutilized nodes to underutilized +// nodes. Note that CPU/Memory requests are used to calculate nodes' +// utilization and not the actual resource usage. +type LowNodeUtilization struct { + handle frameworktypes.Handle + args *LowNodeUtilizationArgs + podFilter func(pod *v1.Pod) bool + underCriteria []any + overCriteria []any + resourceNames []v1.ResourceName + extendedResourceNames []v1.ResourceName + usageClient usageClient +} + +// NewLowNodeUtilization builds plugin from its arguments while passing a +// handle. this plugin aims to move workload from overutilized nodes to +// underutilized nodes. +func NewLowNodeUtilization( + genericArgs runtime.Object, handle frameworktypes.Handle, +) (frameworktypes.Plugin, error) { + args, ok := genericArgs.(*LowNodeUtilizationArgs) if !ok { - return nil, fmt.Errorf("want args to be of type LowNodeUtilizationArgs, got %T", args) + return nil, fmt.Errorf( + "want args to be of type LowNodeUtilizationArgs, got %T", + genericArgs, + ) } - metricsUtilization := lowNodeUtilizationArgsArgs.MetricsUtilization - if metricsUtilization != nil && metricsUtilization.Source == api.PrometheusMetrics { - if metricsUtilization.Prometheus != nil && metricsUtilization.Prometheus.Query != "" { - uResourceNames := getResourceNames(lowNodeUtilizationArgsArgs.Thresholds) - oResourceNames := getResourceNames(lowNodeUtilizationArgsArgs.TargetThresholds) - if len(uResourceNames) != 1 || uResourceNames[0] != MetricResource { - return nil, fmt.Errorf("thresholds are expected to specify a single instance of %q resource, got %v instead", MetricResource, uResourceNames) - } - if len(oResourceNames) != 1 || oResourceNames[0] != MetricResource { - return nil, fmt.Errorf("targetThresholds are expected to specify a single instance of %q resource, got %v instead", MetricResource, oResourceNames) - } - } else { - return nil, fmt.Errorf("prometheus query is missing") + // resourceNames holds a list of resources for which the user has + // provided thresholds for. extendedResourceNames holds those as well + // as cpu, memory and pods if no prometheus collection is used. + resourceNames := getResourceNames(args.Thresholds) + extendedResourceNames := resourceNames + + // if we are using prometheus we need to validate we have everything we + // need. if we aren't then we need to make sure we are also collecting + // data for cpu, memory and pods. 
+ metrics := args.MetricsUtilization + if metrics != nil && metrics.Source == api.PrometheusMetrics { + if err := validatePrometheusMetricsUtilization(args); err != nil { + return nil, err } } else { - setDefaultForLNUThresholds(lowNodeUtilizationArgsArgs.Thresholds, lowNodeUtilizationArgsArgs.TargetThresholds, lowNodeUtilizationArgsArgs.UseDeviationThresholds) + extendedResourceNames = uniquifyResourceNames( + append( + resourceNames, + v1.ResourceCPU, + v1.ResourceMemory, + v1.ResourcePods, + ), + ) } - underutilizationCriteria := []interface{}{ - "CPU", lowNodeUtilizationArgsArgs.Thresholds[v1.ResourceCPU], - "Mem", lowNodeUtilizationArgsArgs.Thresholds[v1.ResourceMemory], - "Pods", lowNodeUtilizationArgsArgs.Thresholds[v1.ResourcePods], + // underCriteria and overCriteria are slices used for logging purposes. + // we assemble them only once. + underCriteria, overCriteria := []any{}, []any{} + for name := range args.Thresholds { + underCriteria = append(underCriteria, name, args.Thresholds[name]) } - for name := range lowNodeUtilizationArgsArgs.Thresholds { - if !nodeutil.IsBasicResource(name) { - underutilizationCriteria = append(underutilizationCriteria, string(name), int64(lowNodeUtilizationArgsArgs.Thresholds[name])) - } + for name := range args.TargetThresholds { + overCriteria = append(overCriteria, name, args.TargetThresholds[name]) } - overutilizationCriteria := []interface{}{ - "CPU", lowNodeUtilizationArgsArgs.TargetThresholds[v1.ResourceCPU], - "Mem", lowNodeUtilizationArgsArgs.TargetThresholds[v1.ResourceMemory], - "Pods", lowNodeUtilizationArgsArgs.TargetThresholds[v1.ResourcePods], - } - for name := range lowNodeUtilizationArgsArgs.TargetThresholds { - if !nodeutil.IsBasicResource(name) { - overutilizationCriteria = append(overutilizationCriteria, string(name), int64(lowNodeUtilizationArgsArgs.TargetThresholds[name])) - } - } - - podFilter, err := podutil.NewOptions(). + podFilter, err := podutil. + NewOptions(). WithFilter(handle.Evictor().Filter). BuildFilterFunc() if err != nil { return nil, fmt.Errorf("error initializing pod filter function: %v", err) } - resourceNames := getResourceNames(lowNodeUtilizationArgsArgs.Thresholds) - - var usageClient usageClient - // MetricsServer is deprecated, removed once dropped - if metricsUtilization != nil { - switch { - case metricsUtilization.MetricsServer, metricsUtilization.Source == api.KubernetesMetrics: - if handle.MetricsCollector() == nil { - return nil, fmt.Errorf("metrics client not initialized") - } - usageClient = newActualUsageClient(resourceNames, handle.GetPodsAssignedToNodeFunc(), handle.MetricsCollector()) - case metricsUtilization.Source == api.PrometheusMetrics: - if handle.PrometheusClient() == nil { - return nil, fmt.Errorf("prometheus client not initialized") - } - usageClient = newPrometheusUsageClient(handle.GetPodsAssignedToNodeFunc(), handle.PrometheusClient(), metricsUtilization.Prometheus.Query) - case metricsUtilization.Source != "": - return nil, fmt.Errorf("unrecognized metrics source") - default: - return nil, fmt.Errorf("metrics source is empty") + // this plugins supports different ways of collecting usage data. each + // different way provides its own "usageClient". here we make sure we + // have the correct one or an error is triggered. XXX MetricsServer is + // deprecated, removed once dropped. 
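// Editor's illustrative aside, not part of the patch: the usage-client
// selection described above can be read as a small decision table. The
// strings below merely stand in for the api.KubernetesMetrics and
// api.PrometheusMetrics constants and the real client constructors; the
// function is invented for illustration only.
func usageSourceFor(metricsConfigured bool, source string) string {
	switch {
	case !metricsConfigured:
		return "pod resource requests (default client)"
	case source == "KubernetesMetrics":
		return "metrics server (actual usage client)"
	case source == "PrometheusMetrics":
		return "prometheus query results"
	case source != "":
		return "error: unrecognized metrics source"
	default:
		return "error: metrics source is empty"
	}
}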
+ var usageClient usageClient = newRequestedUsageClient( + extendedResourceNames, handle.GetPodsAssignedToNodeFunc(), + ) + if metrics != nil { + usageClient, err = usageClientForMetrics(args, handle, extendedResourceNames) + if err != nil { + return nil, err } - } else { - usageClient = newRequestedUsageClient(resourceNames, handle.GetPodsAssignedToNodeFunc()) } return &LowNodeUtilization{ - handle: handle, - args: lowNodeUtilizationArgsArgs, - underutilizationCriteria: underutilizationCriteria, - overutilizationCriteria: overutilizationCriteria, - resourceNames: resourceNames, - podFilter: podFilter, - usageClient: usageClient, + handle: handle, + args: args, + underCriteria: underCriteria, + overCriteria: overCriteria, + resourceNames: resourceNames, + extendedResourceNames: extendedResourceNames, + podFilter: podFilter, + usageClient: usageClient, }, nil } -// Name retrieves the plugin name +// Name retrieves the plugin name. func (l *LowNodeUtilization) Name() string { return LowNodeUtilizationPluginName } -// Balance extension point implementation for the plugin +// Balance holds the main logic of the plugin. It evicts pods from over +// utilized nodes to under utilized nodes. The goal here is to evenly +// distribute pods across nodes. func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *frameworktypes.Status { if err := l.usageClient.sync(ctx, nodes); err != nil { return &frameworktypes.Status{ @@ -151,87 +150,130 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra } } + // starts by taking a snapshot ofthe nodes usage. we will use this + // snapshot to assess the nodes usage and classify them as + // underutilized or overutilized. nodesMap, nodesUsageMap, podListMap := getNodeUsageSnapshot(nodes, l.usageClient) - var nodeThresholdsMap map[string][]api.ResourceThresholds + capacities := referencedResourceListForNodesCapacity(nodes) + + // usage, by default, is exposed in absolute values. we need to normalize + // them (convert them to percentages) to be able to compare them with the + // user provided thresholds. thresholds are already provided in percentage + // in the <0; 100> interval. + var usage map[string]api.ResourceThresholds + var thresholds map[string][]api.ResourceThresholds if l.args.UseDeviationThresholds { - thresholds, average := getNodeThresholdsFromAverageNodeUsage(nodes, l.usageClient, l.args.Thresholds, l.args.TargetThresholds) - klog.InfoS("Average utilization through all nodes", "utilization", average) - // All nodes are expected to have the same thresholds - for nodeName := range thresholds { - klog.InfoS("Underutilization threshold based on average utilization", "threshold", thresholds[nodeName][0]) - klog.InfoS("Overutilization threshold based on average utilization", "threshold", thresholds[nodeName][1]) - break - } - nodeThresholdsMap = thresholds + // here the thresholds provided by the user represent + // deviations from the average so we need to treat them + // differently. when calculating the average we only + // need to consider the resources for which the user + // has provided thresholds. 
+ usage, thresholds = assessNodesUsagesAndRelativeThresholds( + filterResourceNames(nodesUsageMap, l.resourceNames), + capacities, + l.args.Thresholds, + l.args.TargetThresholds, + ) } else { - nodeThresholdsMap = getStaticNodeThresholds(nodes, l.args.Thresholds, l.args.TargetThresholds) + usage, thresholds = assessNodesUsagesAndStaticThresholds( + nodesUsageMap, + capacities, + l.args.Thresholds, + l.args.TargetThresholds, + ) } - nodesUsageAsNodeThresholdsMap := nodeUsageToResourceThresholds(nodesUsageMap, nodesMap) + // classify nodes in under and over utilized. we will later try to move + // pods from the overutilized nodes to the underutilized ones. nodeGroups := classifyNodeUsage( - nodesUsageAsNodeThresholdsMap, - nodeThresholdsMap, + usage, thresholds, []classifierFnc{ - // underutilization + // underutilization criteria processing. nodes that are + // underutilized but aren't schedulable are ignored. func(nodeName string, usage, threshold api.ResourceThresholds) bool { if nodeutil.IsNodeUnschedulable(nodesMap[nodeName]) { - klog.V(2).InfoS("Node is unschedulable, thus not considered as underutilized", "node", klog.KObj(nodesMap[nodeName])) + klog.V(2).InfoS( + "Node is unschedulable, thus not considered as underutilized", + "node", klog.KObj(nodesMap[nodeName]), + ) return false } return isNodeBelowThreshold(usage, threshold) }, - // overutilization + // overutilization criteria evaluation. func(nodeName string, usage, threshold api.ResourceThresholds) bool { return isNodeAboveThreshold(usage, threshold) }, }, ) - // convert groups node []NodeInfo + // the nodeutilization package was designed to work with NodeInfo + // structs. these structs holds information about how utilized a node + // is. we need to go through the result of the classification and turn + // it into NodeInfo structs. nodeInfos := make([][]NodeInfo, 2) - category := []string{"underutilized", "overutilized"} - listedNodes := map[string]struct{}{} + categories := []string{"underutilized", "overutilized"} + classifiedNodes := map[string]bool{} for i := range nodeGroups { for nodeName := range nodeGroups[i] { - klog.InfoS("Node is "+category[i], "node", klog.KObj(nodesMap[nodeName]), "usage", nodesUsageMap[nodeName], "usagePercentage", resourceUsagePercentages(nodesUsageMap[nodeName], nodesMap[nodeName], true)) - listedNodes[nodeName] = struct{}{} + classifiedNodes[nodeName] = true + + klog.InfoS( + "Node has been classified", + "category", categories[i], + "node", klog.KObj(nodesMap[nodeName]), + "usage", nodesUsageMap[nodeName], + "usagePercentage", normalizer.Round(usage[nodeName]), + ) + nodeInfos[i] = append(nodeInfos[i], NodeInfo{ NodeUsage: NodeUsage{ node: nodesMap[nodeName], - usage: nodesUsageMap[nodeName], // get back the original node usage + usage: nodesUsageMap[nodeName], allPods: podListMap[nodeName], }, - thresholds: NodeThresholds{ - lowResourceThreshold: resourceThresholdsToNodeUsage(nodeThresholdsMap[nodeName][0], nodesMap[nodeName]), - highResourceThreshold: resourceThresholdsToNodeUsage(nodeThresholdsMap[nodeName][1], nodesMap[nodeName]), - }, + available: capNodeCapacitiesToThreshold( + nodesMap[nodeName], + thresholds[nodeName][1], + l.extendedResourceNames, + ), }) } } + + // log nodes that are appropriately utilized. 
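// Editor's illustrative aside, not part of the patch: the two classifiers
// above encode an asymmetric rule. A node is underutilized only if every
// thresholded resource stays at or below its low threshold, while it is
// overutilized as soon as a single resource exceeds its high threshold. A
// compact stand-alone mirror of isNodeBelowThreshold/isNodeAboveThreshold:
func belowAll(usage, threshold map[string]float64) bool {
	for name := range threshold {
		if usage[name] > threshold[name] {
			return false
		}
	}
	return true
}

func aboveAny(usage, threshold map[string]float64) bool {
	for name := range threshold {
		if usage[name] > threshold[name] {
			return true
		}
	}
	return false
}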
for nodeName := range nodesMap { - if _, ok := listedNodes[nodeName]; !ok { - klog.InfoS("Node is appropriately utilized", "node", klog.KObj(nodesMap[nodeName]), "usage", nodesUsageMap[nodeName], "usagePercentage", resourceUsagePercentages(nodesUsageMap[nodeName], nodesMap[nodeName], true)) + if !classifiedNodes[nodeName] { + klog.InfoS( + "Node is appropriately utilized", + "node", klog.KObj(nodesMap[nodeName]), + "usage", nodesUsageMap[nodeName], + "usagePercentage", normalizer.Round(usage[nodeName]), + ) } } - lowNodes := nodeInfos[0] - sourceNodes := nodeInfos[1] + lowNodes, highNodes := nodeInfos[0], nodeInfos[1] - // log message for nodes with low utilization - klog.V(1).InfoS("Criteria for a node under utilization", l.underutilizationCriteria...) + // log messages for nodes with low and high utilization + klog.V(1).InfoS("Criteria for a node under utilization", l.underCriteria...) klog.V(1).InfoS("Number of underutilized nodes", "totalNumber", len(lowNodes)) - - // log message for over utilized nodes - klog.V(1).InfoS("Criteria for a node above target utilization", l.overutilizationCriteria...) - klog.V(1).InfoS("Number of overutilized nodes", "totalNumber", len(sourceNodes)) + klog.V(1).InfoS("Criteria for a node above target utilization", l.overCriteria...) + klog.V(1).InfoS("Number of overutilized nodes", "totalNumber", len(highNodes)) if len(lowNodes) == 0 { - klog.V(1).InfoS("No node is underutilized, nothing to do here, you might tune your thresholds further") + klog.V(1).InfoS( + "No node is underutilized, nothing to do here, you might tune your thresholds further", + ) return nil } if len(lowNodes) <= l.args.NumberOfNodes { - klog.V(1).InfoS("Number of nodes underutilized is less or equal than NumberOfNodes, nothing to do here", "underutilizedNodes", len(lowNodes), "numberOfNodes", l.args.NumberOfNodes) + klog.V(1).InfoS( + "Number of nodes underutilized is less or equal than NumberOfNodes, nothing to do here", + "underutilizedNodes", len(lowNodes), + "numberOfNodes", l.args.NumberOfNodes, + ) return nil } @@ -240,14 +282,15 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra return nil } - if len(sourceNodes) == 0 { + if len(highNodes) == 0 { klog.V(1).InfoS("All nodes are under target utilization, nothing to do here") return nil } - // stop if node utilization drops below target threshold or any of required capacity (cpu, memory, pods) is moved + // this is a stop condition for the eviction process. we stop as soon + // as the node usage drops below the threshold. 
continueEvictionCond := func(nodeInfo NodeInfo, totalAvailableUsage api.ReferencedResourceList) bool { - if !isNodeAboveTargetUtilization(nodeInfo.NodeUsage, nodeInfo.thresholds.highResourceThreshold) { + if !isNodeAboveTargetUtilization(nodeInfo.NodeUsage, nodeInfo.available) { return false } for name := range totalAvailableUsage { @@ -259,8 +302,8 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra return true } - // Sort the nodes by the usage in descending order - sortNodesByUsage(sourceNodes, false) + // sort the nodes by the usage in descending order + sortNodesByUsage(highNodes, false) var nodeLimit *uint if l.args.EvictionLimits != nil { @@ -270,12 +313,12 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra evictPodsFromSourceNodes( ctx, l.args.EvictableNamespaces, - sourceNodes, + highNodes, lowNodes, l.handle.Evictor(), evictions.EvictOptions{StrategyName: LowNodeUtilizationPluginName}, l.podFilter, - l.resourceNames, + l.extendedResourceNames, continueEvictionCond, l.usageClient, nodeLimit, @@ -284,33 +327,65 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra return nil } -func setDefaultForLNUThresholds(thresholds, targetThresholds api.ResourceThresholds, useDeviationThresholds bool) { - // check if Pods/CPU/Mem are set, if not, set them to 100 - if _, ok := thresholds[v1.ResourcePods]; !ok { - if useDeviationThresholds { - thresholds[v1.ResourcePods] = MinResourcePercentage - targetThresholds[v1.ResourcePods] = MinResourcePercentage - } else { - thresholds[v1.ResourcePods] = MaxResourcePercentage - targetThresholds[v1.ResourcePods] = MaxResourcePercentage - } +// validatePrometheusMetricsUtilization validates the Prometheus metrics +// utilization. XXX this should be done way earlier than this. +func validatePrometheusMetricsUtilization(args *LowNodeUtilizationArgs) error { + if args.MetricsUtilization.Prometheus == nil { + return fmt.Errorf("prometheus property is missing") } - if _, ok := thresholds[v1.ResourceCPU]; !ok { - if useDeviationThresholds { - thresholds[v1.ResourceCPU] = MinResourcePercentage - targetThresholds[v1.ResourceCPU] = MinResourcePercentage - } else { - thresholds[v1.ResourceCPU] = MaxResourcePercentage - targetThresholds[v1.ResourceCPU] = MaxResourcePercentage - } + + if args.MetricsUtilization.Prometheus.Query == "" { + return fmt.Errorf("prometheus query is missing") } - if _, ok := thresholds[v1.ResourceMemory]; !ok { - if useDeviationThresholds { - thresholds[v1.ResourceMemory] = MinResourcePercentage - targetThresholds[v1.ResourceMemory] = MinResourcePercentage - } else { - thresholds[v1.ResourceMemory] = MaxResourcePercentage - targetThresholds[v1.ResourceMemory] = MaxResourcePercentage + + uResourceNames := getResourceNames(args.Thresholds) + oResourceNames := getResourceNames(args.TargetThresholds) + if len(uResourceNames) != 1 || uResourceNames[0] != MetricResource { + return fmt.Errorf( + "thresholds are expected to specify a single instance of %q resource, got %v instead", + MetricResource, uResourceNames, + ) + } + + if len(oResourceNames) != 1 || oResourceNames[0] != MetricResource { + return fmt.Errorf( + "targetThresholds are expected to specify a single instance of %q resource, got %v instead", + MetricResource, oResourceNames, + ) + } + + return nil +} + +// usageClientForMetrics returns the correct usage client based on the +// metrics source. XXX MetricsServer is deprecated, removed once dropped. 
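// Editor's illustrative aside, not part of the patch: when the Prometheus
// source is used, validatePrometheusMetricsUtilization (above) requires the
// thresholds and targetThresholds to reference exactly one resource, the
// synthetic MetricResource, because the query yields a single normalized
// value per node. A hypothetical condensed form of that check:
func isSingleMetricResource(names []string) bool {
	// the string stands in for the MetricResource constant.
	return len(names) == 1 && names[0] == "MetricResource"
}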
+func usageClientForMetrics( + args *LowNodeUtilizationArgs, handle frameworktypes.Handle, resources []v1.ResourceName, +) (usageClient, error) { + metrics := args.MetricsUtilization + switch { + case metrics.MetricsServer, metrics.Source == api.KubernetesMetrics: + if handle.MetricsCollector() == nil { + return nil, fmt.Errorf("metrics client not initialized") } + return newActualUsageClient( + resources, + handle.GetPodsAssignedToNodeFunc(), + handle.MetricsCollector(), + ), nil + + case metrics.Source == api.PrometheusMetrics: + if handle.PrometheusClient() == nil { + return nil, fmt.Errorf("prometheus client not initialized") + } + return newPrometheusUsageClient( + handle.GetPodsAssignedToNodeFunc(), + handle.PrometheusClient(), + metrics.Prometheus.Query, + ), nil + case metrics.Source != "": + return nil, fmt.Errorf("unrecognized metrics source") + default: + return nil, fmt.Errorf("metrics source is empty") } } diff --git a/pkg/framework/plugins/nodeutilization/lownodeutilization_test.go b/pkg/framework/plugins/nodeutilization/lownodeutilization_test.go index 16a609a38..dcf6d60f4 100644 --- a/pkg/framework/plugins/nodeutilization/lownodeutilization_test.go +++ b/pkg/framework/plugins/nodeutilization/lownodeutilization_test.go @@ -1022,6 +1022,79 @@ func TestLowNodeUtilization(t *testing.T) { expectedPodsWithMetricsEvicted: 2, evictedPods: []string{}, }, + { + name: "deviation thresholds and overevicting memory", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 5, + v1.ResourcePods: 5, + }, + targetThresholds: api.ResourceThresholds{ + v1.ResourceCPU: 5, + v1.ResourcePods: 5, + }, + useDeviationThresholds: true, + nodes: []*v1.Node{ + test.BuildTestNode(n1NodeName, 4000, 3000, 10, nil), + test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil), + test.BuildTestNode(n3NodeName, 4000, 3000, 10, test.SetNodeUnschedulable), + }, + // totalcpuusage = 3600m, avgcpuusage = 3600/12000 = 0.3 => 30% + // totalpodsusage = 9, avgpodsusage = 9/30 = 0.3 => 30% + // n1 and n2 are fully memory utilized + pods: []*v1.Pod{ + test.BuildTestPod("p1", 400, 375, n1NodeName, test.SetRSOwnerRef), + test.BuildTestPod("p2", 400, 375, n1NodeName, test.SetRSOwnerRef), + test.BuildTestPod("p3", 400, 375, n1NodeName, test.SetRSOwnerRef), + test.BuildTestPod("p4", 400, 375, n1NodeName, test.SetRSOwnerRef), + test.BuildTestPod("p5", 400, 375, n1NodeName, test.SetRSOwnerRef), + // These won't be evicted. + test.BuildTestPod("p6", 400, 375, n1NodeName, test.SetDSOwnerRef), + test.BuildTestPod("p7", 400, 375, n1NodeName, func(pod *v1.Pod) { + // A pod with local storage. + test.SetNormalOwnerRef(pod) + pod.Spec.Volumes = []v1.Volume{ + { + Name: "sample", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{Path: "somePath"}, + EmptyDir: &v1.EmptyDirVolumeSource{ + SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI), + }, + }, + }, + } + // A Mirror Pod. + pod.Annotations = test.GetMirrorPodAnnotation() + }), + test.BuildTestPod("p8", 400, 375, n1NodeName, func(pod *v1.Pod) { + // A Critical Pod. 
+ test.SetNormalOwnerRef(pod) + pod.Namespace = "kube-system" + priority := utils.SystemCriticalPriority + pod.Spec.Priority = &priority + }), + test.BuildTestPod("p9", 400, 3000, n2NodeName, test.SetRSOwnerRef), + }, + nodemetricses: []*v1beta1.NodeMetrics{ + test.BuildNodeMetrics(n1NodeName, 4000, 3000), + test.BuildNodeMetrics(n2NodeName, 4000, 3000), + test.BuildNodeMetrics(n3NodeName, 4000, 3000), + }, + podmetricses: []*v1beta1.PodMetrics{ + test.BuildPodMetrics("p1", 400, 375), + test.BuildPodMetrics("p2", 400, 375), + test.BuildPodMetrics("p3", 400, 375), + test.BuildPodMetrics("p4", 400, 375), + test.BuildPodMetrics("p5", 400, 375), + test.BuildPodMetrics("p6", 400, 375), + test.BuildPodMetrics("p7", 400, 375), + test.BuildPodMetrics("p8", 400, 375), + test.BuildPodMetrics("p9", 400, 3000), + }, + expectedPodsEvicted: 0, + expectedPodsWithMetricsEvicted: 0, + evictedPods: []string{}, + }, { name: "without priorities different evictions for requested and actual resources", thresholds: api.ResourceThresholds{ @@ -1612,6 +1685,43 @@ func TestLowNodeUtilizationWithPrometheusMetrics(t *testing.T) { }, expectedPodsEvicted: 2, }, + { + name: "with instance:node_cpu:rate:sum query and deviation thresholds", + args: &LowNodeUtilizationArgs{ + UseDeviationThresholds: true, + Thresholds: api.ResourceThresholds{MetricResource: 10}, + TargetThresholds: api.ResourceThresholds{MetricResource: 10}, + MetricsUtilization: &MetricsUtilization{ + Source: api.PrometheusMetrics, + Prometheus: &Prometheus{ + Query: "instance:node_cpu:rate:sum", + }, + }, + }, + samples: model.Vector{ + sample("instance:node_cpu:rate:sum", n1NodeName, 1), + sample("instance:node_cpu:rate:sum", n2NodeName, 0.5), + sample("instance:node_cpu:rate:sum", n3NodeName, 0), + }, + nodes: []*v1.Node{ + test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil), + test.BuildTestNode(n2NodeName, 4000, 3000, 10, nil), + test.BuildTestNode(n3NodeName, 4000, 3000, 10, nil), + }, + pods: []*v1.Pod{ + test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef), + test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef), + test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef), + test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetRSOwnerRef), + test.BuildTestPod("p5", 400, 0, n1NodeName, test.SetRSOwnerRef), + // These won't be evicted. 
+ test.BuildTestPod("p6", 400, 0, n1NodeName, test.SetDSOwnerRef), + test.BuildTestPod("p7", 400, 0, n1NodeName, withLocalStorage), + test.BuildTestPod("p8", 400, 0, n1NodeName, withCriticalPod), + test.BuildTestPod("p9", 400, 0, n2NodeName, test.SetRSOwnerRef), + }, + expectedPodsEvicted: 1, + }, } for _, tc := range testCases { diff --git a/pkg/framework/plugins/nodeutilization/nodeutilization.go b/pkg/framework/plugins/nodeutilization/nodeutilization.go index c0d589688..9432cc500 100644 --- a/pkg/framework/plugins/nodeutilization/nodeutilization.go +++ b/pkg/framework/plugins/nodeutilization/nodeutilization.go @@ -18,7 +18,8 @@ package nodeutilization import ( "context" - "math" + "fmt" + "maps" "slices" "sort" @@ -28,9 +29,11 @@ import ( "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" + "k8s.io/utils/ptr" "sigs.k8s.io/descheduler/pkg/descheduler/evictions" nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" + "sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization/normalizer" frameworktypes "sigs.k8s.io/descheduler/pkg/framework/types" "sigs.k8s.io/descheduler/pkg/utils" ) @@ -57,114 +60,41 @@ import ( // - thresholds: map[string][]api.ReferencedResourceList // - pod list: map[string][]*v1.Pod // Once the nodes are classified produce the original []NodeInfo so the code is not that much changed (postponing further refactoring once it is needed) -const MetricResource = v1.ResourceName("MetricResource") - -// NodeUsage stores a node's info, pods on it, thresholds and its resource usage -type NodeUsage struct { - node *v1.Node - usage api.ReferencedResourceList - allPods []*v1.Pod -} - -type NodeThresholds struct { - lowResourceThreshold api.ReferencedResourceList - highResourceThreshold api.ReferencedResourceList -} - -type NodeInfo struct { - NodeUsage - thresholds NodeThresholds -} - -type continueEvictionCond func(nodeInfo NodeInfo, totalAvailableUsage api.ReferencedResourceList) bool - const ( + // MetricResource is a special resource name we use to keep track of a + // metric obtained from a third party entity. + MetricResource = v1.ResourceName("MetricResource") // MinResourcePercentage is the minimum value of a resource's percentage MinResourcePercentage = 0 // MaxResourcePercentage is the maximum value of a resource's percentage MaxResourcePercentage = 100 ) -func normalizePercentage(percent api.Percentage) api.Percentage { - if percent > MaxResourcePercentage { - return MaxResourcePercentage - } - if percent < MinResourcePercentage { - return MinResourcePercentage - } - return percent +// NodeUsage stores a node's info, pods on it, thresholds and its resource +// usage. +type NodeUsage struct { + node *v1.Node + usage api.ReferencedResourceList + allPods []*v1.Pod } -func nodeCapacity(node *v1.Node, nodeUsage api.ReferencedResourceList) v1.ResourceList { - capacity := node.Status.Capacity - if len(node.Status.Allocatable) > 0 { - capacity = node.Status.Allocatable - } - // the usage captures the metrics resource - if _, ok := nodeUsage[MetricResource]; ok { - // Make ResourceMetrics 100% => 100 points - capacity[MetricResource] = *resource.NewQuantity(int64(100), resource.DecimalSI) - } - return capacity +// NodeInfo is an entity we use to gather information about a given node. here +// we have its resource usage as well as the amount of available resources. +// we use this struct to carry information around and to make it easier to +// process. 
+type NodeInfo struct { + NodeUsage + available api.ReferencedResourceList } -func getNodeThresholdsFromAverageNodeUsage( - nodes []*v1.Node, - usageClient usageClient, - lowSpan, highSpan api.ResourceThresholds, -) (map[string][]api.ResourceThresholds, api.ResourceThresholds) { - total := api.ResourceThresholds{} - average := api.ResourceThresholds{} - numberOfNodes := len(nodes) - for _, node := range nodes { - usage := usageClient.nodeUtilization(node.Name) - nodeCapacity := nodeCapacity(node, usage) - for resource, value := range usage { - nodeCapacityValue := nodeCapacity[resource] - if resource == v1.ResourceCPU { - total[resource] += api.Percentage(value.MilliValue()) / api.Percentage(nodeCapacityValue.MilliValue()) * 100.0 - } else { - total[resource] += api.Percentage(value.Value()) / api.Percentage(nodeCapacityValue.Value()) * 100.0 - } - } - } - lowThreshold, highThreshold := api.ResourceThresholds{}, api.ResourceThresholds{} - for resource, value := range total { - average[resource] = value / api.Percentage(numberOfNodes) - // If either of the spans are 0, ignore the resource. I.e. 0%:5% is invalid. - // Any zero span signifies a resource is either not set or is to be ignored. - if lowSpan[resource] == MinResourcePercentage || highSpan[resource] == MinResourcePercentage { - lowThreshold[resource] = 1 - highThreshold[resource] = 1 - } else { - lowThreshold[resource] = normalizePercentage(average[resource] - lowSpan[resource]) - highThreshold[resource] = normalizePercentage(average[resource] + highSpan[resource]) - } - } +// continueEvictionCont is a function that determines if we should keep +// evicting pods or not. +type continueEvictionCond func(NodeInfo, api.ReferencedResourceList) bool - nodeThresholds := make(map[string][]api.ResourceThresholds) - for _, node := range nodes { - nodeThresholds[node.Name] = []api.ResourceThresholds{ - lowThreshold, - highThreshold, - } - } - return nodeThresholds, average -} - -func getStaticNodeThresholds( - nodes []*v1.Node, - thresholdsList ...api.ResourceThresholds, -) map[string][]api.ResourceThresholds { - nodeThresholds := make(map[string][]api.ResourceThresholds) - for _, node := range nodes { - nodeThresholds[node.Name] = append([]api.ResourceThresholds{}, slices.Clone(thresholdsList)...) - } - return nodeThresholds -} - -// getNodeUsageSnapshot separates the snapshot into easily accesible -// data chunks so the node usage can be processed separately. +// getNodeUsageSnapshot separates the snapshot into easily accesible data +// chunks so the node usage can be processed separately. returns a map of +// nodes, a map of their usage and a map of their pods. maps are indexed +// by node name. func getNodeUsageSnapshot( nodes []*v1.Node, usageClient usageClient, @@ -173,10 +103,11 @@ func getNodeUsageSnapshot( map[string]api.ReferencedResourceList, map[string][]*v1.Pod, ) { - nodesMap := make(map[string]*v1.Node) - // node usage needs to be kept in the original resource quantity since converting to percentages and back is losing precision + // XXX node usage needs to be kept in the original resource quantity + // since converting to percentages and back is losing precision. 
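// Editor's illustrative aside, not part of the patch: a small worked example
// of the precision note above. Converting an absolute quantity to a rounded
// percentage and back does not return the original value, which is why the
// snapshot keeps the raw quantities and only normalizes copies of them.
// The function and its numbers are purely illustrative.
func roundTripLoss() (original, recovered int64) {
	capacity := int64(3000)                 // 3000m CPU allocatable
	used := int64(1234)                     // 1234m CPU requested
	p := (used*100 + capacity/2) / capacity // rounded percentage: 41
	return used, capacity * p / 100         // 1234m vs 1230m
}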
nodesUsageMap := make(map[string]api.ReferencedResourceList) podListMap := make(map[string][]*v1.Pod) + nodesMap := make(map[string]*v1.Node) for _, node := range nodes { nodesMap[node.Name] = node @@ -187,72 +118,13 @@ func getNodeUsageSnapshot( return nodesMap, nodesUsageMap, podListMap } -func resourceThreshold(nodeCapacity v1.ResourceList, resourceName v1.ResourceName, threshold api.Percentage) *resource.Quantity { - defaultFormat := resource.DecimalSI - if resourceName == v1.ResourceMemory { - defaultFormat = resource.BinarySI - } - - resourceCapacityFraction := func(resourceNodeCapacity int64) int64 { - // A threshold is in percentages but in <0;100> interval. - // Performing `threshold * 0.01` will convert <0;100> interval into <0;1>. - // Multiplying it with capacity will give fraction of the capacity corresponding to the given resource threshold in Quantity units. - return int64(float64(threshold) * 0.01 * float64(resourceNodeCapacity)) - } - - resourceCapacityQuantity := nodeCapacity.Name(resourceName, defaultFormat) - - if resourceName == v1.ResourceCPU { - return resource.NewMilliQuantity(resourceCapacityFraction(resourceCapacityQuantity.MilliValue()), defaultFormat) - } - return resource.NewQuantity(resourceCapacityFraction(resourceCapacityQuantity.Value()), defaultFormat) -} - -func resourceThresholdsToNodeUsage(resourceThresholds api.ResourceThresholds, node *v1.Node) api.ReferencedResourceList { - nodeUsage := make(api.ReferencedResourceList) - - nodeCapacity := node.Status.Capacity - if len(node.Status.Allocatable) > 0 { - nodeCapacity = node.Status.Allocatable - } - for resourceName, threshold := range resourceThresholds { - nodeUsage[resourceName] = resourceThreshold(nodeCapacity, resourceName, threshold) - } - - return nodeUsage -} - -func roundTo2Decimals(percentage float64) float64 { - return math.Round(percentage*100) / 100 -} - -func resourceUsagePercentages(nodeUsage api.ReferencedResourceList, node *v1.Node, round bool) api.ResourceThresholds { - nodeCapacity := nodeCapacity(node, nodeUsage) - - resourceUsagePercentage := api.ResourceThresholds{} - for resourceName, resourceUsage := range nodeUsage { - cap := nodeCapacity[resourceName] - if !cap.IsZero() { - value := 100 * float64(resourceUsage.MilliValue()) / float64(cap.MilliValue()) - if round { - value = roundTo2Decimals(float64(value)) - } - resourceUsagePercentage[resourceName] = api.Percentage(value) - } - } - return resourceUsagePercentage -} - -func nodeUsageToResourceThresholds(nodeUsage map[string]api.ReferencedResourceList, nodes map[string]*v1.Node) map[string]api.ResourceThresholds { - resourceThresholds := make(map[string]api.ResourceThresholds) - for nodeName, node := range nodes { - resourceThresholds[nodeName] = resourceUsagePercentages(nodeUsage[nodeName], node, false) - } - return resourceThresholds -} - -type classifierFnc func(nodeName string, value, threshold api.ResourceThresholds) bool +// classifierFnc is a function that classifies a node based on its usage and +// thresholds. returns true if it belongs to the group the classifier +// represents. +type classifierFnc func(string, api.ResourceThresholds, api.ResourceThresholds) bool +// classifyNodeUsage classify nodes into different groups based on classifiers. +// returns one group for each classifier. 
func classifyNodeUsage( nodeUsageAsNodeThresholds map[string]api.ResourceThresholds, nodeThresholdsMap map[string][]api.ResourceThresholds, @@ -275,9 +147,10 @@ func classifyNodeUsage( return nodeGroups } -func usageToKeysAndValues(usage api.ReferencedResourceList) []interface{} { - // log message in one line - keysAndValues := []interface{}{} +// usageToKeysAndValues converts a ReferencedResourceList into a list of +// keys and values. this is useful for logging. +func usageToKeysAndValues(usage api.ReferencedResourceList) []any { + keysAndValues := []any{} if quantity, exists := usage[v1.ResourceCPU]; exists { keysAndValues = append(keysAndValues, "CPU", quantity.MilliValue()) } @@ -289,15 +162,14 @@ func usageToKeysAndValues(usage api.ReferencedResourceList) []interface{} { } for name := range usage { if !nodeutil.IsBasicResource(name) { - keysAndValues = append(keysAndValues, string(name), usage[name].Value()) + keysAndValues = append(keysAndValues, name, usage[name].Value()) } } return keysAndValues } -// evictPodsFromSourceNodes evicts pods based on priority, if all the pods on the node have priority, if not -// evicts them based on QoS as fallback option. -// TODO: @ravig Break this function into smaller functions. +// evictPodsFromSourceNodes evicts pods based on priority, if all the pods on +// the node have priority, if not evicts them based on QoS as fallback option. func evictPodsFromSourceNodes( ctx context.Context, evictableNamespaces *api.Namespaces, @@ -310,48 +182,65 @@ func evictPodsFromSourceNodes( usageClient usageClient, maxNoOfPodsToEvictPerNode *uint, ) { - // upper bound on total number of pods/cpu/memory and optional extended resources to be moved - totalAvailableUsage := api.ReferencedResourceList{} - for _, resourceName := range resourceNames { - totalAvailableUsage[resourceName] = &resource.Quantity{} + available, err := assessAvailableResourceInNodes(destinationNodes, resourceNames) + if err != nil { + klog.ErrorS(err, "unable to assess available resources in nodes") + return } - taintsOfDestinationNodes := make(map[string][]v1.Taint, len(destinationNodes)) + klog.V(1).InfoS("Total capacity to be moved", usageToKeysAndValues(available)...) + + destinationTaints := make(map[string][]v1.Taint, len(destinationNodes)) for _, node := range destinationNodes { - taintsOfDestinationNodes[node.node.Name] = node.node.Spec.Taints - - for _, name := range resourceNames { - if _, exists := node.usage[name]; !exists { - klog.Errorf("unable to find %q resource in node's %q usage, terminating eviction", name, node.node.Name) - return - } - if _, ok := totalAvailableUsage[name]; !ok { - totalAvailableUsage[name] = resource.NewQuantity(0, resource.DecimalSI) - } - totalAvailableUsage[name].Add(*node.thresholds.highResourceThreshold[name]) - totalAvailableUsage[name].Sub(*node.usage[name]) - } + destinationTaints[node.node.Name] = node.node.Spec.Taints } - // log message in one line - klog.V(1).InfoS("Total capacity to be moved", usageToKeysAndValues(totalAvailableUsage)...) 
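// Editor's illustrative aside, not part of the patch: the "Total capacity to
// be moved" value assessed above is, conceptually, the headroom summed over
// all destination nodes. assessAvailableResourceInNodes is not shown in this
// hunk, so the sketch below only mirrors the pre-existing arithmetic
// (threshold-capped capacity minus current usage) with invented types.
type nodeHeadroom struct {
	available map[string]int64 // threshold-capped capacity
	usage     map[string]int64 // current requests or measured usage
}

func totalHeadroom(destinations []nodeHeadroom, resources []string) map[string]int64 {
	total := map[string]int64{}
	for _, node := range destinations {
		for _, name := range resources {
			total[name] += node.available[name] - node.usage[name]
		}
	}
	return total
}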
- for _, node := range sourceNodes { - klog.V(3).InfoS("Evicting pods from node", "node", klog.KObj(node.node), "usage", node.usage) + klog.V(3).InfoS( + "Evicting pods from node", + "node", klog.KObj(node.node), + "usage", node.usage, + ) nonRemovablePods, removablePods := classifyPods(node.allPods, podFilter) - klog.V(2).InfoS("Pods on node", "node", klog.KObj(node.node), "allPods", len(node.allPods), "nonRemovablePods", len(nonRemovablePods), "removablePods", len(removablePods)) + klog.V(2).InfoS( + "Pods on node", + "node", klog.KObj(node.node), + "allPods", len(node.allPods), + "nonRemovablePods", len(nonRemovablePods), + "removablePods", len(removablePods), + ) if len(removablePods) == 0 { - klog.V(1).InfoS("No removable pods on node, try next node", "node", klog.KObj(node.node)) + klog.V(1).InfoS( + "No removable pods on node, try next node", + "node", klog.KObj(node.node), + ) continue } - klog.V(1).InfoS("Evicting pods based on priority, if they have same priority, they'll be evicted based on QoS tiers") - // sort the evictable Pods based on priority. This also sorts them based on QoS. If there are multiple pods with same priority, they are sorted based on QoS tiers. + klog.V(1).InfoS( + "Evicting pods based on priority, if they have same priority, they'll be evicted based on QoS tiers", + ) + + // sort the evictable Pods based on priority. This also sorts + // them based on QoS. If there are multiple pods with same + // priority, they are sorted based on QoS tiers. podutil.SortPodsBasedOnPriorityLowToHigh(removablePods) - err := evictPods(ctx, evictableNamespaces, removablePods, node, totalAvailableUsage, taintsOfDestinationNodes, podEvictor, evictOptions, continueEviction, usageClient, maxNoOfPodsToEvictPerNode) - if err != nil { + + if err := evictPods( + ctx, + evictableNamespaces, + removablePods, + node, + available, + destinationTaints, + podEvictor, + evictOptions, + continueEviction, + usageClient, + maxNoOfPodsToEvictPerNode, + ); err != nil { switch err.(type) { case *evictions.EvictionTotalLimitError: return @@ -361,103 +250,136 @@ func evictPodsFromSourceNodes( } } +// evictPods keeps evicting pods until the continueEviction function returns +// false or we can't or shouldn't evict any more pods. available node resources +// are updated after each eviction. func evictPods( ctx context.Context, evictableNamespaces *api.Namespaces, inputPods []*v1.Pod, nodeInfo NodeInfo, totalAvailableUsage api.ReferencedResourceList, - taintsOfLowNodes map[string][]v1.Taint, + destinationTaints map[string][]v1.Taint, podEvictor frameworktypes.Evictor, evictOptions evictions.EvictOptions, continueEviction continueEvictionCond, usageClient usageClient, maxNoOfPodsToEvictPerNode *uint, ) error { + // preemptive check to see if we should continue evicting pods. + if !continueEviction(nodeInfo, totalAvailableUsage) { + return nil + } + + // some namespaces can be excluded from the eviction process. var excludedNamespaces sets.Set[string] if evictableNamespaces != nil { excludedNamespaces = sets.New(evictableNamespaces.Exclude...) 
} var evictionCounter uint = 0 - if continueEviction(nodeInfo, totalAvailableUsage) { - for _, pod := range inputPods { - if maxNoOfPodsToEvictPerNode != nil && evictionCounter >= *maxNoOfPodsToEvictPerNode { - klog.V(3).InfoS("Max number of evictions per node per plugin reached", "limit", *maxNoOfPodsToEvictPerNode) - break - } - if !utils.PodToleratesTaints(pod, taintsOfLowNodes) { - klog.V(3).InfoS("Skipping eviction for pod, doesn't tolerate node taint", "pod", klog.KObj(pod)) + for _, pod := range inputPods { + if maxNoOfPodsToEvictPerNode != nil && evictionCounter >= *maxNoOfPodsToEvictPerNode { + klog.V(3).InfoS( + "Max number of evictions per node per plugin reached", + "limit", *maxNoOfPodsToEvictPerNode, + ) + break + } + + if !utils.PodToleratesTaints(pod, destinationTaints) { + klog.V(3).InfoS( + "Skipping eviction for pod, doesn't tolerate node taint", + "pod", klog.KObj(pod), + ) + continue + } + + // verify if we can evict the pod based on the pod evictor + // filter and on the excluded namespaces. + preEvictionFilterWithOptions, err := podutil. + NewOptions(). + WithFilter(podEvictor.PreEvictionFilter). + WithoutNamespaces(excludedNamespaces). + BuildFilterFunc() + if err != nil { + klog.ErrorS(err, "could not build preEvictionFilter with namespace exclusion") + continue + } + + if !preEvictionFilterWithOptions(pod) { + continue + } + + // in case podUsage does not support resource counting (e.g. + // provided metric does not quantify pod resource utilization). + unconstrainedResourceEviction := false + podUsage, err := usageClient.podUsage(pod) + if err != nil { + if _, ok := err.(*notSupportedError); !ok { + klog.Errorf( + "unable to get pod usage for %v/%v: %v", + pod.Namespace, pod.Name, err, + ) continue } + unconstrainedResourceEviction = true + } - preEvictionFilterWithOptions, err := podutil.NewOptions(). - WithFilter(podEvictor.PreEvictionFilter). - WithoutNamespaces(excludedNamespaces). - BuildFilterFunc() - if err != nil { - klog.ErrorS(err, "could not build preEvictionFilter with namespace exclusion") - continue - } - - if !preEvictionFilterWithOptions(pod) { - continue - } - - // In case podUsage does not support resource counting (e.g. provided metric - // does not quantify pod resource utilization). - unconstrainedResourceEviction := false - podUsage, err := usageClient.podUsage(pod) - if err != nil { - if _, ok := err.(*notSupportedError); !ok { - klog.Errorf("unable to get pod usage for %v/%v: %v", pod.Namespace, pod.Name, err) - continue - } - unconstrainedResourceEviction = true - } - err = podEvictor.Evict(ctx, pod, evictOptions) - if err == nil { - if maxNoOfPodsToEvictPerNode == nil && unconstrainedResourceEviction { - klog.V(3).InfoS("Currently, only a single pod eviction is allowed") - break - } - evictionCounter++ - klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod)) - if unconstrainedResourceEviction { - continue - } - for name := range totalAvailableUsage { - if name == v1.ResourcePods { - nodeInfo.usage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI)) - totalAvailableUsage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI)) - } else { - nodeInfo.usage[name].Sub(*podUsage[name]) - totalAvailableUsage[name].Sub(*podUsage[name]) - } - } - - keysAndValues := []interface{}{ - "node", nodeInfo.node.Name, - } - keysAndValues = append(keysAndValues, usageToKeysAndValues(nodeInfo.usage)...) - klog.V(3).InfoS("Updated node usage", keysAndValues...) 
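// Editor's illustrative aside, not part of the patch: after every successful
// eviction the bookkeeping right above (and the subtractPodUsageFromNodeAvailability
// helper further below) decrements the pod count by one and every other
// tracked resource by the evicted pod's usage, so the stop condition sees
// up-to-date numbers. A simplified stand-alone version over plain integers:
func subtractPodUsage(available, nodeUsage, podUsage map[string]int64) {
	for name := range available {
		delta := podUsage[name]
		if name == "pods" {
			delta = 1 // an eviction always frees exactly one pod slot
		}
		nodeUsage[name] -= delta
		available[name] -= delta
	}
}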
- // check if pods can be still evicted - if !continueEviction(nodeInfo, totalAvailableUsage) { - break - } - continue - } + if err := podEvictor.Evict(ctx, pod, evictOptions); err != nil { switch err.(type) { case *evictions.EvictionNodeLimitError, *evictions.EvictionTotalLimitError: return err default: klog.Errorf("eviction failed: %v", err) + continue } } + + if maxNoOfPodsToEvictPerNode == nil && unconstrainedResourceEviction { + klog.V(3).InfoS("Currently, only a single pod eviction is allowed") + break + } + + evictionCounter++ + klog.V(3).InfoS("Evicted pods", "pod", klog.KObj(pod)) + if unconstrainedResourceEviction { + continue + } + + subtractPodUsageFromNodeAvailability(totalAvailableUsage, &nodeInfo, podUsage) + + keysAndValues := []any{"node", nodeInfo.node.Name} + keysAndValues = append(keysAndValues, usageToKeysAndValues(nodeInfo.usage)...) + klog.V(3).InfoS("Updated node usage", keysAndValues...) + + // make sure we should continue evicting pods. + if !continueEviction(nodeInfo, totalAvailableUsage) { + break + } } return nil } +// subtractPodUsageFromNodeAvailability subtracts the pod usage from the node +// available resources. this is done to keep track of the remaining resources +// that can be used to move pods around. +func subtractPodUsageFromNodeAvailability( + available api.ReferencedResourceList, + nodeInfo *NodeInfo, + podUsage api.ReferencedResourceList, +) { + for name := range available { + if name == v1.ResourcePods { + nodeInfo.usage[name].Sub(*resource.NewQuantity(1, resource.DecimalSI)) + available[name].Sub(*resource.NewQuantity(1, resource.DecimalSI)) + continue + } + nodeInfo.usage[name].Sub(*podUsage[name]) + available[name].Sub(*podUsage[name]) + } +} + // sortNodesByUsage sorts nodes based on usage according to the given plugin. func sortNodesByUsage(nodes []NodeInfo, ascending bool) { sort.Slice(nodes, func(i, j int) bool { @@ -503,8 +425,8 @@ func isNodeAboveTargetUtilization(usage NodeUsage, threshold api.ReferencedResou // isNodeAboveThreshold checks if a node is over a threshold // At least one resource has to be above the threshold func isNodeAboveThreshold(usage, threshold api.ResourceThresholds) bool { - for name, resourceValue := range usage { - if threshold[name] < resourceValue { + for name := range threshold { + if threshold[name] < usage[name] { return true } } @@ -514,8 +436,8 @@ func isNodeAboveThreshold(usage, threshold api.ResourceThresholds) bool { // isNodeBelowThreshold checks if a node is under a threshold // All resources have to be below the threshold func isNodeBelowThreshold(usage, threshold api.ResourceThresholds) bool { - for name, resourceValue := range usage { - if threshold[name] < resourceValue { + for name := range threshold { + if threshold[name] < usage[name] { return false } } @@ -531,6 +453,8 @@ func getResourceNames(thresholds api.ResourceThresholds) []v1.ResourceName { return resourceNames } +// classifyPods classify them in two lists: removable and non-removable. +// Removable pods are those that can be evicted. func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []*v1.Pod) { var nonRemovablePods, removablePods []*v1.Pod @@ -544,3 +468,282 @@ func classifyPods(pods []*v1.Pod, filter func(pod *v1.Pod) bool) ([]*v1.Pod, []* return nonRemovablePods, removablePods } + +// assessNodesUsagesAndStaticThresholds converts the raw usage data into +// percentage. Returns the usage (pct) and the thresholds (pct) for each +// node. 
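// Illustrative example (not part of this patch): with static thresholds every
// node receives the same pair of spans. For instance, for nodes "n1" and "n2"
// with lowSpan = {v1.ResourceCPU: 20} and highSpan = {v1.ResourceCPU: 50} the
// returned thresholds are:
//
//	map[string][]api.ResourceThresholds{
//		"n1": {{v1.ResourceCPU: 20}, {v1.ResourceCPU: 50}},
//		"n2": {{v1.ResourceCPU: 20}, {v1.ResourceCPU: 50}},
//	}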
+func assessNodesUsagesAndStaticThresholds(
+	rawUsages, rawCapacities map[string]api.ReferencedResourceList,
+	lowSpan, highSpan api.ResourceThresholds,
+) (map[string]api.ResourceThresholds, map[string][]api.ResourceThresholds) {
+	// first we normalize the node usage from the raw data (Mi, Gi, etc)
+	// into api.Percentage values.
+	usage := normalizer.Normalize(
+		rawUsages, rawCapacities, ResourceUsageToResourceThreshold,
+	)
+
+	// since we are not taking the average and applying deviations to it,
+	// we can simply replicate the same pair of thresholds across all
+	// nodes and return.
+	thresholds := normalizer.Replicate(
+		slices.Collect(maps.Keys(usage)),
+		[]api.ResourceThresholds{lowSpan, highSpan},
+	)
+	return usage, thresholds
+}
+
+// assessNodesUsagesAndRelativeThresholds converts the raw usage data into
+// percentage. Thresholds are calculated based on the average usage. Returns
+// the usage (pct) and the thresholds (pct) for each node.
+func assessNodesUsagesAndRelativeThresholds(
+	rawUsages, rawCapacities map[string]api.ReferencedResourceList,
+	lowSpan, highSpan api.ResourceThresholds,
+) (map[string]api.ResourceThresholds, map[string][]api.ResourceThresholds) {
+	// first we normalize the node usage from the raw data (Mi, Gi, etc)
+	// into api.Percentage values.
+	usage := normalizer.Normalize(
+		rawUsages, rawCapacities, ResourceUsageToResourceThreshold,
+	)
+
+	// calculate the average usage across all nodes.
+	average := normalizer.Average(usage)
+
+	// deviate the average according to the user provided spans and clamp
+	// the resulting thresholds to the <0;100> interval. this call also
+	// replicates the thresholds across all nodes.
+	thresholds := normalizer.Replicate(
+		slices.Collect(maps.Keys(usage)),
+		normalizer.Map(
+			[]api.ResourceThresholds{
+				normalizer.Sum(average, normalizer.Negate(lowSpan)),
+				normalizer.Sum(average, highSpan),
+			},
+			func(thresholds api.ResourceThresholds) api.ResourceThresholds {
+				return normalizer.Clamp(thresholds, 0, 100)
+			},
+		),
+	)
+
+	return usage, thresholds
+}
+
+// referencedResourceListForNodesCapacity returns a ReferencedResourceList for
+// the capacity of a list of nodes. If allocatable resources are present, they
+// are used instead of capacity.
+func referencedResourceListForNodesCapacity(nodes []*v1.Node) map[string]api.ReferencedResourceList {
+	capacities := map[string]api.ReferencedResourceList{}
+	for _, node := range nodes {
+		capacities[node.Name] = referencedResourceListForNodeCapacity(node)
+	}
+	return capacities
+}
+
+// referencedResourceListForNodeCapacity returns a ReferencedResourceList for
+// the capacity of a node. If allocatable resources are present, they are used
+// instead of capacity.
+func referencedResourceListForNodeCapacity(node *v1.Node) api.ReferencedResourceList {
+	capacity := node.Status.Capacity
+	if len(node.Status.Allocatable) > 0 {
+		capacity = node.Status.Allocatable
+	}
+
+	referenced := api.ReferencedResourceList{}
+	for name, quantity := range capacity {
+		referenced[name] = ptr.To(quantity)
+	}
+
+	// XXX the descheduler also manages monitoring queries that are
+	// supposed to return a value representing a percentage of the
+	// resource usage. In this case we need to provide a value for
+	// the MetricResource, which is not present in the node capacity.
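	// Illustrative note (not part of this patch): because the capacity
	// registered for MetricResource below is 100, the percentage
	// normalization (usage / capacity * 100) leaves metric values
	// unchanged, assuming the configured query already reports a
	// percentage: a reported value of 37 simply normalizes to 37%.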
+	referenced[MetricResource] = resource.NewQuantity(
+		100, resource.DecimalSI,
+	)
+
+	return referenced
+}
+
+// ResourceUsageToResourceThreshold is an implementation of a Normalizer that
+// converts a set of resource usages and totals into percentages. This function
+// operates on Quantity Value() for all the resources except CPU, where it uses
+// MilliValue().
+func ResourceUsageToResourceThreshold(
+	usages, totals api.ReferencedResourceList,
+) api.ResourceThresholds {
+	result := api.ResourceThresholds{}
+	for rname, value := range usages {
+		if value == nil || totals[rname] == nil {
+			continue
+		}
+
+		total := totals[rname]
+		used, capacity := value.Value(), total.Value()
+		if rname == v1.ResourceCPU {
+			used, capacity = value.MilliValue(), total.MilliValue()
+		}
+
+		var percent float64
+		if capacity > 0 {
+			percent = float64(used) / float64(capacity) * 100
+		}
+
+		result[rname] = api.Percentage(percent)
+	}
+	return result
+}
+
+// uniquifyResourceNames returns a slice of resource names with duplicates
+// removed. The basic resources (cpu, memory and pods) are always included.
+func uniquifyResourceNames(resourceNames []v1.ResourceName) []v1.ResourceName {
+	resourceNamesMap := map[v1.ResourceName]bool{
+		v1.ResourceCPU:    true,
+		v1.ResourceMemory: true,
+		v1.ResourcePods:   true,
+	}
+	for _, resourceName := range resourceNames {
+		resourceNamesMap[resourceName] = true
+	}
+	return slices.Collect(maps.Keys(resourceNamesMap))
+}
+
+// filterResourceNames removes from the node usage map all keys that are not
+// present in the resourceNames slice.
+func filterResourceNames(
+	from map[string]api.ReferencedResourceList, resourceNames []v1.ResourceName,
+) map[string]api.ReferencedResourceList {
+	newNodeUsage := make(map[string]api.ReferencedResourceList)
+	for nodeName, usage := range from {
+		newNodeUsage[nodeName] = api.ReferencedResourceList{}
+		for _, resourceName := range resourceNames {
+			if _, exists := usage[resourceName]; exists {
+				newNodeUsage[nodeName][resourceName] = usage[resourceName]
+			}
+		}
+	}
+	return newNodeUsage
+}
+
+// capNodeCapacitiesToThreshold caps the node capacities to the given
+// thresholds. if a threshold is not set for a resource, the full capacity is
+// returned.
+func capNodeCapacitiesToThreshold(
+	node *v1.Node,
+	thresholds api.ResourceThresholds,
+	resourceNames []v1.ResourceName,
+) api.ReferencedResourceList {
+	capped := api.ReferencedResourceList{}
+	for _, resourceName := range resourceNames {
+		capped[resourceName] = capNodeCapacityToThreshold(
+			node, thresholds, resourceName,
+		)
+	}
+	return capped
+}
+
+// capNodeCapacityToThreshold caps the node capacity to the given threshold. if
+// no threshold is set for the resource, the full capacity is returned.
+func capNodeCapacityToThreshold(
+	node *v1.Node, thresholds api.ResourceThresholds, resourceName v1.ResourceName,
+) *resource.Quantity {
+	capacities := referencedResourceListForNodeCapacity(node)
+	if _, ok := capacities[resourceName]; !ok {
+		// if the node knows nothing about the resource we return a
+		// zero capacity for it.
+		return resource.NewQuantity(0, resource.DecimalSI)
+	}
+
+	// if no threshold is set then we simply return the full capacity.
+	if _, ok := thresholds[resourceName]; !ok {
+		return capacities[resourceName]
+	}
+
+	// now that we have a capacity and a threshold we need to do the math
+	// to cap the former to the latter.
+	quantity := capacities[resourceName]
+	threshold := thresholds[resourceName]
+
+	// we have a different format for memory. all the other resources are
+	// in the DecimalSI format.
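	// Illustrative example (not part of this patch): the capping below is
	// plain percentage arithmetic. A node with 4 CPUs (4000m) and a
	// threshold of 20 for v1.ResourceCPU ends up with a capped capacity of
	// int64(20 * 0.01 * 4000) = 800, i.e. 800 millicores.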
+ format := resource.DecimalSI + if resourceName == v1.ResourceMemory { + format = resource.BinarySI + } + + // this is what we use to cap the capacity. thresholds are expected to + // be in the <0;100> interval. + fraction := func(threshold api.Percentage, capacity int64) int64 { + return int64(float64(threshold) * 0.01 * float64(capacity)) + } + + // here we also vary a little bit. milli is used for cpu, all the rest + // goes with the default. + if resourceName == v1.ResourceCPU { + return resource.NewMilliQuantity( + fraction(threshold, quantity.MilliValue()), + format, + ) + } + + return resource.NewQuantity( + fraction(threshold, quantity.Value()), + format, + ) +} + +// assessAvailableResourceInNodes computes the available resources in all the +// nodes. this is done by summing up all the available resources in all the +// nodes and then subtracting the usage from it. +func assessAvailableResourceInNodes( + nodes []NodeInfo, resources []v1.ResourceName, +) (api.ReferencedResourceList, error) { + // available holds a sum of all the resources that can be used to move + // pods around. e.g. the sum of all available cpu and memory in all + // cluster nodes. + available := api.ReferencedResourceList{} + for _, node := range nodes { + for _, resourceName := range resources { + if _, exists := node.usage[resourceName]; !exists { + return nil, fmt.Errorf( + "unable to find %s resource in node's %s usage, terminating eviction", + resourceName, node.node.Name, + ) + } + + // XXX this should never happen. we better bail out + // here than hard crash with a segfault. + if node.usage[resourceName] == nil { + return nil, fmt.Errorf( + "unable to find %s usage resources, terminating eviction", + resourceName, + ) + } + + // keep the current usage around so we can subtract it + // from the available resources. + usage := *node.usage[resourceName] + + // first time seeing this resource, initialize it. + if _, ok := available[resourceName]; !ok { + available[resourceName] = resource.NewQuantity( + 0, resource.DecimalSI, + ) + } + + // XXX this should never happen. we better bail out + // here than hard crash with a segfault. + if node.available[resourceName] == nil { + return nil, fmt.Errorf( + "unable to find %s available resources, terminating eviction", + resourceName, + ) + } + + // now we add the capacity and then subtract the usage. + available[resourceName].Add(*node.available[resourceName]) + available[resourceName].Sub(usage) + } + } + + return available, nil +} diff --git a/pkg/framework/plugins/nodeutilization/nodeutilization_test.go b/pkg/framework/plugins/nodeutilization/nodeutilization_test.go index 1a6141473..3404ada0e 100644 --- a/pkg/framework/plugins/nodeutilization/nodeutilization_test.go +++ b/pkg/framework/plugins/nodeutilization/nodeutilization_test.go @@ -17,7 +17,7 @@ limitations under the License. 
package nodeutilization import ( - "math" + "reflect" "testing" v1 "k8s.io/api/core/v1" @@ -56,45 +56,6 @@ var ( extendedResource = v1.ResourceName("example.com/foo") ) -func TestResourceUsagePercentages(t *testing.T) { - resourceUsagePercentage := resourceUsagePercentages( - api.ReferencedResourceList{ - v1.ResourceCPU: resource.NewMilliQuantity(1220, resource.DecimalSI), - v1.ResourceMemory: resource.NewQuantity(3038982964, resource.BinarySI), - v1.ResourcePods: resource.NewQuantity(11, resource.BinarySI), - }, - &v1.Node{ - Status: v1.NodeStatus{ - Capacity: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(3977868*1024, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI), - }, - Allocatable: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(1930, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(3287692*1024, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(29, resource.BinarySI), - }, - }, - }, - true, - ) - - expectedUsageInIntPercentage := map[v1.ResourceName]float64{ - v1.ResourceCPU: 63, - v1.ResourceMemory: 90, - v1.ResourcePods: 37, - } - - for resourceName, percentage := range expectedUsageInIntPercentage { - if math.Floor(float64(resourceUsagePercentage[resourceName])) != percentage { - t.Errorf("Incorrect percentange computation, expected %v, got math.Floor(%v) instead", percentage, resourceUsagePercentage[resourceName]) - } - } - - t.Logf("resourceUsagePercentage: %#v\n", resourceUsagePercentage) -} - func TestSortNodesByUsage(t *testing.T) { tests := []struct { name string @@ -173,3 +134,83 @@ func TestSortNodesByUsage(t *testing.T) { }) } } + +func TestResourceUsageToResourceThreshold(t *testing.T) { + for _, tt := range []struct { + name string + usage api.ReferencedResourceList + capacity api.ReferencedResourceList + expected api.ResourceThresholds + }{ + { + name: "10 percent", + usage: api.ReferencedResourceList{ + v1.ResourceCPU: resource.NewMilliQuantity(100, resource.DecimalSI), + }, + capacity: api.ReferencedResourceList{ + v1.ResourceCPU: resource.NewMilliQuantity(1000, resource.DecimalSI), + }, + expected: api.ResourceThresholds{v1.ResourceCPU: 10}, + }, + { + name: "zeroed out capacity", + usage: api.ReferencedResourceList{ + v1.ResourceCPU: resource.NewMilliQuantity(100, resource.DecimalSI), + }, + capacity: api.ReferencedResourceList{ + v1.ResourceCPU: resource.NewMilliQuantity(0, resource.DecimalSI), + }, + expected: api.ResourceThresholds{v1.ResourceCPU: 0}, + }, + { + name: "non existing usage", + usage: api.ReferencedResourceList{ + "does-not-exist": resource.NewMilliQuantity(100, resource.DecimalSI), + }, + capacity: api.ReferencedResourceList{ + v1.ResourceCPU: resource.NewMilliQuantity(100, resource.DecimalSI), + v1.ResourceMemory: resource.NewMilliQuantity(100, resource.DecimalSI), + }, + expected: api.ResourceThresholds{}, + }, + { + name: "existing and non existing usage", + usage: api.ReferencedResourceList{ + "does-not-exist": resource.NewMilliQuantity(100, resource.DecimalSI), + v1.ResourceCPU: resource.NewMilliQuantity(200, resource.DecimalSI), + }, + capacity: api.ReferencedResourceList{ + v1.ResourceCPU: resource.NewMilliQuantity(1000, resource.DecimalSI), + v1.ResourceMemory: resource.NewMilliQuantity(1000, resource.DecimalSI), + }, + expected: api.ResourceThresholds{v1.ResourceCPU: 20}, + }, + { + name: "nil usage", + usage: api.ReferencedResourceList{ + v1.ResourceCPU: nil, + }, + capacity: 
api.ReferencedResourceList{ + v1.ResourceCPU: resource.NewMilliQuantity(1000, resource.DecimalSI), + }, + expected: api.ResourceThresholds{}, + }, + { + name: "nil capacity", + usage: api.ReferencedResourceList{ + v1.ResourceCPU: resource.NewMilliQuantity(100, resource.DecimalSI), + }, + capacity: api.ReferencedResourceList{ + v1.ResourceCPU: nil, + }, + expected: api.ResourceThresholds{}, + }, + } { + t.Run(tt.name, func(t *testing.T) { + result := ResourceUsageToResourceThreshold(tt.usage, tt.capacity) + if !reflect.DeepEqual(result, tt.expected) { + t.Errorf("Expected %v, got %v", tt.expected, result) + } + }) + } +} diff --git a/pkg/framework/plugins/nodeutilization/normalizer/normalizer.go b/pkg/framework/plugins/nodeutilization/normalizer/normalizer.go new file mode 100644 index 000000000..294a6cd39 --- /dev/null +++ b/pkg/framework/plugins/nodeutilization/normalizer/normalizer.go @@ -0,0 +1,142 @@ +/* +Copyright 2025 The Kubernetes Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package normalizer + +import ( + "math" + + "golang.org/x/exp/constraints" +) + +// Normalizer is a function that receives two values of the same type and +// return an object of a different type. An usage case can be a function +// that converts a memory usage from mb to % (the first argument would be +// the memory usage in mb and the second argument would be the total memory +// available in mb). +type Normalizer[V, N any] func(V, V) N + +// Values is a map of values indexed by a comparable key. An example of this +// can be a list of resources indexed by a node name. +type Values[K comparable, V any] map[K]V + +// Number is an interface that represents a number. Represents things we +// can do math operations on. +type Number interface { + constraints.Integer | constraints.Float +} + +// Normalize uses a Normalizer function to normalize a set of values. For +// example one may want to convert a set of memory usages from mb to %. +// This function receives a set of usages, a set of totals, and a Normalizer +// function. The function will return a map with the normalized values. +func Normalize[K comparable, V, N any](usages, totals Values[K, V], fn Normalizer[V, N]) map[K]N { + result := Values[K, N]{} + for key, value := range usages { + total, ok := totals[key] + if !ok { + continue + } + result[key] = fn(value, total) + } + return result +} + +// Replicate replicates the provide value for each key in the provided slice. +// Returns a map with the keys and the provided value. +func Replicate[K comparable, V any](keys []K, value V) map[K]V { + result := map[K]V{} + for _, key := range keys { + result[key] = value + } + return result +} + +// Clamp imposes minimum and maximum limits on a set of values. The function +// will return a set of values where each value is between the minimum and +// maximum values (included). Values below minimum are rounded up to the +// minimum value, and values above maximum are rounded down to the maximum +// value. 
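// Illustrative example (not part of this patch):
//
//	Clamp(api.ResourceThresholds{v1.ResourceCPU: -5, v1.ResourceMemory: 120}, 0, 100)
//
// returns api.ResourceThresholds{v1.ResourceCPU: 0, v1.ResourceMemory: 100},
// which is how the relative-thresholds path keeps deviated averages inside the
// <0;100> interval.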
+func Clamp[K comparable, N Number, V ~map[K]N](values V, minimum, maximum N) V {
+	result := V{}
+	for key := range values {
+		value := values[key]
+		value = N(math.Max(float64(value), float64(minimum)))
+		value = N(math.Min(float64(value), float64(maximum)))
+		result[key] = value
+	}
+	return result
+}
+
+// Map applies a function to each map in a slice of maps. Returns a new slice
+// with the results of applying the function to each map.
+func Map[K comparable, N Number, V ~map[K]N](items []V, fn func(V) V) []V {
+	result := []V{}
+	for _, item := range items {
+		result = append(result, fn(item))
+	}
+	return result
+}
+
+// Negate converts the values of a map to their negated values.
+func Negate[K comparable, N Number, V ~map[K]N](values V) V {
+	result := V{}
+	for key, value := range values {
+		result[key] = -value
+	}
+	return result
+}
+
+// Round rounds the values of a map to the nearest integer. Calls math.Round on
+// each value of the map.
+func Round[K comparable, N Number, V ~map[K]N](values V) V {
+	result := V{}
+	for key, value := range values {
+		result[key] = N(math.Round(float64(value)))
+	}
+	return result
+}
+
+// Sum adds up, key by key, the values of two maps. Values are expected to be
+// of Number type. The original maps are preserved. Keys present only in the
+// first map keep their original value; keys present only in the second map
+// are ignored.
+func Sum[K comparable, N Number, V ~map[K]N](mapA, mapB V) V {
+	result := V{}
+	for name, value := range mapA {
+		result[name] = value + mapB[name]
+	}
+	return result
+}
+
+// Average calculates the per-key average of a map of maps, e.g. the average
+// usage of each resource across a set of nodes. Average expects the values to
+// represent the same unit of measure. You can use this function after
+// Normalizing the values.
+func Average[J, K comparable, N Number, V ~map[J]N](values map[K]V) V {
+	counter := map[J]int{}
+	result := V{}
+	for _, imap := range values {
+		for name, value := range imap {
+			result[name] += value
+			counter[name]++
+		}
+	}
+
+	for name := range result {
+		result[name] /= N(counter[name])
+	}
+
+	return result
+}
diff --git a/pkg/framework/plugins/nodeutilization/normalizer/normalizer_test.go b/pkg/framework/plugins/nodeutilization/normalizer/normalizer_test.go
new file mode 100644
index 000000000..7af17b6dc
--- /dev/null
+++ b/pkg/framework/plugins/nodeutilization/normalizer/normalizer_test.go
@@ -0,0 +1,649 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package normalizer + +import ( + "fmt" + "math" + "reflect" + "testing" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/descheduler/pkg/api" +) + +func ResourceListUsageNormalizer(usages, totals v1.ResourceList) api.ResourceThresholds { + result := api.ResourceThresholds{} + for rname, value := range usages { + total, ok := totals[rname] + if !ok { + continue + } + + used, avail := value.Value(), total.Value() + if rname == v1.ResourceCPU { + used, avail = value.MilliValue(), total.MilliValue() + } + + pct := math.Max(math.Min(float64(used)/float64(avail)*100, 100), 0) + result[rname] = api.Percentage(pct) + } + return result +} + +func TestNormalizeSimple(t *testing.T) { + for _, tt := range []struct { + name string + usages map[string]float64 + totals map[string]float64 + expected map[string]float64 + normalizer Normalizer[float64, float64] + }{ + { + name: "single normalization", + usages: map[string]float64{"cpu": 1}, + totals: map[string]float64{"cpu": 2}, + expected: map[string]float64{"cpu": 0.5}, + normalizer: func(usage, total float64) float64 { + return usage / total + }, + }, + { + name: "multiple normalizations", + usages: map[string]float64{ + "cpu": 1, + "mem": 6, + }, + totals: map[string]float64{ + "cpu": 2, + "mem": 10, + }, + expected: map[string]float64{ + "cpu": 0.5, + "mem": 0.6, + }, + normalizer: func(usage, total float64) float64 { + return usage / total + }, + }, + { + name: "missing totals for a key", + usages: map[string]float64{ + "cpu": 1, + "mem": 6, + }, + totals: map[string]float64{ + "cpu": 2, + }, + expected: map[string]float64{ + "cpu": 0.5, + }, + normalizer: func(usage, total float64) float64 { + return usage / total + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + result := Normalize(tt.usages, tt.totals, tt.normalizer) + if !reflect.DeepEqual(result, tt.expected) { + t.Fatalf("unexpected result: %v", result) + } + }) + } +} + +func TestNormalize(t *testing.T) { + for _, tt := range []struct { + name string + usages map[string]v1.ResourceList + totals map[string]v1.ResourceList + expected map[string]api.ResourceThresholds + normalizer Normalizer[v1.ResourceList, api.ResourceThresholds] + }{ + { + name: "single normalization", + usages: map[string]v1.ResourceList{ + "node1": {v1.ResourceCPU: resource.MustParse("1")}, + }, + totals: map[string]v1.ResourceList{ + "node1": {v1.ResourceCPU: resource.MustParse("2")}, + }, + expected: map[string]api.ResourceThresholds{ + "node1": {v1.ResourceCPU: 50}, + }, + normalizer: ResourceListUsageNormalizer, + }, + { + name: "multiple normalization", + usages: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("6"), + v1.ResourcePods: resource.MustParse("2"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("10"), + v1.ResourceMemory: resource.MustParse("20"), + v1.ResourcePods: resource.MustParse("30"), + }, + }, + totals: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("2"), + v1.ResourceMemory: resource.MustParse("6"), + v1.ResourcePods: resource.MustParse("100"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("100"), + v1.ResourceMemory: resource.MustParse("100"), + v1.ResourcePods: resource.MustParse("100"), + }, + }, + expected: map[string]api.ResourceThresholds{ + "node1": { + v1.ResourceCPU: 50, + v1.ResourceMemory: 100, + v1.ResourcePods: 2, + }, + "node2": { + v1.ResourceCPU: 10, + v1.ResourceMemory: 20, + v1.ResourcePods: 30, + }, + }, + 
normalizer: ResourceListUsageNormalizer, + }, + { + name: "multiple normalization with over 100% usage", + usages: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("120"), + v1.ResourceMemory: resource.MustParse("130"), + v1.ResourcePods: resource.MustParse("140"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("150"), + v1.ResourceMemory: resource.MustParse("160"), + v1.ResourcePods: resource.MustParse("170"), + }, + }, + totals: Replicate( + []string{"node1", "node2"}, + v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100"), + v1.ResourceMemory: resource.MustParse("100"), + v1.ResourcePods: resource.MustParse("100"), + }, + ), + expected: Replicate( + []string{"node1", "node2"}, + api.ResourceThresholds{ + v1.ResourceCPU: 100, + v1.ResourceMemory: 100, + v1.ResourcePods: 100, + }, + ), + normalizer: ResourceListUsageNormalizer, + }, + { + name: "multiple normalization with over 100% usage and different totals", + usages: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("2"), + v1.ResourceMemory: resource.MustParse("2Gi"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("99"), + v1.ResourceMemory: resource.MustParse("99Gi"), + }, + "node3": { + v1.ResourceCPU: resource.MustParse("8"), + v1.ResourceMemory: resource.MustParse("8Gi"), + }, + }, + totals: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("4Gi"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("100"), + v1.ResourceMemory: resource.MustParse("100Gi"), + }, + "node3": { + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("4Gi"), + }, + }, + expected: map[string]api.ResourceThresholds{ + "node1": { + v1.ResourceCPU: 50, + v1.ResourceMemory: 50, + }, + "node2": { + v1.ResourceCPU: 99, + v1.ResourceMemory: 99, + }, + "node3": { + v1.ResourceCPU: 100, + v1.ResourceMemory: 100, + }, + }, + normalizer: ResourceListUsageNormalizer, + }, + } { + t.Run(tt.name, func(t *testing.T) { + result := Normalize(tt.usages, tt.totals, tt.normalizer) + if !reflect.DeepEqual(result, tt.expected) { + t.Fatalf("unexpected result: %v", result) + } + }) + } +} + +func TestAverage(t *testing.T) { + for _, tt := range []struct { + name string + usage map[string]v1.ResourceList + limits map[string]v1.ResourceList + expected api.ResourceThresholds + }{ + { + name: "empty usage", + usage: map[string]v1.ResourceList{}, + limits: map[string]v1.ResourceList{}, + expected: api.ResourceThresholds{}, + }, + { + name: "fifty percent usage", + usage: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("1"), + v1.ResourceMemory: resource.MustParse("6"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("2"), + v1.ResourceMemory: resource.MustParse("6"), + }, + }, + limits: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("2"), + v1.ResourceMemory: resource.MustParse("12"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("4"), + v1.ResourceMemory: resource.MustParse("12"), + }, + }, + expected: api.ResourceThresholds{ + v1.ResourceCPU: 50, + v1.ResourceMemory: 50, + }, + }, + { + name: "mixed percent usage", + usage: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("10"), + v1.ResourceMemory: resource.MustParse("80"), + v1.ResourcePods: resource.MustParse("20"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("20"), + v1.ResourceMemory: resource.MustParse("60"), + 
v1.ResourcePods: resource.MustParse("20"), + }, + }, + limits: Replicate( + []string{"node1", "node2"}, + v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100"), + v1.ResourceMemory: resource.MustParse("100"), + v1.ResourcePods: resource.MustParse("10000"), + }, + ), + expected: api.ResourceThresholds{ + v1.ResourceCPU: 15, + v1.ResourceMemory: 70, + v1.ResourcePods: 0.2, + }, + }, + { + name: "mixed limits", + usage: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("10"), + v1.ResourceMemory: resource.MustParse("30"), + v1.ResourcePods: resource.MustParse("200"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("10"), + v1.ResourceMemory: resource.MustParse("72"), + v1.ResourcePods: resource.MustParse("200"), + }, + }, + limits: map[string]v1.ResourceList{ + "node1": { + v1.ResourceCPU: resource.MustParse("10"), + v1.ResourceMemory: resource.MustParse("100"), + v1.ResourcePods: resource.MustParse("1000"), + }, + "node2": { + v1.ResourceCPU: resource.MustParse("1000"), + v1.ResourceMemory: resource.MustParse("180"), + v1.ResourcePods: resource.MustParse("10"), + }, + }, + expected: api.ResourceThresholds{ + v1.ResourceCPU: 50.5, + v1.ResourceMemory: 35, + v1.ResourcePods: 60, + }, + }, + { + name: "some nodes missing some resources", + usage: map[string]v1.ResourceList{ + "node1": { + "limit-exists-in-all": resource.MustParse("10"), + "limit-exists-in-two": resource.MustParse("11"), + "limit-does-not-exist": resource.MustParse("12"), + "usage-exists-in-all": resource.MustParse("13"), + "usage-exists-in-two": resource.MustParse("20"), + }, + "node2": { + "limit-exists-in-all": resource.MustParse("10"), + "limit-exists-in-two": resource.MustParse("11"), + "limit-does-not-exist": resource.MustParse("12"), + "usage-exists-in-all": resource.MustParse("13"), + "usage-exists-in-two": resource.MustParse("20"), + }, + "node3": { + "limit-exists-in-all": resource.MustParse("10"), + "limit-exists-in-two": resource.MustParse("11"), + "limit-does-not-exist": resource.MustParse("12"), + "usage-exists-in-all": resource.MustParse("13"), + }, + "node4": { + "limit-exists-in-all": resource.MustParse("10"), + "limit-exists-in-two": resource.MustParse("11"), + "limit-does-not-exist": resource.MustParse("12"), + "usage-exists-in-all": resource.MustParse("13"), + }, + "node5": { + "random-usage-without-limit": resource.MustParse("10"), + }, + }, + limits: map[string]v1.ResourceList{ + "node1": { + "limit-exists-in-all": resource.MustParse("100"), + "limit-exists-in-two": resource.MustParse("100"), + "usage-exists-in-all": resource.MustParse("100"), + "usage-exists-in-two": resource.MustParse("100"), + "usage-does-not-exist": resource.MustParse("100"), + }, + "node2": { + "limit-exists-in-all": resource.MustParse("100"), + "limit-exists-in-two": resource.MustParse("100"), + "usage-exists-in-all": resource.MustParse("100"), + "usage-exists-in-two": resource.MustParse("100"), + "usage-does-not-exist": resource.MustParse("100"), + }, + "node3": { + "limit-exists-in-all": resource.MustParse("100"), + "usage-exists-in-all": resource.MustParse("100"), + "usage-exists-in-two": resource.MustParse("100"), + "usage-does-not-exist": resource.MustParse("100"), + }, + "node4": { + "limit-exists-in-all": resource.MustParse("100"), + "usage-exists-in-all": resource.MustParse("100"), + "usage-exists-in-two": resource.MustParse("100"), + "usage-does-not-exist": resource.MustParse("100"), + }, + "node5": { + "random-limit-without-usage": resource.MustParse("100"), + }, + }, + expected: 
api.ResourceThresholds{ + "limit-exists-in-all": 10, + "limit-exists-in-two": 11, + "usage-exists-in-all": 13, + "usage-exists-in-two": 20, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + average := Average( + Normalize( + tt.usage, tt.limits, ResourceListUsageNormalizer, + ), + ) + if !reflect.DeepEqual(average, tt.expected) { + t.Fatalf("unexpected result: %v, expected: %v", average, tt.expected) + } + }) + } +} + +func TestSum(t *testing.T) { + for _, tt := range []struct { + name string + data api.ResourceThresholds + deviations []api.ResourceThresholds + expected []api.ResourceThresholds + }{ + { + name: "single deviation", + data: api.ResourceThresholds{ + v1.ResourceCPU: 50, + v1.ResourceMemory: 50, + v1.ResourcePods: 50, + }, + deviations: []api.ResourceThresholds{ + { + v1.ResourceCPU: 1, + v1.ResourceMemory: 1, + v1.ResourcePods: 1, + }, + { + v1.ResourceCPU: 2, + v1.ResourceMemory: 2, + v1.ResourcePods: 2, + }, + { + v1.ResourceCPU: 3, + v1.ResourceMemory: 3, + v1.ResourcePods: 3, + }, + }, + expected: []api.ResourceThresholds{ + { + v1.ResourceCPU: 51, + v1.ResourceMemory: 51, + v1.ResourcePods: 51, + }, + { + v1.ResourceCPU: 52, + v1.ResourceMemory: 52, + v1.ResourcePods: 52, + }, + { + v1.ResourceCPU: 53, + v1.ResourceMemory: 53, + v1.ResourcePods: 53, + }, + }, + }, + { + name: "deviate with negative values", + data: api.ResourceThresholds{ + v1.ResourceCPU: 50, + v1.ResourceMemory: 50, + v1.ResourcePods: 50, + }, + deviations: []api.ResourceThresholds{ + { + v1.ResourceCPU: -2, + v1.ResourceMemory: -2, + v1.ResourcePods: -2, + }, + { + v1.ResourceCPU: -1, + v1.ResourceMemory: -1, + v1.ResourcePods: -1, + }, + { + v1.ResourceCPU: 0, + v1.ResourceMemory: 0, + v1.ResourcePods: 0, + }, + { + v1.ResourceCPU: 1, + v1.ResourceMemory: 1, + v1.ResourcePods: 1, + }, + { + v1.ResourceCPU: 2, + v1.ResourceMemory: 2, + v1.ResourcePods: 2, + }, + }, + expected: []api.ResourceThresholds{ + { + v1.ResourceCPU: 48, + v1.ResourceMemory: 48, + v1.ResourcePods: 48, + }, + { + v1.ResourceCPU: 49, + v1.ResourceMemory: 49, + v1.ResourcePods: 49, + }, + { + v1.ResourceCPU: 50, + v1.ResourceMemory: 50, + v1.ResourcePods: 50, + }, + { + v1.ResourceCPU: 51, + v1.ResourceMemory: 51, + v1.ResourcePods: 51, + }, + { + v1.ResourceCPU: 52, + v1.ResourceMemory: 52, + v1.ResourcePods: 52, + }, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + result := []api.ResourceThresholds{} + for _, deviation := range tt.deviations { + partial := Sum(tt.data, deviation) + result = append(result, partial) + } + + if len(result) != len(tt.deviations) { + t.Fatalf("unexpected result: %v", result) + } + if !reflect.DeepEqual(result, tt.expected) { + fmt.Printf("%T, %T\n", result, tt.expected) + t.Fatalf("unexpected result: %v", result) + } + }) + } +} + +func TestClamp(t *testing.T) { + for _, tt := range []struct { + name string + data []api.ResourceThresholds + minimum api.Percentage + maximum api.Percentage + expected []api.ResourceThresholds + }{ + { + name: "all over the limit", + data: []api.ResourceThresholds{ + { + v1.ResourceCPU: 50, + v1.ResourceMemory: 50, + v1.ResourcePods: 50, + }, + }, + minimum: 10, + maximum: 20, + expected: []api.ResourceThresholds{ + { + v1.ResourceCPU: 20, + v1.ResourceMemory: 20, + v1.ResourcePods: 20, + }, + }, + }, + { + name: "some over some below the limits", + data: []api.ResourceThresholds{ + { + v1.ResourceCPU: 7, + v1.ResourceMemory: 8, + v1.ResourcePods: 88, + }, + }, + minimum: 10, + maximum: 20, + expected: []api.ResourceThresholds{ + { + v1.ResourceCPU: 10, + 
v1.ResourceMemory: 10, + v1.ResourcePods: 20, + }, + }, + }, + { + name: "all within the limits", + data: []api.ResourceThresholds{ + { + v1.ResourceCPU: 15, + v1.ResourceMemory: 15, + v1.ResourcePods: 15, + }, + }, + minimum: 10, + maximum: 20, + expected: []api.ResourceThresholds{ + { + v1.ResourceCPU: 15, + v1.ResourceMemory: 15, + v1.ResourcePods: 15, + }, + }, + }, + } { + t.Run(tt.name, func(t *testing.T) { + fn := func(thresholds api.ResourceThresholds) api.ResourceThresholds { + return Clamp(thresholds, tt.minimum, tt.maximum) + } + result := Map(tt.data, fn) + if !reflect.DeepEqual(result, tt.expected) { + t.Fatalf("unexpected result: %v", result) + } + }) + } +}
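// Illustrative sketch (not part of this patch): the normalizer package above
// is generic, so it can be exercised with plain float64 maps. The program
// below strings the helpers together the same way
// assessNodesUsagesAndRelativeThresholds does (Normalize -> Average ->
// Sum/Negate -> Clamp -> Replicate). The "thresholds" type is a stand-in for
// api.ResourceThresholds so the example stays self-contained; the import path
// is the one introduced by this patch.
package main

import (
	"fmt"

	"sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization/normalizer"
)

// thresholds maps a resource name to a percentage value.
type thresholds map[string]float64

func main() {
	// per-node usage and capacity in arbitrary absolute units.
	usages := map[string]thresholds{
		"node1": {"cpu": 1000, "memory": 2},
		"node2": {"cpu": 3000, "memory": 6},
	}
	capacities := map[string]thresholds{
		"node1": {"cpu": 4000, "memory": 8},
		"node2": {"cpu": 4000, "memory": 8},
	}

	// convert absolute usage into percentages, node by node.
	usagePct := normalizer.Normalize(usages, capacities,
		func(usage, total thresholds) thresholds {
			out := thresholds{}
			for name, value := range usage {
				if capacity, ok := total[name]; ok && capacity > 0 {
					out[name] = value / capacity * 100
				}
			}
			return out
		},
	)

	// average the usage across nodes, deviate it by the user provided
	// spans and clamp the result to the <0;100> interval.
	average := normalizer.Average(usagePct)
	lowSpan := thresholds{"cpu": 10, "memory": 10}
	highSpan := thresholds{"cpu": 10, "memory": 10}
	low := normalizer.Clamp(normalizer.Sum(average, normalizer.Negate(lowSpan)), 0, 100)
	high := normalizer.Clamp(normalizer.Sum(average, highSpan), 0, 100)

	// replicate the same pair of thresholds across all nodes.
	perNode := normalizer.Replicate([]string{"node1", "node2"}, []thresholds{low, high})

	fmt.Println(usagePct) // node1: cpu=25 memory=25, node2: cpu=75 memory=75
	fmt.Println(average)  // cpu=50 memory=50
	fmt.Println(perNode)  // both nodes: [cpu=40 memory=40, cpu=60 memory=60]
}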