diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go
index 9ca71f5e0..eaa403f24 100644
--- a/pkg/descheduler/descheduler.go
+++ b/pkg/descheduler/descheduler.go
@@ -21,11 +21,20 @@ import (
 	"fmt"
 
 	v1 "k8s.io/api/core/v1"
+	policy "k8s.io/api/policy/v1beta1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/informers"
 	clientset "k8s.io/client-go/kubernetes"
+	fakeclientset "k8s.io/client-go/kubernetes/fake"
+	core "k8s.io/client-go/testing"
 	"k8s.io/klog/v2"
 
+	corev1informers "k8s.io/client-go/informers/core/v1"
+	schedulingv1informers "k8s.io/client-go/informers/scheduling/v1"
+
 	"sigs.k8s.io/descheduler/cmd/descheduler/app/options"
 	"sigs.k8s.io/descheduler/metrics"
 	"sigs.k8s.io/descheduler/pkg/api"
@@ -67,10 +76,92 @@ func Run(rs *options.DeschedulerServer) error {
 
 type strategyFunction func(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc)
 
+func cachedClient(
+	realClient clientset.Interface,
+	podInformer corev1informers.PodInformer,
+	nodeInformer corev1informers.NodeInformer,
+	namespaceInformer corev1informers.NamespaceInformer,
+	priorityClassInformer schedulingv1informers.PriorityClassInformer,
+) (clientset.Interface, error) {
+	fakeClient := fakeclientset.NewSimpleClientset()
+	// simulate a pod eviction by deleting a pod
+	fakeClient.PrependReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) {
+		if action.GetSubresource() == "eviction" {
+			createAct, matched := action.(core.CreateActionImpl)
+			if !matched {
+				return false, nil, fmt.Errorf("unable to convert action to core.CreateActionImpl")
+			}
+			eviction, matched := createAct.Object.(*policy.Eviction)
+			if !matched {
+				return false, nil, fmt.Errorf("unable to convert action object into *policy.Eviction")
+			}
+			if err := fakeClient.Tracker().Delete(action.GetResource(), eviction.GetNamespace(), eviction.GetName()); err != nil {
+				return false, nil, fmt.Errorf("unable to delete pod %v/%v: %v", eviction.GetNamespace(), eviction.GetName(), err)
+			}
+			return true, nil, nil
+		}
+		// fall back to the default reactor
+		return false, nil, nil
+	})
+
+	klog.V(3).Infof("Pulling resources for the cached client from the cluster")
+	pods, err := podInformer.Lister().List(labels.Everything())
+	if err != nil {
+		return nil, fmt.Errorf("unable to list pods: %v", err)
+	}
+
+	for _, item := range pods {
+		if _, err := fakeClient.CoreV1().Pods(item.Namespace).Create(context.TODO(), item, metav1.CreateOptions{}); err != nil {
+			return nil, fmt.Errorf("unable to copy pod: %v", err)
+		}
+	}
+
+	nodes, err := nodeInformer.Lister().List(labels.Everything())
+	if err != nil {
+		return nil, fmt.Errorf("unable to list nodes: %v", err)
+	}
+
+	for _, item := range nodes {
+		if _, err := fakeClient.CoreV1().Nodes().Create(context.TODO(), item, metav1.CreateOptions{}); err != nil {
+			return nil, fmt.Errorf("unable to copy node: %v", err)
+		}
+	}
+
+	namespaces, err := namespaceInformer.Lister().List(labels.Everything())
+	if err != nil {
+		return nil, fmt.Errorf("unable to list namespaces: %v", err)
+	}
+
+	for _, item := range namespaces {
+		if _, err := fakeClient.CoreV1().Namespaces().Create(context.TODO(), item, metav1.CreateOptions{}); err != nil {
+			return nil, fmt.Errorf("unable to copy namespace: %v", err)
+		}
+	}
+
+	priorityClasses, err := priorityClassInformer.Lister().List(labels.Everything())
+	if err != nil {
+		return nil, fmt.Errorf("unable to list priorityclasses: %v", err)
+	}
+
+	for _, item := range priorityClasses {
+		if _, err := fakeClient.SchedulingV1().PriorityClasses().Create(context.TODO(), item, metav1.CreateOptions{}); err != nil {
+			return nil, fmt.Errorf("unable to copy priorityclass: %v", err)
+		}
+	}
+
+	return fakeClient, nil
+}
+
 func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer, deschedulerPolicy *api.DeschedulerPolicy, evictionPolicyGroupVersion string, stopChannel chan struct{}) error {
 	sharedInformerFactory := informers.NewSharedInformerFactory(rs.Client, 0)
 	nodeInformer := sharedInformerFactory.Core().V1().Nodes()
 	podInformer := sharedInformerFactory.Core().V1().Pods()
+	namespaceInformer := sharedInformerFactory.Core().V1().Namespaces()
+	priorityClassInformer := sharedInformerFactory.Scheduling().V1().PriorityClasses()
+
+	// create the informers
+	namespaceInformer.Informer()
+	priorityClassInformer.Informer()
 
 	getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
 	if err != nil {
@@ -138,8 +229,39 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
 			return
 		}
 
+		var podEvictorClient clientset.Interface
+		// When dry run mode is enabled, collect all the relevant objects (mostly pods) under a fake client.
+		// That way, evicting pods while running multiple strategies in a row has the same cumulative effect
+		// as evicting pods for real.
+		if rs.DryRun {
+			klog.V(3).Infof("Building a cached client from the cluster for the dry run")
+			// Create a new cache so we start from scratch without any leftovers
+			fakeClient, err := cachedClient(rs.Client, podInformer, nodeInformer, namespaceInformer, priorityClassInformer)
+			if err != nil {
+				klog.Error(err)
+				return
+			}
+
+			fakeSharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
+			getPodsAssignedToNode, err = podutil.BuildGetPodsAssignedToNodeFunc(fakeSharedInformerFactory.Core().V1().Pods())
+			if err != nil {
+				klog.Errorf("build get pods assigned to node function error: %v", err)
+				return
+			}
+
+			fakeCtx, cncl := context.WithCancel(context.TODO())
+			defer cncl()
+			fakeSharedInformerFactory.Start(fakeCtx.Done())
+			fakeSharedInformerFactory.WaitForCacheSync(fakeCtx.Done())
+
+			podEvictorClient = fakeClient
+		} else {
+			podEvictorClient = rs.Client
+		}
+
+		klog.V(3).Infof("Building a pod evictor")
 		podEvictor := evictions.NewPodEvictor(
-			rs.Client,
+			podEvictorClient,
 			evictionPolicyGroupVersion,
 			rs.DryRun,
 			deschedulerPolicy.MaxNoOfPodsToEvictPerNode,
@@ -149,6 +271,7 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
 			evictSystemCriticalPods,
 			ignorePvcPods,
 			evictBarePods,
+			!rs.DisableMetrics,
 		)
 
 		for name, strategy := range deschedulerPolicy.Strategies {
diff --git a/pkg/descheduler/evictions/evictions.go b/pkg/descheduler/evictions/evictions.go
index d1d0380b7..dc2300417 100644
--- a/pkg/descheduler/evictions/evictions.go
+++ b/pkg/descheduler/evictions/evictions.go
@@ -61,6 +61,7 @@ type PodEvictor struct {
 	evictLocalStoragePods   bool
 	evictSystemCriticalPods bool
 	ignorePvcPods           bool
+	metricsEnabled          bool
 }
 
 func NewPodEvictor(
@@ -74,6 +75,7 @@
 	evictSystemCriticalPods bool,
 	ignorePvcPods bool,
 	evictFailedBarePods bool,
+	metricsEnabled bool,
 ) *PodEvictor {
 	var nodePodCount = make(nodePodEvictedCount)
 	var namespacePodCount = make(namespacePodEvictCount)
@@ -95,6 +97,7 @@
 		evictSystemCriticalPods: evictSystemCriticalPods,
 		evictFailedBarePods:     evictFailedBarePods,
 		ignorePvcPods:           ignorePvcPods,
+		metricsEnabled:          metricsEnabled,
 	}
 }
 
@@ -121,20 +124,26 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, node *v1.Node,
 		reason += " (" + strings.Join(reasons, ", ") + ")"
 	}
 
 	if pe.maxPodsToEvictPerNode != nil && pe.nodepodCount[node]+1 > *pe.maxPodsToEvictPerNode {
-		metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+		if pe.metricsEnabled {
+			metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per node reached", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+		}
 		return false, fmt.Errorf("Maximum number %v of evicted pods per %q node reached", *pe.maxPodsToEvictPerNode, node.Name)
 	}
 	if pe.maxPodsToEvictPerNamespace != nil && pe.namespacePodCount[pod.Namespace]+1 > *pe.maxPodsToEvictPerNamespace {
-		metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+		if pe.metricsEnabled {
+			metrics.PodsEvicted.With(map[string]string{"result": "maximum number of pods per namespace reached", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+		}
 		return false, fmt.Errorf("Maximum number %v of evicted pods per %q namespace reached", *pe.maxPodsToEvictPerNamespace, pod.Namespace)
 	}
 
-	err := evictPod(ctx, pe.client, pod, pe.policyGroupVersion, pe.dryRun)
+	err := evictPod(ctx, pe.client, pod, pe.policyGroupVersion)
 	if err != nil {
 		// err is used only for logging purposes
 		klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod), "reason", reason)
-		metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+		if pe.metricsEnabled {
+			metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+		}
 		return false, nil
 	}
 
@@ -149,15 +158,14 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, node *v1.Node,
 		eventBroadcaster.StartRecordingToSink(&clientcorev1.EventSinkImpl{Interface: pe.client.CoreV1().Events(pod.Namespace)})
 		r := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "sigs.k8s.io.descheduler"})
 		r.Event(pod, v1.EventTypeNormal, "Descheduled", fmt.Sprintf("pod evicted by sigs.k8s.io/descheduler%s", reason))
-		metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+		if pe.metricsEnabled {
+			metrics.PodsEvicted.With(map[string]string{"result": "success", "strategy": strategy, "namespace": pod.Namespace, "node": node.Name}).Inc()
+		}
 	}
 
 	return true, nil
 }
 
-func evictPod(ctx context.Context, client clientset.Interface, pod *v1.Pod, policyGroupVersion string, dryRun bool) error {
-	if dryRun {
-		return nil
-	}
+func evictPod(ctx context.Context, client clientset.Interface, pod *v1.Pod, policyGroupVersion string) error {
 	deleteOptions := &metav1.DeleteOptions{}
 	// GracePeriodSeconds ?
 	eviction := &policy.Eviction{
diff --git a/pkg/descheduler/evictions/evictions_test.go b/pkg/descheduler/evictions/evictions_test.go
index 4a16412e3..d7e8b46dd 100644
--- a/pkg/descheduler/evictions/evictions_test.go
+++ b/pkg/descheduler/evictions/evictions_test.go
@@ -62,7 +62,7 @@ func TestEvictPod(t *testing.T) {
 		fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
 			return true, &v1.PodList{Items: test.pods}, nil
 		})
-		got := evictPod(ctx, fakeClient, test.pod, "v1", false)
+		got := evictPod(ctx, fakeClient, test.pod, "v1")
 		if got != test.want {
 			t.Errorf("Test error for Desc: %s. Expected %v pod eviction to be %v, got %v", test.description, test.pod.Name, test.want, got)
 		}
diff --git a/pkg/descheduler/strategies/duplicates_test.go b/pkg/descheduler/strategies/duplicates_test.go
index 82d8087aa..f2b6467d3 100644
--- a/pkg/descheduler/strategies/duplicates_test.go
+++ b/pkg/descheduler/strategies/duplicates_test.go
@@ -301,6 +301,7 @@ func TestFindDuplicatePods(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 
 			RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor, getPodsAssignedToNode)
@@ -727,6 +728,7 @@ func TestRemoveDuplicatesUniformly(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 
 			RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor, getPodsAssignedToNode)
diff --git a/pkg/descheduler/strategies/failedpods_test.go b/pkg/descheduler/strategies/failedpods_test.go
index 7f310ad59..367418499 100644
--- a/pkg/descheduler/strategies/failedpods_test.go
+++ b/pkg/descheduler/strategies/failedpods_test.go
@@ -265,6 +265,7 @@ func TestRemoveFailedPods(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 
 			RemoveFailedPods(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
diff --git a/pkg/descheduler/strategies/node_affinity_test.go b/pkg/descheduler/strategies/node_affinity_test.go
index ab2efc489..df2b636e1 100644
--- a/pkg/descheduler/strategies/node_affinity_test.go
+++ b/pkg/descheduler/strategies/node_affinity_test.go
@@ -226,6 +226,7 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 
 			RemovePodsViolatingNodeAffinity(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
diff --git a/pkg/descheduler/strategies/node_taint_test.go b/pkg/descheduler/strategies/node_taint_test.go
index 6d67ede10..0dc722214 100644
--- a/pkg/descheduler/strategies/node_taint_test.go
+++ b/pkg/descheduler/strategies/node_taint_test.go
@@ -263,6 +263,7 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
 				tc.evictSystemCriticalPods,
 				false,
 				false,
+				false,
 			)
 
 			strategy := api.DeschedulerStrategy{
diff --git a/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go
index 1a2f0865c..7d4c8bd60 100644
--- a/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go
+++ b/pkg/descheduler/strategies/nodeutilization/highnodeutilization_test.go
@@ -467,6 +467,7 @@ func TestHighNodeUtilization(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 
 			strategy := api.DeschedulerStrategy{
@@ -671,6 +672,7 @@ func TestHighNodeUtilizationWithTaints(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 
 			HighNodeUtilization(ctx, fakeClient, strategy, item.nodes, podEvictor, getPodsAssignedToNode)
diff --git a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go
index 07961d160..8cdcb299b 100644
--- a/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go
+++ b/pkg/descheduler/strategies/nodeutilization/lownodeutilization_test.go
@@ -724,6 +724,7 @@ func TestLowNodeUtilization(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 
 			strategy := api.DeschedulerStrategy{
@@ -1036,6 +1037,7 @@ func TestLowNodeUtilizationWithTaints(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 
 			LowNodeUtilization(ctx, fakeClient, strategy, item.nodes, podEvictor, getPodsAssignedToNode)
diff --git a/pkg/descheduler/strategies/pod_antiaffinity_test.go b/pkg/descheduler/strategies/pod_antiaffinity_test.go
index d70184793..df5f33057 100644
--- a/pkg/descheduler/strategies/pod_antiaffinity_test.go
+++ b/pkg/descheduler/strategies/pod_antiaffinity_test.go
@@ -213,6 +213,7 @@ func TestPodAntiAffinity(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 			strategy := api.DeschedulerStrategy{
 				Params: &api.StrategyParameters{
diff --git a/pkg/descheduler/strategies/pod_lifetime_test.go b/pkg/descheduler/strategies/pod_lifetime_test.go
index a30a26a79..697250cba 100644
--- a/pkg/descheduler/strategies/pod_lifetime_test.go
+++ b/pkg/descheduler/strategies/pod_lifetime_test.go
@@ -302,6 +302,7 @@ func TestPodLifeTime(t *testing.T) {
 				false,
 				tc.ignorePvcPods,
 				false,
+				false,
 			)
 
 			PodLifeTime(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
diff --git a/pkg/descheduler/strategies/toomanyrestarts_test.go b/pkg/descheduler/strategies/toomanyrestarts_test.go
index 39c9ef7dd..aa7bec2e7 100644
--- a/pkg/descheduler/strategies/toomanyrestarts_test.go
+++ b/pkg/descheduler/strategies/toomanyrestarts_test.go
@@ -238,6 +238,7 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 
 			RemovePodsHavingTooManyRestarts(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
diff --git a/pkg/descheduler/strategies/topologyspreadconstraint_test.go b/pkg/descheduler/strategies/topologyspreadconstraint_test.go
index 6e6675855..22a863651 100644
--- a/pkg/descheduler/strategies/topologyspreadconstraint_test.go
+++ b/pkg/descheduler/strategies/topologyspreadconstraint_test.go
@@ -906,6 +906,7 @@ func TestTopologySpreadConstraint(t *testing.T) {
 				false,
 				false,
 				false,
+				false,
 			)
 			RemovePodsViolatingTopologySpreadConstraint(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor, getPodsAssignedToNode)
 			podsEvicted := podEvictor.TotalEvicted()
diff --git a/test/e2e/e2e_duplicatepods_test.go b/test/e2e/e2e_duplicatepods_test.go
index f3c991b25..cbb56e676 100644
--- a/test/e2e/e2e_duplicatepods_test.go
+++ b/test/e2e/e2e_duplicatepods_test.go
@@ -151,6 +151,7 @@ func TestRemoveDuplicates(t *testing.T) {
 		false,
 		false,
 		false,
+		false,
 	)
 
 	t.Log("Running DeschedulerStrategy strategy")
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index ac56bd4da..a832d463b 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -203,6 +203,7 @@ func runPodLifetimeStrategy(
 			evictCritical,
 			false,
 			false,
+			false,
 		),
 		getPodsAssignedToNode,
 	)
@@ -1341,5 +1342,6 @@ func initPodEvictorOrFail(t *testing.T, clientSet clientset.Interface, nodes []*
 		false,
 		false,
 		false,
+		false,
 	)
 }
diff --git a/test/e2e/e2e_toomanyrestarts_test.go b/test/e2e/e2e_toomanyrestarts_test.go
index dfd3389f1..077b6db3b 100644
--- a/test/e2e/e2e_toomanyrestarts_test.go
+++ b/test/e2e/e2e_toomanyrestarts_test.go
@@ -141,6 +141,7 @@ func TestTooManyRestarts(t *testing.T) {
 		false,
 		false,
 		false,
+		false,
 	)
 	// Run RemovePodsHavingTooManyRestarts strategy
 	t.Log("Running RemovePodsHavingTooManyRestarts strategy")