mirror of
https://github.com/kubernetes-sigs/descheduler.git
synced 2026-01-26 21:31:18 +01:00
[nodeutilization]: prometheus usage client with prometheus metrics
This commit is contained in:
@@ -19,7 +19,11 @@ package nodeutilization
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
promapi "github.com/prometheus/client_golang/api"
|
||||
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
|
||||
"github.com/prometheus/common/model"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
@@ -33,11 +37,33 @@ import (
|
||||
"sigs.k8s.io/descheduler/pkg/utils"
|
||||
)
|
||||
|
||||
type UsageClientType int
|
||||
|
||||
const (
|
||||
requestedUsageClientType UsageClientType = iota
|
||||
actualUsageClientType
|
||||
prometheusUsageClientType
|
||||
)
|
||||
|
||||
type notSupportedError struct {
|
||||
usageClientType UsageClientType
|
||||
}
|
||||
|
||||
func (e notSupportedError) Error() string {
|
||||
return "maximum number of evicted pods per node reached"
|
||||
}
|
||||
|
||||
func newNotSupportedError(usageClientType UsageClientType) *notSupportedError {
|
||||
return ¬SupportedError{
|
||||
usageClientType: usageClientType,
|
||||
}
|
||||
}
|
||||
|
||||
type usageClient interface {
|
||||
// Both low/high node utilization plugins are expected to invoke sync right
|
||||
// after Balance method is invoked. There's no cache invalidation so each
|
||||
// Balance is expected to get the latest data by invoking sync.
|
||||
sync(nodes []*v1.Node) error
|
||||
sync(ctx context.Context, nodes []*v1.Node) error
|
||||
nodeUtilization(node string) api.ReferencedResourceList
|
||||
pods(node string) []*v1.Pod
|
||||
podUsage(pod *v1.Pod) (api.ReferencedResourceList, error)
|
||||
@@ -79,7 +105,7 @@ func (s *requestedUsageClient) podUsage(pod *v1.Pod) (api.ReferencedResourceList
|
||||
return usage, nil
|
||||
}
|
||||
|
||||
func (s *requestedUsageClient) sync(nodes []*v1.Node) error {
|
||||
func (s *requestedUsageClient) sync(ctx context.Context, nodes []*v1.Node) error {
|
||||
s._nodeUtilization = make(map[string]api.ReferencedResourceList)
|
||||
s._pods = make(map[string][]*v1.Pod)
|
||||
|
||||
@@ -165,7 +191,7 @@ func (client *actualUsageClient) podUsage(pod *v1.Pod) (api.ReferencedResourceLi
|
||||
return totalUsage, nil
|
||||
}
|
||||
|
||||
func (client *actualUsageClient) sync(nodes []*v1.Node) error {
|
||||
func (client *actualUsageClient) sync(ctx context.Context, nodes []*v1.Node) error {
|
||||
client._nodeUtilization = make(map[string]api.ReferencedResourceList)
|
||||
client._pods = make(map[string][]*v1.Pod)
|
||||
|
||||
@@ -200,3 +226,95 @@ func (client *actualUsageClient) sync(nodes []*v1.Node) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
type prometheusUsageClient struct {
|
||||
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc
|
||||
promClient promapi.Client
|
||||
promQuery string
|
||||
|
||||
_pods map[string][]*v1.Pod
|
||||
_nodeUtilization map[string]map[v1.ResourceName]*resource.Quantity
|
||||
}
|
||||
|
||||
var _ usageClient = &actualUsageClient{}
|
||||
|
||||
func newPrometheusUsageClient(
|
||||
getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
|
||||
promClient promapi.Client,
|
||||
promQuery string,
|
||||
) *prometheusUsageClient {
|
||||
return &prometheusUsageClient{
|
||||
getPodsAssignedToNode: getPodsAssignedToNode,
|
||||
promClient: promClient,
|
||||
promQuery: promQuery,
|
||||
}
|
||||
}
|
||||
|
||||
func (client *prometheusUsageClient) nodeUtilization(node string) map[v1.ResourceName]*resource.Quantity {
|
||||
return client._nodeUtilization[node]
|
||||
}
|
||||
|
||||
func (client *prometheusUsageClient) pods(node string) []*v1.Pod {
|
||||
return client._pods[node]
|
||||
}
|
||||
|
||||
func (client *prometheusUsageClient) podUsage(pod *v1.Pod) (map[v1.ResourceName]*resource.Quantity, error) {
|
||||
return nil, newNotSupportedError(prometheusUsageClientType)
|
||||
}
|
||||
|
||||
func NodeUsageFromPrometheusMetrics(ctx context.Context, promClient promapi.Client, promQuery string) (map[string]map[v1.ResourceName]*resource.Quantity, error) {
|
||||
results, warnings, err := promv1.NewAPI(promClient).Query(ctx, promQuery, time.Now())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to capture prometheus metrics: %v", err)
|
||||
}
|
||||
if len(warnings) > 0 {
|
||||
klog.Infof("prometheus metrics warnings: %v", warnings)
|
||||
}
|
||||
|
||||
if results.Type() != model.ValVector {
|
||||
return nil, fmt.Errorf("expected query results to be of type %q, got %q instead", model.ValVector, results.Type())
|
||||
}
|
||||
|
||||
nodeUsages := make(map[string]map[v1.ResourceName]*resource.Quantity)
|
||||
for _, sample := range results.(model.Vector) {
|
||||
nodeName, exists := sample.Metric["instance"]
|
||||
if !exists {
|
||||
return nil, fmt.Errorf("The collected metrics sample is missing 'instance' key")
|
||||
}
|
||||
if sample.Value < 0 || sample.Value > 1 {
|
||||
return nil, fmt.Errorf("The collected metrics sample for %q has value %v outside of <0; 1> interval", string(nodeName), sample.Value)
|
||||
}
|
||||
nodeUsages[string(nodeName)] = map[v1.ResourceName]*resource.Quantity{
|
||||
MetricResource: resource.NewQuantity(int64(sample.Value*100), resource.DecimalSI),
|
||||
}
|
||||
}
|
||||
|
||||
return nodeUsages, nil
|
||||
}
|
||||
|
||||
func (client *prometheusUsageClient) sync(ctx context.Context, nodes []*v1.Node) error {
|
||||
client._nodeUtilization = make(map[string]map[v1.ResourceName]*resource.Quantity)
|
||||
client._pods = make(map[string][]*v1.Pod)
|
||||
|
||||
nodeUsages, err := NodeUsageFromPrometheusMetrics(ctx, client.promClient, client.promQuery)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, node := range nodes {
|
||||
if _, exists := nodeUsages[node.Name]; !exists {
|
||||
return fmt.Errorf("unable to find metric entry for %v", node.Name)
|
||||
}
|
||||
pods, err := podutil.ListPodsOnANode(node.Name, client.getPodsAssignedToNode, nil)
|
||||
if err != nil {
|
||||
klog.V(2).InfoS("Node will not be processed, error accessing its pods", "node", klog.KObj(node), "err", err)
|
||||
return fmt.Errorf("error accessing %q node's pods: %v", node.Name, err)
|
||||
}
|
||||
|
||||
// store the snapshot of pods from the same (or the closest) node utilization computation
|
||||
client._pods[node.Name] = pods
|
||||
client._nodeUtilization[node.Name] = nodeUsages[node.Name]
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user