From 22cf265303d7475d3ac67ec4545a38107c9199b9 Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 22:29:32 +0100 Subject: [PATCH 01/12] Add K8s GPU metrics collection design spec Three-source fallback chain: CloudWatch Container Insights, DCGM exporter scrape, and Prometheus query. Per-node fallback with new ruleK8sLowGPUUtil analysis rule. --- .../2026-04-19-k8s-gpu-metrics-design.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/specs/2026-04-19-k8s-gpu-metrics-design.md diff --git a/docs/specs/2026-04-19-k8s-gpu-metrics-design.md b/docs/specs/2026-04-19-k8s-gpu-metrics-design.md new file mode 100644 index 0000000..ed8c3d7 --- /dev/null +++ b/docs/specs/2026-04-19-k8s-gpu-metrics-design.md @@ -0,0 +1,149 @@ +# K8s GPU Metrics Collection + +## Goal + +Collect GPU utilization metrics for Kubernetes GPU nodes discovered by gpuaudit, using a per-node fallback chain of three sources: CloudWatch Container Insights, DCGM exporter scrape, and Prometheus query. Enable utilization-based waste detection for K8s GPU nodes (currently limited to allocation-based detection only). + +## Architecture + +Three metrics sources, tried in priority order **per node** (stop at the first source that returns data for a given node): + +1. **CloudWatch Container Insights** — AWS API call, no in-cluster access needed beyond what we already have. +2. **DCGM exporter scrape** — probe port 9400 on dcgm-exporter pods via K8s API proxy. +3. **Prometheus query** — query a user-configured Prometheus endpoint for historical GPU metrics. + +All three populate the same existing fields: `GPUInstance.AvgGPUUtilization` and `GPUInstance.AvgGPUMemUtilization`. + +## Data Flow + +``` +1. AWS scan → ScanResult (EC2, SageMaker, EKS) +2. K8s scan → []GPUInstance (nodes + allocation) +3. Enrich K8s GPU metrics (fallback chain): + a. CloudWatch Container Insights (if AWS creds available, !skipMetrics) + b. 
DCGM scrape via K8s API proxy (for nodes still missing metrics) + c. Prometheus query (for remaining nodes, if --prom-url or --prom-endpoint set) +4. AnalyzeAll on K8s instances +5. Merge into result +``` + +Steps 3a through 3c each skip nodes that already have `AvgGPUUtilization` populated by a prior step. + +## Source 1: CloudWatch Container Insights + +Requires the CloudWatch Observability EKS add-on to be installed in the cluster. If not installed, the query returns empty (not an error) and we fall through. + +**Metrics queried:** +- `node_gpu_utilization` (Average) — maps to `AvgGPUUtilization` +- `node_gpu_memory_utilization` (Average) — maps to `AvgGPUMemUtilization` + +**Namespace:** `ContainerInsights` + +**Dimensions:** `ClusterName` + `InstanceId` + +**Implementation:** New function `EnrichK8sGPUMetrics(ctx, client CloudWatchClient, instances []GPUInstance, clusterName string, window MetricWindow)` in `internal/providers/aws/cloudwatch.go`, following the same pattern as `EnrichEC2Metrics` and `EnrichSageMakerMetrics`. + +**Prerequisites per node:** The node must have an EC2 instance ID (extracted from `providerID`). Non-AWS nodes are skipped for this source. + +**Wiring:** Called from `main.go` after the K8s scan returns instances, passing the CloudWatch client from the AWS config. Only called when AWS credentials are available and `!skipMetrics`. + +## Source 2: DCGM Exporter Scrape + +Auto-detected, no user configuration needed. + +**Discovery:** List pods across all namespaces matching labels `app=nvidia-dcgm-exporter` or `app.kubernetes.io/name=dcgm-exporter`. If no pods found, log `"DCGM exporter not detected, skipping"` and fall through to Prometheus. + +**Scraping:** For each GPU node still missing metrics, find the dcgm-exporter pod on that node (match by `pod.Spec.NodeName`), then scrape `/metrics` on port 9400 via the K8s API proxy (`ProxyGet`). 
+ +**Metrics parsed:** +- `DCGM_FI_DEV_GPU_UTIL` — maps to `AvgGPUUtilization` +- `DCGM_FI_DEV_MEM_COPY_UTIL` — maps to `AvgGPUMemUtilization` + +These are point-in-time values, not historical averages. The analysis rule's confidence (0.85 vs 0.9) accounts for this lower fidelity. + +**Prometheus text format parsing:** Use `prometheus/common/expfmt` to parse the scrape response. + +**K8s client extension:** Add `ProxyGet(ctx, namespace, podName, port, path string) ([]byte, error)` to the `K8sClient` interface. Wraps `clientset.CoreV1().Pods(ns).ProxyGet()`. + +**Stderr output:** +``` + Probing DCGM exporter on GPU nodes... + DCGM: got GPU metrics for 3 of 5 remaining nodes +``` + +## Source 3: Prometheus Query + +Only attempted when `--prom-url` or `--prom-endpoint` is provided. No auto-discovery. + +**CLI flags:** +- `--prom-url` — full URL to a Prometheus-compatible API (e.g., `https://prometheus.corp.example.com`, AMP endpoint, Grafana Cloud). Hit directly via HTTP. +- `--prom-endpoint` — in-cluster service as `namespace/service:port` (e.g., `monitoring/prometheus:9090`). Proxied through the K8s API server. + +These flags are mutually exclusive. Error if both are set. + +**Query:** Batch all remaining nodes into one PromQL query: +``` +avg_over_time(DCGM_FI_DEV_GPU_UTIL{node=~"node1|node2|..."}[7d]) +``` +And similarly for `DCGM_FI_DEV_MEM_COPY_UTIL`. + +**API:** HTTP GET to `/api/v1/query`, parse the standard Prometheus JSON response. No client library — plain `net/http` for direct URLs, K8s API proxy for in-cluster endpoints. + +**Stderr output:** +``` + Querying Prometheus at monitoring/prometheus:9090... 
+ Prometheus: got GPU metrics for 2 of 3 remaining nodes +``` + +## Analysis Rule + +New rule `ruleK8sLowGPUUtil` in `internal/analysis/rules.go`: + +- **Source filter:** `SourceK8sNode` only +- **Guard:** `AvgGPUUtilization != nil` (skip nodes where no metrics were collected) +- **Threshold:** average GPU utilization < 10% +- **Signal type:** `low_utilization` +- **Severity:** Critical +- **Confidence:** 0.85 +- **Recommendation:** "GPU utilization averaging X%. Consider bin-packing more workloads, downsizing, or removing from the node pool." +- **Savings estimate:** `MonthlyCost * 0.8` (same rough estimate as SageMaker equivalent) + +**Interplay with `ruleK8sUnallocatedGPU`:** Both rules can fire on the same node. Unallocated detects zero pod scheduling (allocation-based). Low-util detects pods that are scheduled but barely using the GPU (utilization-based). Different problems, different fixes. + +## File Changes + +- **Modify:** `internal/providers/aws/cloudwatch.go` — add `EnrichK8sGPUMetrics()` +- **Create:** `internal/providers/k8s/metrics.go` — DCGM scraping, Prometheus querying, fallback orchestration +- **Create:** `internal/providers/k8s/metrics_test.go` — tests for DCGM and Prometheus paths +- **Modify:** `internal/providers/k8s/discover.go` — extend `K8sClient` interface with `ProxyGet` (DCGM pod discovery uses existing `ListPods` with label selector) +- **Modify:** `internal/providers/k8s/scanner.go` — wire metrics enrichment into the K8s scan, accept new options +- **Modify:** `internal/analysis/rules.go` — add `ruleK8sLowGPUUtil` +- **Modify:** `internal/analysis/rules_test.go` — tests for the new rule +- **Modify:** `cmd/gpuaudit/main.go` — add `--prom-url` and `--prom-endpoint` flags, wire CloudWatch enrichment for K8s instances + +## Error Handling + +- **CloudWatch returns empty:** Not an error. Container Insights add-on probably not installed. Fall through to DCGM. 
+- **No EC2 instance ID on a node:** Skip CW enrichment for that node (non-AWS or providerID not set). +- **No dcgm-exporter pods found:** Log on stderr, fall through to Prometheus. +- **DCGM scrape fails for a node:** Warn on stderr, continue with other nodes. Don't fail the scan. +- **Prometheus endpoint unreachable:** Warn on stderr, continue without metrics for remaining nodes. +- **Both `--prom-url` and `--prom-endpoint` set:** Return an error at flag validation time. + +## New Dependencies + +- `prometheus/common/expfmt` — for parsing Prometheus text format from DCGM exporter scrapes. Small, well-established library. + +## IAM Policy + +No new IAM permissions required. `EnrichK8sGPUMetrics` uses the existing `cloudwatch:GetMetricData` permission already in the IAM policy output. + +## RBAC + +The K8s API proxy calls (`ProxyGet` to pods) require the `pods/proxy` resource permission. For DCGM scraping: +``` +- apiGroups: [""] + resources: ["pods/proxy"] + verbs: ["get"] +``` +This should be documented and added to any RBAC guide. From ee8e309da5b8c5d77f8da172b2fba6fe650be311 Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 22:41:11 +0100 Subject: [PATCH 02/12] Add K8s GPU metrics collection implementation plan --- .../plans/2026-04-19-k8s-gpu-metrics.md | 1394 +++++++++++++++++ 1 file changed, 1394 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-19-k8s-gpu-metrics.md diff --git a/docs/superpowers/plans/2026-04-19-k8s-gpu-metrics.md b/docs/superpowers/plans/2026-04-19-k8s-gpu-metrics.md new file mode 100644 index 0000000..14c7f2c --- /dev/null +++ b/docs/superpowers/plans/2026-04-19-k8s-gpu-metrics.md @@ -0,0 +1,1394 @@ +# K8s GPU Metrics Collection Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Collect GPU utilization metrics for Kubernetes GPU nodes via a per-node fallback chain (CloudWatch Container Insights → DCGM exporter → Prometheus), and add a utilization-based waste detection rule. + +**Architecture:** Three metrics sources tried in priority order per node, all populating the existing `AvgGPUUtilization` and `AvgGPUMemUtilization` fields on `GPUInstance`. A new analysis rule `ruleK8sLowGPUUtil` flags nodes with GPU utilization < 10%. The fallback chain is wired in `main.go` between K8s discovery and analysis. + +**Tech Stack:** Go, AWS SDK v2 (CloudWatch), client-go (K8s API proxy), prometheus/common/expfmt (Prometheus text parsing), net/http (Prometheus API) + +--- + +## File Structure + +| File | Responsibility | +|------|---------------| +| `internal/providers/aws/cloudwatch.go` | Add `EnrichK8sGPUMetrics()` — CloudWatch Container Insights queries | +| `internal/providers/aws/cloudwatch_test.go` | Tests for `EnrichK8sGPUMetrics()` (new file) | +| `internal/providers/k8s/discover.go` | Extend `K8sClient` interface with `ProxyGet` | +| `internal/providers/k8s/scanner.go` | Extend `ScanOptions` with Prometheus config, export `BuildClientPublic` | +| `internal/providers/k8s/metrics.go` | DCGM scraping, Prometheus querying, fallback orchestration (new file) | +| `internal/providers/k8s/metrics_test.go` | Tests for DCGM and Prometheus paths (new file) | +| `internal/analysis/rules.go` | Add `ruleK8sLowGPUUtil` | +| `internal/analysis/rules_test.go` | Tests for new rule | +| `cmd/gpuaudit/main.go` | Add `--prom-url`, `--prom-endpoint` flags; wire CW enrichment for K8s instances | + +--- + +### Task 1: CloudWatch Container Insights Enrichment + +**Files:** +- Create: `internal/providers/aws/cloudwatch_test.go` +- Modify: `internal/providers/aws/cloudwatch.go:60-80` + +This task adds `EnrichK8sGPUMetrics()` following the exact same pattern as the existing `EnrichEC2Metrics()` and `EnrichSageMakerMetrics()` functions. 
It queries the `ContainerInsights` namespace for `node_gpu_utilization` and `node_gpu_memory_utilization`. + +- [ ] **Step 1: Write the failing tests** + +Create `internal/providers/aws/cloudwatch_test.go`: + +```go +// Copyright 2026 the gpuaudit authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package aws + +import ( + "context" + "fmt" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/cloudwatch" + cwtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" + + "github.com/gpuaudit/cli/internal/models" +) + +type mockCloudWatchClient struct { + output *cloudwatch.GetMetricDataOutput + err error +} + +func (m *mockCloudWatchClient) GetMetricData(ctx context.Context, params *cloudwatch.GetMetricDataInput, optFns ...func(*cloudwatch.Options)) (*cloudwatch.GetMetricDataOutput, error) { + if m.err != nil { + return nil, m.err + } + return m.output, nil +} + +func TestEnrichK8sGPUMetrics_PopulatesUtilization(t *testing.T) { + client := &mockCloudWatchClient{ + output: &cloudwatch.GetMetricDataOutput{ + MetricDataResults: []cwtypes.MetricDataResult{ + {Id: aws.String("gpu_util_i_abc123"), Values: []float64{45.0, 50.0, 55.0}}, + {Id: aws.String("gpu_mem_i_abc123"), Values: []float64{30.0, 35.0, 40.0}}, + }, + }, + } + instances := []models.GPUInstance{ + { + InstanceID: "i-abc123", + Source: models.SourceK8sNode, + }, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "ml-cluster", DefaultMetricWindow) + + if instances[0].AvgGPUUtilization == nil { + t.Fatal("expected GPU utilization to be populated") + } + if *instances[0].AvgGPUUtilization != 50.0 { + t.Errorf("expected avg GPU util 50.0, got %f", *instances[0].AvgGPUUtilization) + } + if instances[0].AvgGPUMemUtilization == nil { + t.Fatal("expected GPU memory utilization to be populated") + } + if *instances[0].AvgGPUMemUtilization != 35.0 { + t.Errorf("expected avg GPU mem util 35.0, got %f", *instances[0].AvgGPUMemUtilization) + 
} +} + +func TestEnrichK8sGPUMetrics_SkipsNonK8sNodes(t *testing.T) { + client := &mockCloudWatchClient{ + output: &cloudwatch.GetMetricDataOutput{}, + } + instances := []models.GPUInstance{ + {InstanceID: "i-ec2", Source: models.SourceEC2}, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "cluster", DefaultMetricWindow) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil GPU util for non-K8s instance") + } +} + +func TestEnrichK8sGPUMetrics_SkipsNodesWithoutInstanceID(t *testing.T) { + client := &mockCloudWatchClient{ + output: &cloudwatch.GetMetricDataOutput{}, + } + instances := []models.GPUInstance{ + {InstanceID: "node-hostname", Source: models.SourceK8sNode}, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "cluster", DefaultMetricWindow) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil GPU util for node without EC2 instance ID") + } +} + +func TestEnrichK8sGPUMetrics_SkipsAlreadyEnriched(t *testing.T) { + gpuUtil := 75.0 + client := &mockCloudWatchClient{ + output: &cloudwatch.GetMetricDataOutput{}, + } + instances := []models.GPUInstance{ + { + InstanceID: "i-abc123", + Source: models.SourceK8sNode, + AvgGPUUtilization: &gpuUtil, + }, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "cluster", DefaultMetricWindow) + + if *instances[0].AvgGPUUtilization != 75.0 { + t.Errorf("expected existing value 75.0 to be preserved, got %f", *instances[0].AvgGPUUtilization) + } +} + +func TestEnrichK8sGPUMetrics_HandlesAPIError(t *testing.T) { + client := &mockCloudWatchClient{ + err: fmt.Errorf("access denied"), + } + instances := []models.GPUInstance{ + {InstanceID: "i-abc123", Source: models.SourceK8sNode}, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "cluster", DefaultMetricWindow) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil GPU util after API error") + } +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + 
+Run: `go test ./internal/providers/aws/ -run TestEnrichK8sGPUMetrics -v` +Expected: FAIL — `EnrichK8sGPUMetrics` not defined + +- [ ] **Step 3: Implement EnrichK8sGPUMetrics** + +Add to `internal/providers/aws/cloudwatch.go`, after the `EnrichSageMakerMetrics` function (after line 80): + +```go +// EnrichK8sGPUMetrics populates GPU utilization metrics on K8s nodes using CloudWatch Container Insights. +func EnrichK8sGPUMetrics(ctx context.Context, client CloudWatchClient, instances []models.GPUInstance, clusterName string, window MetricWindow) { + type nodeRef struct { + index int + instanceID string + } + var nodes []nodeRef + for i := range instances { + inst := &instances[i] + if inst.Source != models.SourceK8sNode { + continue + } + if inst.AvgGPUUtilization != nil { + continue + } + if !strings.HasPrefix(inst.InstanceID, "i-") { + continue + } + nodes = append(nodes, nodeRef{index: i, instanceID: inst.InstanceID}) + } + if len(nodes) == 0 { + return + } + + now := time.Now() + start := now.Add(-window.Duration) + + clusterDim := cwtypes.Dimension{ + Name: aws.String("ClusterName"), + Value: aws.String(clusterName), + } + + for _, node := range nodes { + instanceDim := cwtypes.Dimension{ + Name: aws.String("InstanceId"), + Value: aws.String(node.instanceID), + } + + safeID := strings.ReplaceAll(node.instanceID, "-", "_") + + queries := []cwtypes.MetricDataQuery{ + metricQuery2("gpu_util_"+safeID, "ContainerInsights", "node_gpu_utilization", "Average", window.Period, clusterDim, instanceDim), + metricQuery2("gpu_mem_"+safeID, "ContainerInsights", "node_gpu_memory_utilization", "Average", window.Period, clusterDim, instanceDim), + } + + results, err := fetchMetrics(ctx, client, queries, start, now) + if err != nil { + fmt.Fprintf(os.Stderr, " warning: Container Insights metrics unavailable for %s: %v\n", node.instanceID, err) + continue + } + + instances[node.index].AvgGPUUtilization = results["gpu_util_"+safeID] + instances[node.index].AvgGPUMemUtilization = 
results["gpu_mem_"+safeID] + } +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/providers/aws/ -run TestEnrichK8sGPUMetrics -v` +Expected: PASS (all 5 tests) + +- [ ] **Step 5: Run full test suite** + +Run: `go test ./...` +Expected: All tests pass + +- [ ] **Step 6: Commit** + +```bash +git add internal/providers/aws/cloudwatch.go internal/providers/aws/cloudwatch_test.go +git commit -m "Add EnrichK8sGPUMetrics for CloudWatch Container Insights GPU metrics" +``` + +--- + +### Task 2: Extend K8sClient Interface with ProxyGet + +**Files:** +- Modify: `internal/providers/k8s/discover.go:24-27` +- Modify: `internal/providers/k8s/scanner.go:91-101` +- Modify: `internal/providers/k8s/discover_test.go:19-30` + +This task adds `ProxyGet` to the `K8sClient` interface and updates the mock and wrapper. This is needed for both DCGM scraping (Task 3) and Prometheus in-cluster queries (Task 4). + +- [ ] **Step 1: Add ProxyGet to the K8sClient interface** + +In `internal/providers/k8s/discover.go`, change the `K8sClient` interface (lines 24-27) from: + +```go +type K8sClient interface { + ListNodes(ctx context.Context, opts metav1.ListOptions) (*corev1.NodeList, error) + ListPods(ctx context.Context, namespace string, opts metav1.ListOptions) (*corev1.PodList, error) +} +``` + +to: + +```go +type K8sClient interface { + ListNodes(ctx context.Context, opts metav1.ListOptions) (*corev1.NodeList, error) + ListPods(ctx context.Context, namespace string, opts metav1.ListOptions) (*corev1.PodList, error) + ProxyGet(ctx context.Context, namespace, podName, port, path string) ([]byte, error) +} +``` + +- [ ] **Step 2: Implement ProxyGet on k8sClientWrapper** + +In `internal/providers/k8s/scanner.go`, add this method after the `ListPods` method (after line 101): + +```go +func (w *k8sClientWrapper) ProxyGet(ctx context.Context, namespace, podName, port, path string) ([]byte, error) { + return w.clientset.CoreV1().Pods(namespace).ProxyGet("http", 
podName, port, path, nil).DoRaw(ctx) +} +``` + +- [ ] **Step 3: Add ProxyGet to the mock in tests** + +In `internal/providers/k8s/discover_test.go`, change the `mockK8sClient` struct (lines 19-22) from: + +```go +type mockK8sClient struct { + nodes *corev1.NodeList + pods *corev1.PodList +} +``` + +to: + +```go +type mockK8sClient struct { + nodes *corev1.NodeList + pods *corev1.PodList + proxyData map[string][]byte + proxyErr error +} +``` + +And add the method after `ListPods` (after line 30): + +```go +func (m *mockK8sClient) ProxyGet(ctx context.Context, namespace, podName, port, path string) ([]byte, error) { + if m.proxyErr != nil { + return nil, m.proxyErr + } + key := fmt.Sprintf("%s/%s:%s%s", namespace, podName, port, path) + if data, ok := m.proxyData[key]; ok { + return data, nil + } + return nil, fmt.Errorf("no mock data for %s", key) +} +``` + +- [ ] **Step 4: Run tests to verify nothing is broken** + +Run: `go test ./internal/providers/k8s/ -v` +Expected: All existing tests pass + +- [ ] **Step 5: Commit** + +```bash +git add internal/providers/k8s/discover.go internal/providers/k8s/scanner.go internal/providers/k8s/discover_test.go +git commit -m "Add ProxyGet to K8sClient interface for pod API proxy" +``` + +--- + +### Task 3: DCGM Exporter Scraping + +**Files:** +- Create: `internal/providers/k8s/metrics.go` +- Create: `internal/providers/k8s/metrics_test.go` + +This task implements DCGM exporter auto-discovery and metric scraping. It discovers dcgm-exporter pods by label, matches them to GPU nodes, scrapes `/metrics` on port 9400, and parses `DCGM_FI_DEV_GPU_UTIL` and `DCGM_FI_DEV_MEM_COPY_UTIL`. + +- [ ] **Step 1: Add the `prometheus/common` dependency** + +Run: `go get github.com/prometheus/common@latest` + +This will also pull in `github.com/prometheus/client_model` (needed for `dto.MetricFamily`). + +- [ ] **Step 2: Write the failing tests** + +Create `internal/providers/k8s/metrics_test.go`: + +```go +// Copyright 2026 the gpuaudit authors. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package k8s + +import ( + "context" + "fmt" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/gpuaudit/cli/internal/models" +) + +func dcgmPod(name, namespace, nodeName string) corev1.Pod { + return corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: map[string]string{ + "app.kubernetes.io/name": "dcgm-exporter", + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + }, + } +} + +const sampleDCGMMetrics = `# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization. +# TYPE DCGM_FI_DEV_GPU_UTIL gauge +DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-abc",device="nvidia0",modelName="NVIDIA A10G",Hostname="node1"} 42.0 +DCGM_FI_DEV_GPU_UTIL{gpu="1",UUID="GPU-def",device="nvidia1",modelName="NVIDIA A10G",Hostname="node1"} 38.0 +# HELP DCGM_FI_DEV_MEM_COPY_UTIL GPU memory utilization. +# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-abc",device="nvidia0",modelName="NVIDIA A10G",Hostname="node1"} 55.0 +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="1",UUID="GPU-def",device="nvidia1",modelName="NVIDIA A10G",Hostname="node1"} 60.0 +` + +func TestEnrichDCGMMetrics_PopulatesUtilization(t *testing.T) { + client := &mockK8sClient{ + nodes: &corev1.NodeList{}, + pods: &corev1.PodList{ + Items: []corev1.Pod{ + dcgmPod("dcgm-exporter-abc", "gpu-operator", "i-node1"), + }, + }, + proxyData: map[string][]byte{ + "gpu-operator/dcgm-exporter-abc:9400/metrics": []byte(sampleDCGMMetrics), + }, + } + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode, Name: "cluster/i-node1"}, + } + + enriched := EnrichDCGMMetrics(context.Background(), client, instances) + + if instances[0].AvgGPUUtilization == nil { + t.Fatal("expected GPU utilization to be populated") + } + if *instances[0].AvgGPUUtilization != 40.0 { + t.Errorf("expected avg 
GPU util 40.0 (average of 42 and 38), got %f", *instances[0].AvgGPUUtilization) + } + if instances[0].AvgGPUMemUtilization == nil { + t.Fatal("expected GPU memory utilization to be populated") + } + if *instances[0].AvgGPUMemUtilization != 57.5 { + t.Errorf("expected avg GPU mem util 57.5 (average of 55 and 60), got %f", *instances[0].AvgGPUMemUtilization) + } + if enriched != 1 { + t.Errorf("expected 1 enriched node, got %d", enriched) + } +} + +func TestEnrichDCGMMetrics_SkipsAlreadyEnriched(t *testing.T) { + gpuUtil := 75.0 + client := &mockK8sClient{ + nodes: &corev1.NodeList{}, + pods: &corev1.PodList{ + Items: []corev1.Pod{ + dcgmPod("dcgm-exporter-abc", "gpu-operator", "i-node1"), + }, + }, + proxyData: map[string][]byte{ + "gpu-operator/dcgm-exporter-abc:9400/metrics": []byte(sampleDCGMMetrics), + }, + } + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode, AvgGPUUtilization: &gpuUtil}, + } + + enriched := EnrichDCGMMetrics(context.Background(), client, instances) + + if *instances[0].AvgGPUUtilization != 75.0 { + t.Error("should not overwrite existing utilization") + } + if enriched != 0 { + t.Errorf("expected 0 enriched nodes, got %d", enriched) + } +} + +func TestEnrichDCGMMetrics_NoDCGMPods(t *testing.T) { + client := &mockK8sClient{ + nodes: &corev1.NodeList{}, + pods: &corev1.PodList{Items: []corev1.Pod{}}, + } + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode}, + } + + enriched := EnrichDCGMMetrics(context.Background(), client, instances) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil when no DCGM pods") + } + if enriched != 0 { + t.Errorf("expected 0 enriched, got %d", enriched) + } +} + +func TestEnrichDCGMMetrics_HandlesScrapeError(t *testing.T) { + client := &mockK8sClient{ + nodes: &corev1.NodeList{}, + pods: &corev1.PodList{ + Items: []corev1.Pod{ + dcgmPod("dcgm-exporter-abc", "gpu-operator", "i-node1"), + }, + }, + proxyErr: 
fmt.Errorf("connection refused"), + } + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode}, + } + + enriched := EnrichDCGMMetrics(context.Background(), client, instances) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil after scrape error") + } + if enriched != 0 { + t.Errorf("expected 0 enriched, got %d", enriched) + } +} + +func TestParseDCGMMetrics(t *testing.T) { + gpuUtil, memUtil := parseDCGMMetrics([]byte(sampleDCGMMetrics)) + + if gpuUtil == nil { + t.Fatal("expected gpu util") + } + if *gpuUtil != 40.0 { + t.Errorf("expected 40.0, got %f", *gpuUtil) + } + if memUtil == nil { + t.Fatal("expected mem util") + } + if *memUtil != 57.5 { + t.Errorf("expected 57.5, got %f", *memUtil) + } +} + +func TestParseDCGMMetrics_EmptyInput(t *testing.T) { + gpuUtil, memUtil := parseDCGMMetrics([]byte("")) + if gpuUtil != nil || memUtil != nil { + t.Error("expected nil for empty input") + } +} +``` + +- [ ] **Step 3: Run tests to verify they fail** + +Run: `go test ./internal/providers/k8s/ -run "TestEnrichDCGM|TestParseDCGM" -v` +Expected: FAIL — functions not defined + +- [ ] **Step 4: Implement DCGM metrics enrichment** + +Create `internal/providers/k8s/metrics.go`: + +```go +// Copyright 2026 the gpuaudit authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package k8s + +import ( + "bytes" + "context" + "fmt" + "os" + + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/gpuaudit/cli/internal/models" +) + +// EnrichDCGMMetrics discovers dcgm-exporter pods and scrapes GPU metrics for K8s nodes +// that don't already have AvgGPUUtilization populated. Returns the number of nodes enriched. 
+func EnrichDCGMMetrics(ctx context.Context, client K8sClient, instances []models.GPUInstance) int { + needsMetrics := make(map[string]int) + for i := range instances { + inst := &instances[i] + if inst.Source != models.SourceK8sNode || inst.AvgGPUUtilization != nil { + continue + } + needsMetrics[inst.InstanceID] = i + } + if len(needsMetrics) == 0 { + return 0 + } + + dcgmPods, err := findDCGMPods(ctx, client) + if err != nil { + fmt.Fprintf(os.Stderr, " warning: could not list DCGM exporter pods: %v\n", err) + return 0 + } + if len(dcgmPods) == 0 { + fmt.Fprintf(os.Stderr, " DCGM exporter not detected, skipping\n") + return 0 + } + + fmt.Fprintf(os.Stderr, " Probing DCGM exporter on GPU nodes...\n") + + enriched := 0 + for _, pod := range dcgmPods { + idx, ok := needsMetrics[pod.Spec.NodeName] + if !ok { + continue + } + + data, err := client.ProxyGet(ctx, pod.Namespace, pod.Name, "9400", "/metrics") + if err != nil { + fmt.Fprintf(os.Stderr, " warning: DCGM scrape failed for %s: %v\n", pod.Spec.NodeName, err) + continue + } + + gpuUtil, memUtil := parseDCGMMetrics(data) + if gpuUtil != nil { + instances[idx].AvgGPUUtilization = gpuUtil + instances[idx].AvgGPUMemUtilization = memUtil + enriched++ + } + } + + fmt.Fprintf(os.Stderr, " DCGM: got GPU metrics for %d of %d remaining nodes\n", enriched, len(needsMetrics)) + return enriched +} + +func findDCGMPods(ctx context.Context, client K8sClient) ([]corev1.Pod, error) { + podList, err := client.ListPods(ctx, "", metav1.ListOptions{ + LabelSelector: "app.kubernetes.io/name=dcgm-exporter", + }) + if err != nil { + return nil, err + } + if len(podList.Items) > 0 { + return runningPods(podList.Items), nil + } + + podList, err = client.ListPods(ctx, "", metav1.ListOptions{ + LabelSelector: "app=nvidia-dcgm-exporter", + }) + if err != nil { + return nil, err + } + return runningPods(podList.Items), nil +} + +func runningPods(pods []corev1.Pod) []corev1.Pod { + var result []corev1.Pod + for _, p := range pods { + if 
p.Status.Phase == corev1.PodRunning { + result = append(result, p) + } + } + return result +} + +func parseDCGMMetrics(data []byte) (gpuUtil, memUtil *float64) { + parser := expfmt.TextParser{} + families, err := parser.TextToMetricFamilies(bytes.NewReader(data)) + if err != nil { + return nil, nil + } + + gpuUtil = avgMetricValue(families["DCGM_FI_DEV_GPU_UTIL"]) + memUtil = avgMetricValue(families["DCGM_FI_DEV_MEM_COPY_UTIL"]) + return gpuUtil, memUtil +} + +func avgMetricValue(family *dto.MetricFamily) *float64 { + if family == nil || len(family.Metric) == 0 { + return nil + } + sum := 0.0 + count := 0 + for _, m := range family.Metric { + if m.Gauge != nil && m.Gauge.Value != nil { + sum += *m.Gauge.Value + count++ + } + } + if count == 0 { + return nil + } + avg := sum / float64(count) + return &avg +} +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `go test ./internal/providers/k8s/ -run "TestEnrichDCGM|TestParseDCGM" -v` +Expected: PASS (all 6 tests) + +- [ ] **Step 6: Run full test suite** + +Run: `go test ./...` +Expected: All tests pass + +- [ ] **Step 7: Commit** + +```bash +git add internal/providers/k8s/metrics.go internal/providers/k8s/metrics_test.go go.mod go.sum +git commit -m "Add DCGM exporter scraping for K8s GPU metrics" +``` + +--- + +### Task 4: Prometheus Query Enrichment + +**Files:** +- Modify: `internal/providers/k8s/metrics.go` +- Modify: `internal/providers/k8s/metrics_test.go` + +This task adds the Prometheus query path — the third fallback. It supports both direct URL (`--prom-url`) and in-cluster service endpoint (`--prom-endpoint`), querying `avg_over_time(DCGM_FI_DEV_GPU_UTIL{node=~"..."}[7d])`. 
+
+- [ ] **Step 1: Write the failing tests**
+
+Add to `internal/providers/k8s/metrics_test.go`:
+
+```go
+import (
+	"net/http"
+	"net/http/httptest"
+	"strings"
+)
+```
+
+Add these test functions:
+
+```go
+func TestEnrichPrometheusMetrics_PopulatesFromDirectURL(t *testing.T) {
+	promResponse := `{
+		"status": "success",
+		"data": {
+			"resultType": "vector",
+			"result": [
+				{"metric": {"node": "i-node1"}, "value": [1700000000, "65.5"]},
+				{"metric": {"node": "i-node2"}, "value": [1700000000, "30.0"]}
+			]
+		}
+	}`
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/api/v1/query" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		query := r.URL.Query().Get("query")
+		// The enricher issues two queries (DCGM_FI_DEV_GPU_UTIL and
+		// DCGM_FI_DEV_MEM_COPY_UTIL), so assert on the shared DCGM prefix
+		// rather than the GPU-only metric name.
+		if !strings.Contains(query, "DCGM_FI_DEV_") {
+			t.Errorf("unexpected query: %s", query)
+		}
+		w.Header().Set("Content-Type", "application/json")
+		w.Write([]byte(promResponse))
+	}))
+	defer server.Close()
+
+	instances := []models.GPUInstance{
+		{InstanceID: "i-node1", Source: models.SourceK8sNode, Name: "cluster/i-node1"},
+		{InstanceID: "i-node2", Source: models.SourceK8sNode, Name: "cluster/i-node2"},
+	}
+	opts := PrometheusOptions{URL: server.URL}
+
+	enriched := EnrichPrometheusMetrics(context.Background(), nil, instances, opts)
+
+	if enriched != 2 {
+		t.Errorf("expected 2 enriched, got %d", enriched)
+	}
+	if instances[0].AvgGPUUtilization == nil || *instances[0].AvgGPUUtilization != 65.5 {
+		t.Errorf("expected node1 GPU util 65.5, got %v", instances[0].AvgGPUUtilization)
+	}
+	if instances[1].AvgGPUUtilization == nil || *instances[1].AvgGPUUtilization != 30.0 {
+		t.Errorf("expected node2 GPU util 30.0, got %v", instances[1].AvgGPUUtilization)
+	}
+}
+
+func TestEnrichPrometheusMetrics_SkipsAlreadyEnriched(t *testing.T) {
+	gpuUtil := 80.0
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[]}}`))
+	}))
+	defer server.Close()
+
+	instances := []models.GPUInstance{
+		{InstanceID: "i-node1", Source: models.SourceK8sNode, AvgGPUUtilization: &gpuUtil},
+	}
+	opts := PrometheusOptions{URL: server.URL}
+
+	enriched := EnrichPrometheusMetrics(context.Background(), nil, instances, opts)
+
+	if enriched != 0 {
+		t.Errorf("expected 0 enriched, got %d", enriched)
+	}
+}
+
+func TestEnrichPrometheusMetrics_NoOptions(t *testing.T) {
+	instances := []models.GPUInstance{
+		{InstanceID: "i-node1", Source: models.SourceK8sNode},
+	}
+
+	enriched := EnrichPrometheusMetrics(context.Background(), nil, instances, PrometheusOptions{})
+
+	if enriched != 0 {
+		t.Errorf("expected 0 enriched, got %d", enriched)
+	}
+}
+
+func TestEnrichPrometheusMetrics_InClusterEndpoint(t *testing.T) {
+	promResponse := `{
+		"status": "success",
+		"data": {
+			"resultType": "vector",
+			"result": [
+				{"metric": {"node": "i-node1"}, "value": [1700000000, "50.0"]}
+			]
+		}
+	}`
+	// mockK8sClient.ProxyGet keys on the full proxied path, query string
+	// included, so the keys must carry the URL-escaped PromQL queries that
+	// queryPrometheusProxy builds for the GPU and GPU-memory metrics.
+	client := &mockK8sClient{
+		nodes: &corev1.NodeList{},
+		pods:  &corev1.PodList{},
+		proxyData: map[string][]byte{
+			"monitoring/prometheus:9090/api/v1/query?query=avg_over_time%28DCGM_FI_DEV_GPU_UTIL%7Bnode%3D~%22i-node1%22%7D%5B7d%5D%29":      []byte(promResponse),
+			"monitoring/prometheus:9090/api/v1/query?query=avg_over_time%28DCGM_FI_DEV_MEM_COPY_UTIL%7Bnode%3D~%22i-node1%22%7D%5B7d%5D%29": []byte(`{"status":"success","data":{"resultType":"vector","result":[]}}`),
+		},
+	}
+	instances := []models.GPUInstance{
+		{InstanceID: "i-node1", Source: models.SourceK8sNode},
+	}
+	opts := PrometheusOptions{Endpoint: "monitoring/prometheus:9090"}
+
+	enriched := EnrichPrometheusMetrics(context.Background(), client, instances, opts)
+
+	if enriched != 1 {
+		t.Errorf("expected 1 enriched, got %d", enriched)
+	}
+	if instances[0].AvgGPUUtilization == nil || *instances[0].AvgGPUUtilization != 50.0 {
+		t.Errorf("expected 50.0, got %v", instances[0].AvgGPUUtilization)
+	}
+}
+
+func TestParsePrometheusEndpoint(t *testing.T) {
+	tests := []struct {
+		input     string
+		namespace string
+		service   string
+		port      string
+		wantErr   bool
+	}{
+		{"monitoring/prometheus:9090", "monitoring", "prometheus", "9090", false},
+		{"kube-system/thanos-query:10902", "kube-system", "thanos-query", "10902", false},
+		{"invalid", "", "", "", true},
+		{"ns/svc", "", "", "", true},
+	}
+	for _, tt := range tests {
+		ns, svc, port, err := parsePrometheusEndpoint(tt.input)
+		if (err != nil) != tt.wantErr {
+			t.Errorf("parsePrometheusEndpoint(%q): err=%v, wantErr=%v", tt.input, err, tt.wantErr)
+			continue
+		}
+		if ns != tt.namespace || svc != tt.service || port != tt.port {
+			t.Errorf("parsePrometheusEndpoint(%q) = (%q,%q,%q), want (%q,%q,%q)",
+				tt.input, ns, svc, port, tt.namespace, tt.service, tt.port)
+		}
+	}
+}
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run: `go test ./internal/providers/k8s/ -run "TestEnrichPrometheus|TestParsePrometheus" -v`
+Expected: FAIL — functions not defined
+
+- [ ] **Step 3: Implement Prometheus metrics enrichment**
+
+Add to `internal/providers/k8s/metrics.go` (additional imports at the top):
+
+```go
+import (
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/url"
+	"strconv"
+	"strings"
+)
+```
+
+Add these types and functions:
+
+```go
+// PrometheusOptions configures how to reach a Prometheus-compatible API.
+type PrometheusOptions struct {
+	URL      string
+	Endpoint string
+}
+
+// EnrichPrometheusMetrics queries a Prometheus endpoint for GPU utilization metrics
+// for K8s nodes that don't already have AvgGPUUtilization populated.
+func EnrichPrometheusMetrics(ctx context.Context, client K8sClient, instances []models.GPUInstance, opts PrometheusOptions) int { + if opts.URL == "" && opts.Endpoint == "" { + return 0 + } + + type nodeRef struct { + index int + name string + } + var nodes []nodeRef + for i := range instances { + inst := &instances[i] + if inst.Source != models.SourceK8sNode || inst.AvgGPUUtilization != nil { + continue + } + nodes = append(nodes, nodeRef{index: i, name: inst.InstanceID}) + } + if len(nodes) == 0 { + return 0 + } + + source := opts.URL + if source == "" { + source = opts.Endpoint + } + fmt.Fprintf(os.Stderr, " Querying Prometheus at %s...\n", source) + + nodeNames := make([]string, len(nodes)) + for i, n := range nodes { + nodeNames[i] = n.name + } + nodeRegex := strings.Join(nodeNames, "|") + + gpuResults := queryPrometheus(ctx, client, opts, + fmt.Sprintf(`avg_over_time(DCGM_FI_DEV_GPU_UTIL{node=~"%s"}[7d])`, nodeRegex)) + memResults := queryPrometheus(ctx, client, opts, + fmt.Sprintf(`avg_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{node=~"%s"}[7d])`, nodeRegex)) + + enriched := 0 + for _, node := range nodes { + if val, ok := gpuResults[node.name]; ok { + instances[node.index].AvgGPUUtilization = &val + if memVal, ok := memResults[node.name]; ok { + instances[node.index].AvgGPUMemUtilization = &memVal + } + enriched++ + } + } + + fmt.Fprintf(os.Stderr, " Prometheus: got GPU metrics for %d of %d remaining nodes\n", enriched, len(nodes)) + return enriched +} + +func queryPrometheus(ctx context.Context, client K8sClient, opts PrometheusOptions, query string) map[string]float64 { + var data []byte + var err error + + if opts.URL != "" { + data, err = queryPrometheusHTTP(ctx, opts.URL, query) + } else { + data, err = queryPrometheusProxy(ctx, client, opts.Endpoint, query) + } + if err != nil { + fmt.Fprintf(os.Stderr, " warning: Prometheus query failed: %v\n", err) + return nil + } + + return parsePrometheusResponse(data) +} + +func queryPrometheusHTTP(ctx context.Context, 
baseURL, query string) ([]byte, error) { + u := fmt.Sprintf("%s/api/v1/query?query=%s", strings.TrimRight(baseURL, "/"), url.QueryEscape(query)) + req, err := http.NewRequestWithContext(ctx, "GET", u, nil) + if err != nil { + return nil, err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + return io.ReadAll(resp.Body) +} + +func queryPrometheusProxy(ctx context.Context, client K8sClient, endpoint, query string) ([]byte, error) { + ns, svc, port, err := parsePrometheusEndpoint(endpoint) + if err != nil { + return nil, err + } + path := fmt.Sprintf("/api/v1/query?query=%s", url.QueryEscape(query)) + return client.ProxyGet(ctx, ns, svc, port, path) +} + +func parsePrometheusEndpoint(endpoint string) (namespace, service, port string, err error) { + slashIdx := strings.Index(endpoint, "/") + if slashIdx < 1 { + return "", "", "", fmt.Errorf("invalid endpoint format %q, expected namespace/service:port", endpoint) + } + namespace = endpoint[:slashIdx] + rest := endpoint[slashIdx+1:] + colonIdx := strings.LastIndex(rest, ":") + if colonIdx < 1 { + return "", "", "", fmt.Errorf("invalid endpoint format %q, expected namespace/service:port", endpoint) + } + service = rest[:colonIdx] + port = rest[colonIdx+1:] + return namespace, service, port, nil +} + +func parsePrometheusResponse(data []byte) map[string]float64 { + var resp struct { + Status string `json:"status"` + Data struct { + ResultType string `json:"resultType"` + Result []struct { + Metric map[string]string `json:"metric"` + Value []json.RawMessage `json:"value"` + } `json:"result"` + } `json:"data"` + } + if err := json.Unmarshal(data, &resp); err != nil { + return nil + } + if resp.Status != "success" { + return nil + } + + results := make(map[string]float64) + for _, r := range resp.Data.Result { + node := r.Metric["node"] + if node == "" || len(r.Value) < 2 { + continue + } + var valStr string + if err := json.Unmarshal(r.Value[1], &valStr); err != 
nil { + continue + } + val, err := strconv.ParseFloat(valStr, 64) + if err != nil { + continue + } + results[node] = val + } + return results +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/providers/k8s/ -run "TestEnrichPrometheus|TestParsePrometheus" -v` +Expected: PASS (all 5 tests) + +- [ ] **Step 5: Run full test suite** + +Run: `go test ./...` +Expected: All tests pass + +- [ ] **Step 6: Commit** + +```bash +git add internal/providers/k8s/metrics.go internal/providers/k8s/metrics_test.go +git commit -m "Add Prometheus query enrichment for K8s GPU metrics" +``` + +--- + +### Task 5: K8s Low GPU Utilization Analysis Rule + +**Files:** +- Modify: `internal/analysis/rules.go` +- Modify: `internal/analysis/rules_test.go` + +- [ ] **Step 1: Write the failing tests** + +Add to `internal/analysis/rules_test.go`: + +```go +func TestRuleK8sLowGPUUtil_FlagsLowUtilization(t *testing.T) { + inst := models.GPUInstance{ + InstanceID: "i-node1", + Source: models.SourceK8sNode, + State: "ready", + InstanceType: "g5.xlarge", + GPUModel: "A10G", + GPUCount: 1, + GPUAllocated: 1, + MonthlyCost: 734, + AvgGPUUtilization: ptr(3.5), + } + + ruleK8sLowGPUUtil(&inst) + + if len(inst.WasteSignals) != 1 { + t.Fatalf("expected 1 signal, got %d", len(inst.WasteSignals)) + } + if inst.WasteSignals[0].Type != "low_utilization" { + t.Errorf("expected low_utilization, got %s", inst.WasteSignals[0].Type) + } + if inst.WasteSignals[0].Severity != models.SeverityCritical { + t.Errorf("expected critical, got %s", inst.WasteSignals[0].Severity) + } + if inst.WasteSignals[0].Confidence != 0.85 { + t.Errorf("expected confidence 0.85, got %f", inst.WasteSignals[0].Confidence) + } + if len(inst.Recommendations) != 1 { + t.Fatalf("expected 1 recommendation, got %d", len(inst.Recommendations)) + } + if inst.Recommendations[0].MonthlySavings != 734*0.8 { + t.Errorf("expected savings %.0f, got %f", 734*0.8, inst.Recommendations[0].MonthlySavings) + } +} + +func 
TestRuleK8sLowGPUUtil_SkipsNonK8s(t *testing.T) { + inst := models.GPUInstance{ + Source: models.SourceEC2, + AvgGPUUtilization: ptr(3.5), + } + + ruleK8sLowGPUUtil(&inst) + + if len(inst.WasteSignals) != 0 { + t.Errorf("expected no signals for EC2 instance") + } +} + +func TestRuleK8sLowGPUUtil_SkipsNoMetrics(t *testing.T) { + inst := models.GPUInstance{ + Source: models.SourceK8sNode, + State: "ready", + } + + ruleK8sLowGPUUtil(&inst) + + if len(inst.WasteSignals) != 0 { + t.Errorf("expected no signals when metrics unavailable") + } +} + +func TestRuleK8sLowGPUUtil_SkipsHighUtilization(t *testing.T) { + inst := models.GPUInstance{ + Source: models.SourceK8sNode, + State: "ready", + AvgGPUUtilization: ptr(45.0), + } + + ruleK8sLowGPUUtil(&inst) + + if len(inst.WasteSignals) != 0 { + t.Errorf("expected no signals for well-utilized GPU") + } +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `go test ./internal/analysis/ -run TestRuleK8sLowGPUUtil -v` +Expected: FAIL — `ruleK8sLowGPUUtil` not defined + +- [ ] **Step 3: Implement the rule** + +In `internal/analysis/rules.go`, add `ruleK8sLowGPUUtil` to the rules slice inside `analyzeInstance()` (line 23-31). The full slice should be: + +```go + rules := []func(*models.GPUInstance){ + ruleIdle, + ruleOversizedGPU, + rulePricingMismatch, + ruleStale, + ruleSageMakerLowUtil, + ruleSageMakerOversized, + ruleK8sUnallocatedGPU, + ruleSpotEligible, + ruleK8sLowGPUUtil, + } +``` + +Then add the rule function at the end of the file: + +```go +// Rule 9: K8s GPU node with low GPU utilization (requires DCGM/CW/Prometheus metrics). 
+func ruleK8sLowGPUUtil(inst *models.GPUInstance) { + if inst.Source != models.SourceK8sNode { + return + } + if inst.AvgGPUUtilization == nil { + return + } + if *inst.AvgGPUUtilization >= 10 { + return + } + + inst.WasteSignals = append(inst.WasteSignals, models.WasteSignal{ + Type: "low_utilization", + Severity: models.SeverityCritical, + Confidence: 0.85, + Evidence: fmt.Sprintf("K8s GPU node utilization averaging %.1f%%. GPUs are allocated but barely used.", *inst.AvgGPUUtilization), + }) + inst.Recommendations = append(inst.Recommendations, models.Recommendation{ + Action: models.ActionDownsize, + Description: fmt.Sprintf("GPU utilization averaging %.1f%%. Consider bin-packing more workloads, downsizing, or removing from the node pool.", *inst.AvgGPUUtilization), + CurrentMonthlyCost: inst.MonthlyCost, + RecommendedMonthlyCost: inst.MonthlyCost * 0.2, + MonthlySavings: inst.MonthlyCost * 0.8, + SavingsPercent: 80, + Risk: models.RiskMedium, + }) +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/analysis/ -run TestRuleK8sLowGPUUtil -v` +Expected: PASS (all 4 tests) + +- [ ] **Step 5: Run full test suite** + +Run: `go test ./...` +Expected: All tests pass + +- [ ] **Step 6: Commit** + +```bash +git add internal/analysis/rules.go internal/analysis/rules_test.go +git commit -m "Add ruleK8sLowGPUUtil for utilization-based K8s GPU waste detection" +``` + +--- + +### Task 6: Wire Everything into CLI and Scan Flow + +**Files:** +- Modify: `cmd/gpuaudit/main.go` +- Modify: `internal/providers/k8s/scanner.go` + +This task adds the `--prom-url` and `--prom-endpoint` CLI flags, passes them through to the K8s scan, wires CloudWatch Container Insights enrichment, and orchestrates the fallback chain in `main.go`. 
+ +- [ ] **Step 1: Extend K8s ScanOptions** + +In `internal/providers/k8s/scanner.go`, change the `ScanOptions` struct (lines 20-23) from: + +```go +type ScanOptions struct { + Kubeconfig string + Context string +} +``` + +to: + +```go +type ScanOptions struct { + Kubeconfig string + Context string + PromURL string + PromEndpoint string +} +``` + +- [ ] **Step 2: Export BuildClient** + +Add to `internal/providers/k8s/scanner.go` after the existing `buildClient` function: + +```go +func BuildClientPublic(kubeconfigPath, contextName string) (K8sClient, string, error) { + return buildClient(kubeconfigPath, contextName) +} +``` + +- [ ] **Step 3: Add CLI flags** + +In `cmd/gpuaudit/main.go`, add the flag variables after `scanKubeContext` (around line 51): + +```go + scanPromURL string + scanPromEndpoint string +``` + +Add the flag registrations inside the first `init()` function, after the `--kube-context` flag (after line 73): + +```go + scanCmd.Flags().StringVar(&scanPromURL, "prom-url", "", "Prometheus URL for GPU metrics (e.g., https://prometheus.corp.example.com)") + scanCmd.Flags().StringVar(&scanPromEndpoint, "prom-endpoint", "", "In-cluster Prometheus service as namespace/service:port (e.g., monitoring/prometheus:9090)") +``` + +- [ ] **Step 4: Add flag validation and wiring in runScan** + +In `cmd/gpuaudit/main.go`, in the `runScan` function, add validation after `ctx := context.Background()` (line 84): + +```go + if scanPromURL != "" && scanPromEndpoint != "" { + return fmt.Errorf("--prom-url and --prom-endpoint are mutually exclusive") + } +``` + +Then modify the K8s scan section. 
Replace the block starting with `// Kubernetes API scan` (around lines 107-119) with: + +```go + // Kubernetes API scan + if !scanSkipK8s { + k8sOpts := k8sprovider.ScanOptions{ + Kubeconfig: scanKubeconfig, + Context: scanKubeContext, + PromURL: scanPromURL, + PromEndpoint: scanPromEndpoint, + } + k8sInstances, err := k8sprovider.Scan(ctx, k8sOpts) + if err != nil { + fmt.Fprintf(os.Stderr, " warning: Kubernetes scan failed: %v\n", err) + } else if len(k8sInstances) > 0 { + if !scanSkipMetrics { + enrichK8sGPUMetrics(ctx, k8sInstances, k8sOpts, opts) + } + analysis.AnalyzeAll(k8sInstances) + result.Instances = append(result.Instances, k8sInstances...) + result.Summary = awsprovider.BuildSummary(result.Instances) + } + } +``` + +- [ ] **Step 5: Add the enrichK8sGPUMetrics helper function** + +Add this function at the bottom of `cmd/gpuaudit/main.go`: + +```go +func enrichK8sGPUMetrics(ctx context.Context, instances []models.GPUInstance, k8sOpts k8sprovider.ScanOptions, awsOpts awsprovider.ScanOptions) { + // Source 1: CloudWatch Container Insights + if len(instances) > 0 && instances[0].ClusterName != "" { + cfgOpts := []func(*awsconfig.LoadOptions) error{} + if awsOpts.Profile != "" { + cfgOpts = append(cfgOpts, awsconfig.WithSharedConfigProfile(awsOpts.Profile)) + } + cfg, err := awsconfig.LoadDefaultConfig(ctx, cfgOpts...) 
+ if err == nil { + region := instances[0].Region + if region == "" { + region = "us-east-1" + } + cfg.Region = region + cwClient := cloudwatch.NewFromConfig(cfg) + fmt.Fprintf(os.Stderr, " Enriching K8s GPU metrics via CloudWatch Container Insights...\n") + awsprovider.EnrichK8sGPUMetrics(ctx, cwClient, instances, instances[0].ClusterName, awsprovider.DefaultMetricWindow) + + enriched := 0 + for _, inst := range instances { + if inst.AvgGPUUtilization != nil { + enriched++ + } + } + fmt.Fprintf(os.Stderr, " CloudWatch: got GPU metrics for %d of %d nodes\n", enriched, len(instances)) + } + } + + // Count remaining + remaining := 0 + for _, inst := range instances { + if inst.AvgGPUUtilization == nil { + remaining++ + } + } + + // Source 2: DCGM exporter scrape + if remaining > 0 { + client, _, err := k8sprovider.BuildClientPublic(k8sOpts.Kubeconfig, k8sOpts.Context) + if err == nil { + k8sprovider.EnrichDCGMMetrics(ctx, client, instances) + } + + remaining = 0 + for _, inst := range instances { + if inst.AvgGPUUtilization == nil { + remaining++ + } + } + } + + // Source 3: Prometheus query + if remaining > 0 && (k8sOpts.PromURL != "" || k8sOpts.PromEndpoint != "") { + var client k8sprovider.K8sClient + if k8sOpts.PromEndpoint != "" { + c, _, err := k8sprovider.BuildClientPublic(k8sOpts.Kubeconfig, k8sOpts.Context) + if err == nil { + client = c + } + } + promOpts := k8sprovider.PrometheusOptions{ + URL: k8sOpts.PromURL, + Endpoint: k8sOpts.PromEndpoint, + } + k8sprovider.EnrichPrometheusMetrics(ctx, client, instances, promOpts) + } +} +``` + +You will need to add the `"github.com/aws/aws-sdk-go-v2/service/cloudwatch"` import to `main.go` if it's not already present. + +- [ ] **Step 6: Run build and full test suite** + +Run: `go build ./... 
&& go test ./...` +Expected: Build succeeds, all tests pass + +- [ ] **Step 7: Commit** + +```bash +git add cmd/gpuaudit/main.go internal/providers/k8s/scanner.go +git commit -m "Wire K8s GPU metrics fallback chain into CLI scan flow" +``` From 879f2c19f29b6bc63a05f8b61f10f8638cb8352f Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 22:45:15 +0100 Subject: [PATCH 03/12] Add EnrichK8sGPUMetrics for CloudWatch Container Insights GPU metrics --- internal/providers/aws/cloudwatch.go | 57 ++++++++++ internal/providers/aws/cloudwatch_test.go | 125 ++++++++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 internal/providers/aws/cloudwatch_test.go diff --git a/internal/providers/aws/cloudwatch.go b/internal/providers/aws/cloudwatch.go index 819261c..b9d1978 100644 --- a/internal/providers/aws/cloudwatch.go +++ b/internal/providers/aws/cloudwatch.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "os" + "strings" "time" "github.com/aws/aws-sdk-go-v2/aws" @@ -79,6 +80,62 @@ func EnrichSageMakerMetrics(ctx context.Context, client CloudWatchClient, instan return nil } +// EnrichK8sGPUMetrics populates GPU utilization metrics on K8s nodes using CloudWatch Container Insights. 
+func EnrichK8sGPUMetrics(ctx context.Context, client CloudWatchClient, instances []models.GPUInstance, clusterName string, window MetricWindow) { + type nodeRef struct { + index int + instanceID string + } + var nodes []nodeRef + for i := range instances { + inst := &instances[i] + if inst.Source != models.SourceK8sNode { + continue + } + if inst.AvgGPUUtilization != nil { + continue + } + if !strings.HasPrefix(inst.InstanceID, "i-") { + continue + } + nodes = append(nodes, nodeRef{index: i, instanceID: inst.InstanceID}) + } + if len(nodes) == 0 { + return + } + + now := time.Now() + start := now.Add(-window.Duration) + + clusterDim := cwtypes.Dimension{ + Name: aws.String("ClusterName"), + Value: aws.String(clusterName), + } + + for _, node := range nodes { + instanceDim := cwtypes.Dimension{ + Name: aws.String("InstanceId"), + Value: aws.String(node.instanceID), + } + + safeID := strings.ReplaceAll(node.instanceID, "-", "_") + + queries := []cwtypes.MetricDataQuery{ + metricQuery2("gpu_util_"+safeID, "ContainerInsights", "node_gpu_utilization", "Average", window.Period, clusterDim, instanceDim), + metricQuery2("gpu_mem_"+safeID, "ContainerInsights", "node_gpu_memory_utilization", "Average", window.Period, clusterDim, instanceDim), + } + + results, err := fetchMetrics(ctx, client, queries, start, now) + if err != nil { + fmt.Fprintf(os.Stderr, " warning: Container Insights metrics unavailable for %s: %v\n", node.instanceID, err) + continue + } + + instances[node.index].AvgGPUUtilization = results["gpu_util_"+safeID] + instances[node.index].AvgGPUMemUtilization = results["gpu_mem_"+safeID] + } +} + func getEC2Metrics(ctx context.Context, client CloudWatchClient, instanceID string, window MetricWindow) (map[string]*float64, error) { now := time.Now() start := now.Add(-window.Duration) diff --git a/internal/providers/aws/cloudwatch_test.go b/internal/providers/aws/cloudwatch_test.go new file mode 100644 index 0000000..6dd1d8f --- /dev/null +++ 
b/internal/providers/aws/cloudwatch_test.go @@ -0,0 +1,125 @@ +// Copyright 2026 the gpuaudit authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package aws + +import ( + "context" + "fmt" + "testing" + + "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/service/cloudwatch" + cwtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" + + "github.com/gpuaudit/cli/internal/models" +) + +type mockCloudWatchClient struct { + output *cloudwatch.GetMetricDataOutput + err error +} + +func (m *mockCloudWatchClient) GetMetricData(ctx context.Context, params *cloudwatch.GetMetricDataInput, optFns ...func(*cloudwatch.Options)) (*cloudwatch.GetMetricDataOutput, error) { + if m.err != nil { + return nil, m.err + } + return m.output, nil +} + +func TestEnrichK8sGPUMetrics_PopulatesUtilization(t *testing.T) { + client := &mockCloudWatchClient{ + output: &cloudwatch.GetMetricDataOutput{ + MetricDataResults: []cwtypes.MetricDataResult{ + {Id: aws.String("gpu_util_i_abc123"), Values: []float64{45.0, 50.0, 55.0}}, + {Id: aws.String("gpu_mem_i_abc123"), Values: []float64{30.0, 35.0, 40.0}}, + }, + }, + } + instances := []models.GPUInstance{ + { + InstanceID: "i-abc123", + Source: models.SourceK8sNode, + }, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "ml-cluster", DefaultMetricWindow) + + if instances[0].AvgGPUUtilization == nil { + t.Fatal("expected GPU utilization to be populated") + } + if *instances[0].AvgGPUUtilization != 50.0 { + t.Errorf("expected avg GPU util 50.0, got %f", *instances[0].AvgGPUUtilization) + } + if instances[0].AvgGPUMemUtilization == nil { + t.Fatal("expected GPU memory utilization to be populated") + } + if *instances[0].AvgGPUMemUtilization != 35.0 { + t.Errorf("expected avg GPU mem util 35.0, got %f", *instances[0].AvgGPUMemUtilization) + } +} + +func TestEnrichK8sGPUMetrics_SkipsNonK8sNodes(t *testing.T) { + client := &mockCloudWatchClient{ + output: &cloudwatch.GetMetricDataOutput{}, + } 
+ instances := []models.GPUInstance{ + {InstanceID: "i-ec2", Source: models.SourceEC2}, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "cluster", DefaultMetricWindow) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil GPU util for non-K8s instance") + } +} + +func TestEnrichK8sGPUMetrics_SkipsNodesWithoutInstanceID(t *testing.T) { + client := &mockCloudWatchClient{ + output: &cloudwatch.GetMetricDataOutput{}, + } + instances := []models.GPUInstance{ + {InstanceID: "node-hostname", Source: models.SourceK8sNode}, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "cluster", DefaultMetricWindow) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil GPU util for node without EC2 instance ID") + } +} + +func TestEnrichK8sGPUMetrics_SkipsAlreadyEnriched(t *testing.T) { + gpuUtil := 75.0 + client := &mockCloudWatchClient{ + output: &cloudwatch.GetMetricDataOutput{}, + } + instances := []models.GPUInstance{ + { + InstanceID: "i-abc123", + Source: models.SourceK8sNode, + AvgGPUUtilization: &gpuUtil, + }, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "cluster", DefaultMetricWindow) + + if *instances[0].AvgGPUUtilization != 75.0 { + t.Errorf("expected existing value 75.0 to be preserved, got %f", *instances[0].AvgGPUUtilization) + } +} + +func TestEnrichK8sGPUMetrics_HandlesAPIError(t *testing.T) { + client := &mockCloudWatchClient{ + err: fmt.Errorf("access denied"), + } + instances := []models.GPUInstance{ + {InstanceID: "i-abc123", Source: models.SourceK8sNode}, + } + + EnrichK8sGPUMetrics(context.Background(), client, instances, "cluster", DefaultMetricWindow) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil GPU util after API error") + } +} From 9a176fa6c8f674a0bdb85e079660f4b7d839262a Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 22:46:52 +0100 Subject: [PATCH 04/12] Add ProxyGet to K8sClient interface for pod API proxy --- 
internal/providers/k8s/discover.go | 1 + internal/providers/k8s/discover_test.go | 17 +++++++++++++++-- internal/providers/k8s/scanner.go | 4 ++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/internal/providers/k8s/discover.go b/internal/providers/k8s/discover.go index 6df9ef0..14fe00c 100644 --- a/internal/providers/k8s/discover.go +++ b/internal/providers/k8s/discover.go @@ -24,6 +24,7 @@ const gpuResourceName corev1.ResourceName = "nvidia.com/gpu" type K8sClient interface { ListNodes(ctx context.Context, opts metav1.ListOptions) (*corev1.NodeList, error) ListPods(ctx context.Context, namespace string, opts metav1.ListOptions) (*corev1.PodList, error) + ProxyGet(ctx context.Context, namespace, podName, port, path string) ([]byte, error) } // DiscoverGPUNodes finds Kubernetes nodes with GPU capacity and reports their allocation. diff --git a/internal/providers/k8s/discover_test.go b/internal/providers/k8s/discover_test.go index 9d0cff1..016c9df 100644 --- a/internal/providers/k8s/discover_test.go +++ b/internal/providers/k8s/discover_test.go @@ -17,8 +17,10 @@ import ( ) type mockK8sClient struct { - nodes *corev1.NodeList - pods *corev1.PodList + nodes *corev1.NodeList + pods *corev1.PodList + proxyData map[string][]byte + proxyErr error } func (m *mockK8sClient) ListNodes(ctx context.Context, opts metav1.ListOptions) (*corev1.NodeList, error) { @@ -29,6 +31,17 @@ func (m *mockK8sClient) ListPods(ctx context.Context, namespace string, opts met return m.pods, nil } +func (m *mockK8sClient) ProxyGet(ctx context.Context, namespace, podName, port, path string) ([]byte, error) { + if m.proxyErr != nil { + return nil, m.proxyErr + } + key := fmt.Sprintf("%s/%s:%s%s", namespace, podName, port, path) + if data, ok := m.proxyData[key]; ok { + return data, nil + } + return nil, fmt.Errorf("no mock data for %s", key) +} + func gpuNode(name, instanceType string, gpuCount int, ready bool, created time.Time) corev1.Node { readyStatus := corev1.ConditionFalse 
if ready { diff --git a/internal/providers/k8s/scanner.go b/internal/providers/k8s/scanner.go index 67634f3..edea338 100644 --- a/internal/providers/k8s/scanner.go +++ b/internal/providers/k8s/scanner.go @@ -100,6 +100,10 @@ func (w *k8sClientWrapper) ListPods(ctx context.Context, namespace string, opts return w.clientset.CoreV1().Pods(namespace).List(ctx, opts) } +func (w *k8sClientWrapper) ProxyGet(ctx context.Context, namespace, podName, port, path string) ([]byte, error) { + return w.clientset.CoreV1().Pods(namespace).ProxyGet("http", podName, port, path, nil).DoRaw(ctx) +} + func defaultKubeconfig() string { home, err := os.UserHomeDir() if err != nil { From 4f54360f973ea8aee2362c382d9ff0d19ad03bf8 Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 22:50:49 +0100 Subject: [PATCH 05/12] Add DCGM exporter scraping for K8s GPU metrics Discovers dcgm-exporter pods via label selectors and scrapes their Prometheus metrics endpoint via kubectl proxy to populate GPU and GPU memory utilization on K8s node instances. Skips nodes that already have utilization data and gracefully handles scrape errors. 
--- go.mod | 17 ++- go.sum | 42 +++--- internal/providers/k8s/metrics.go | 132 +++++++++++++++++++ internal/providers/k8s/metrics_test.go | 172 +++++++++++++++++++++++++ 4 files changed, 338 insertions(+), 25 deletions(-) create mode 100644 internal/providers/k8s/metrics.go create mode 100644 internal/providers/k8s/metrics_test.go diff --git a/go.mod b/go.mod index b86d582..e6bceb9 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,8 @@ require ( github.com/aws/aws-sdk-go-v2/service/eks v1.82.0 github.com/aws/aws-sdk-go-v2/service/sagemaker v1.238.0 github.com/aws/aws-sdk-go-v2/service/sts v1.41.10 + github.com/prometheus/client_model v0.6.2 + github.com/prometheus/common v0.67.5 github.com/spf13/cobra v1.10.2 k8s.io/api v0.32.3 k8s.io/apimachinery v0.32.3 @@ -39,7 +41,7 @@ require ( github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/google/gnostic-models v0.6.8 // indirect - github.com/google/go-cmp v0.6.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect @@ -52,13 +54,14 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/spf13/pflag v1.0.9 // indirect github.com/x448/float16 v0.8.4 // indirect - golang.org/x/net v0.30.0 // indirect - golang.org/x/oauth2 v0.23.0 // indirect - golang.org/x/sys v0.26.0 // indirect - golang.org/x/term v0.25.0 // indirect - golang.org/x/text v0.19.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + golang.org/x/net v0.48.0 // indirect + golang.org/x/oauth2 v0.34.0 // indirect + golang.org/x/sys v0.39.0 // indirect + golang.org/x/term v0.38.0 // indirect + golang.org/x/text v0.32.0 // indirect golang.org/x/time v0.7.0 // indirect - google.golang.org/protobuf v1.35.1 // indirect + google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect 
gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index c4d6139..08691a8 100644 --- a/go.sum +++ b/go.sum @@ -65,8 +65,8 @@ github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6 github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -107,6 +107,10 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= 
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -121,12 +125,14 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -137,38 +143,38 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod 
h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= -golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= -golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= -golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= +golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= +golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= -golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= -golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= +golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= +golang.org/x/term 
v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= -golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= +golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= -golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= +golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= +golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= -google.golang.org/protobuf 
v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/internal/providers/k8s/metrics.go b/internal/providers/k8s/metrics.go new file mode 100644 index 0000000..c487a45 --- /dev/null +++ b/internal/providers/k8s/metrics.go @@ -0,0 +1,132 @@ +// Copyright 2026 the gpuaudit authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package k8s + +import ( + "bytes" + "context" + "fmt" + "os" + + dto "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" + "github.com/prometheus/common/model" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/gpuaudit/cli/internal/models" +) + +// EnrichDCGMMetrics discovers dcgm-exporter pods and scrapes GPU metrics for K8s nodes +// that don't already have AvgGPUUtilization populated. Returns the number of nodes enriched. 
+func EnrichDCGMMetrics(ctx context.Context, client K8sClient, instances []models.GPUInstance) int { + needsMetrics := make(map[string]int) + for i := range instances { + inst := &instances[i] + if inst.Source != models.SourceK8sNode || inst.AvgGPUUtilization != nil { + continue + } + needsMetrics[inst.InstanceID] = i + } + if len(needsMetrics) == 0 { + return 0 + } + + dcgmPods, err := findDCGMPods(ctx, client) + if err != nil { + fmt.Fprintf(os.Stderr, " warning: could not list DCGM exporter pods: %v\n", err) + return 0 + } + if len(dcgmPods) == 0 { + fmt.Fprintf(os.Stderr, " DCGM exporter not detected, skipping\n") + return 0 + } + + fmt.Fprintf(os.Stderr, " Probing DCGM exporter on GPU nodes...\n") + + enriched := 0 + for _, pod := range dcgmPods { + idx, ok := needsMetrics[pod.Spec.NodeName] + if !ok { + continue + } + + data, err := client.ProxyGet(ctx, pod.Namespace, pod.Name, "9400", "/metrics") + if err != nil { + fmt.Fprintf(os.Stderr, " warning: DCGM scrape failed for %s: %v\n", pod.Spec.NodeName, err) + continue + } + + gpuUtil, memUtil := parseDCGMMetrics(data) + if gpuUtil != nil { + instances[idx].AvgGPUUtilization = gpuUtil + instances[idx].AvgGPUMemUtilization = memUtil + enriched++ + } + } + + fmt.Fprintf(os.Stderr, " DCGM: got GPU metrics for %d of %d remaining nodes\n", enriched, len(needsMetrics)) + return enriched +} + +func findDCGMPods(ctx context.Context, client K8sClient) ([]corev1.Pod, error) { + podList, err := client.ListPods(ctx, "", metav1.ListOptions{ + LabelSelector: "app.kubernetes.io/name=dcgm-exporter", + }) + if err != nil { + return nil, err + } + if len(podList.Items) > 0 { + return runningPods(podList.Items), nil + } + + podList, err = client.ListPods(ctx, "", metav1.ListOptions{ + LabelSelector: "app=nvidia-dcgm-exporter", + }) + if err != nil { + return nil, err + } + return runningPods(podList.Items), nil +} + +func runningPods(pods []corev1.Pod) []corev1.Pod { + var result []corev1.Pod + for _, p := range pods { + if 
p.Status.Phase == corev1.PodRunning { + result = append(result, p) + } + } + return result +} + +func parseDCGMMetrics(data []byte) (gpuUtil, memUtil *float64) { + parser := expfmt.NewTextParser(model.LegacyValidation) + families, err := parser.TextToMetricFamilies(bytes.NewReader(data)) + if err != nil { + return nil, nil + } + + gpuUtil = avgMetricValue(families["DCGM_FI_DEV_GPU_UTIL"]) + memUtil = avgMetricValue(families["DCGM_FI_DEV_MEM_COPY_UTIL"]) + return gpuUtil, memUtil +} + +func avgMetricValue(family *dto.MetricFamily) *float64 { + if family == nil || len(family.Metric) == 0 { + return nil + } + sum := 0.0 + count := 0 + for _, m := range family.Metric { + if m.Gauge != nil && m.Gauge.Value != nil { + sum += *m.Gauge.Value + count++ + } + } + if count == 0 { + return nil + } + avg := sum / float64(count) + return &avg +} diff --git a/internal/providers/k8s/metrics_test.go b/internal/providers/k8s/metrics_test.go new file mode 100644 index 0000000..01103cd --- /dev/null +++ b/internal/providers/k8s/metrics_test.go @@ -0,0 +1,172 @@ +// Copyright 2026 the gpuaudit authors. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package k8s + +import ( + "context" + "fmt" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/gpuaudit/cli/internal/models" +) + +func dcgmPod(name, namespace, nodeName string) corev1.Pod { + return corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: map[string]string{ + "app.kubernetes.io/name": "dcgm-exporter", + }, + }, + Spec: corev1.PodSpec{ + NodeName: nodeName, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + }, + } +} + +const sampleDCGMMetrics = `# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization. 
+# TYPE DCGM_FI_DEV_GPU_UTIL gauge +DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-abc",device="nvidia0",modelName="NVIDIA A10G",Hostname="node1"} 42.0 +DCGM_FI_DEV_GPU_UTIL{gpu="1",UUID="GPU-def",device="nvidia1",modelName="NVIDIA A10G",Hostname="node1"} 38.0 +# HELP DCGM_FI_DEV_MEM_COPY_UTIL GPU memory utilization. +# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-abc",device="nvidia0",modelName="NVIDIA A10G",Hostname="node1"} 55.0 +DCGM_FI_DEV_MEM_COPY_UTIL{gpu="1",UUID="GPU-def",device="nvidia1",modelName="NVIDIA A10G",Hostname="node1"} 60.0 +` + +func TestEnrichDCGMMetrics_PopulatesUtilization(t *testing.T) { + client := &mockK8sClient{ + nodes: &corev1.NodeList{}, + pods: &corev1.PodList{ + Items: []corev1.Pod{ + dcgmPod("dcgm-exporter-abc", "gpu-operator", "i-node1"), + }, + }, + proxyData: map[string][]byte{ + "gpu-operator/dcgm-exporter-abc:9400/metrics": []byte(sampleDCGMMetrics), + }, + } + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode, Name: "cluster/i-node1"}, + } + + enriched := EnrichDCGMMetrics(context.Background(), client, instances) + + if instances[0].AvgGPUUtilization == nil { + t.Fatal("expected GPU utilization to be populated") + } + if *instances[0].AvgGPUUtilization != 40.0 { + t.Errorf("expected avg GPU util 40.0 (average of 42 and 38), got %f", *instances[0].AvgGPUUtilization) + } + if instances[0].AvgGPUMemUtilization == nil { + t.Fatal("expected GPU memory utilization to be populated") + } + if *instances[0].AvgGPUMemUtilization != 57.5 { + t.Errorf("expected avg GPU mem util 57.5 (average of 55 and 60), got %f", *instances[0].AvgGPUMemUtilization) + } + if enriched != 1 { + t.Errorf("expected 1 enriched node, got %d", enriched) + } +} + +func TestEnrichDCGMMetrics_SkipsAlreadyEnriched(t *testing.T) { + gpuUtil := 75.0 + client := &mockK8sClient{ + nodes: &corev1.NodeList{}, + pods: &corev1.PodList{ + Items: []corev1.Pod{ + dcgmPod("dcgm-exporter-abc", "gpu-operator", 
"i-node1"), + }, + }, + proxyData: map[string][]byte{ + "gpu-operator/dcgm-exporter-abc:9400/metrics": []byte(sampleDCGMMetrics), + }, + } + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode, AvgGPUUtilization: &gpuUtil}, + } + + enriched := EnrichDCGMMetrics(context.Background(), client, instances) + + if *instances[0].AvgGPUUtilization != 75.0 { + t.Error("should not overwrite existing utilization") + } + if enriched != 0 { + t.Errorf("expected 0 enriched nodes, got %d", enriched) + } +} + +func TestEnrichDCGMMetrics_NoDCGMPods(t *testing.T) { + client := &mockK8sClient{ + nodes: &corev1.NodeList{}, + pods: &corev1.PodList{Items: []corev1.Pod{}}, + } + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode}, + } + + enriched := EnrichDCGMMetrics(context.Background(), client, instances) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil when no DCGM pods") + } + if enriched != 0 { + t.Errorf("expected 0 enriched, got %d", enriched) + } +} + +func TestEnrichDCGMMetrics_HandlesScrapeError(t *testing.T) { + client := &mockK8sClient{ + nodes: &corev1.NodeList{}, + pods: &corev1.PodList{ + Items: []corev1.Pod{ + dcgmPod("dcgm-exporter-abc", "gpu-operator", "i-node1"), + }, + }, + proxyErr: fmt.Errorf("connection refused"), + } + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode}, + } + + enriched := EnrichDCGMMetrics(context.Background(), client, instances) + + if instances[0].AvgGPUUtilization != nil { + t.Error("expected nil after scrape error") + } + if enriched != 0 { + t.Errorf("expected 0 enriched, got %d", enriched) + } +} + +func TestParseDCGMMetrics(t *testing.T) { + gpuUtil, memUtil := parseDCGMMetrics([]byte(sampleDCGMMetrics)) + + if gpuUtil == nil { + t.Fatal("expected gpu util") + } + if *gpuUtil != 40.0 { + t.Errorf("expected 40.0, got %f", *gpuUtil) + } + if memUtil == nil { + t.Fatal("expected mem util") + } + if 
*memUtil != 57.5 { + t.Errorf("expected 57.5, got %f", *memUtil) + } +} + +func TestParseDCGMMetrics_EmptyInput(t *testing.T) { + gpuUtil, memUtil := parseDCGMMetrics([]byte("")) + if gpuUtil != nil || memUtil != nil { + t.Error("expected nil for empty input") + } +} From 98003efb1d00026d63a4037620178ef0b47c33fc Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 22:54:16 +0100 Subject: [PATCH 06/12] Add Prometheus query enrichment for K8s GPU metrics --- internal/providers/k8s/metrics.go | 160 +++++++++++++++++++++++++ internal/providers/k8s/metrics_test.go | 142 ++++++++++++++++++++++ 2 files changed, 302 insertions(+) diff --git a/internal/providers/k8s/metrics.go b/internal/providers/k8s/metrics.go index c487a45..ef47470 100644 --- a/internal/providers/k8s/metrics.go +++ b/internal/providers/k8s/metrics.go @@ -6,8 +6,14 @@ package k8s import ( "bytes" "context" + "encoding/json" "fmt" + "io" + "net/http" + "net/url" "os" + "strconv" + "strings" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" @@ -130,3 +136,157 @@ func avgMetricValue(family *dto.MetricFamily) *float64 { avg := sum / float64(count) return &avg } + +// PrometheusOptions configures how to reach a Prometheus-compatible API. +type PrometheusOptions struct { + URL string + Endpoint string +} + +// EnrichPrometheusMetrics queries a Prometheus endpoint for GPU utilization metrics +// for K8s nodes that don't already have AvgGPUUtilization populated. 
+func EnrichPrometheusMetrics(ctx context.Context, client K8sClient, instances []models.GPUInstance, opts PrometheusOptions) int { + if opts.URL == "" && opts.Endpoint == "" { + return 0 + } + + type nodeRef struct { + index int + name string + } + var nodes []nodeRef + for i := range instances { + inst := &instances[i] + if inst.Source != models.SourceK8sNode || inst.AvgGPUUtilization != nil { + continue + } + nodes = append(nodes, nodeRef{index: i, name: inst.InstanceID}) + } + if len(nodes) == 0 { + return 0 + } + + source := opts.URL + if source == "" { + source = opts.Endpoint + } + fmt.Fprintf(os.Stderr, " Querying Prometheus at %s...\n", source) + + nodeNames := make([]string, len(nodes)) + for i, n := range nodes { + nodeNames[i] = n.name + } + nodeRegex := strings.Join(nodeNames, "|") + + gpuResults := queryPrometheus(ctx, client, opts, + fmt.Sprintf(`avg_over_time(DCGM_FI_DEV_GPU_UTIL{node=~"%s"}[7d])`, nodeRegex)) + memResults := queryPrometheus(ctx, client, opts, + fmt.Sprintf(`avg_over_time(DCGM_FI_DEV_MEM_COPY_UTIL{node=~"%s"}[7d])`, nodeRegex)) + + enriched := 0 + for _, node := range nodes { + if val, ok := gpuResults[node.name]; ok { + instances[node.index].AvgGPUUtilization = &val + if memVal, ok := memResults[node.name]; ok { + instances[node.index].AvgGPUMemUtilization = &memVal + } + enriched++ + } + } + + fmt.Fprintf(os.Stderr, " Prometheus: got GPU metrics for %d of %d remaining nodes\n", enriched, len(nodes)) + return enriched +} + +func queryPrometheus(ctx context.Context, client K8sClient, opts PrometheusOptions, query string) map[string]float64 { + var data []byte + var err error + + if opts.URL != "" { + data, err = queryPrometheusHTTP(ctx, opts.URL, query) + } else { + data, err = queryPrometheusProxy(ctx, client, opts.Endpoint, query) + } + if err != nil { + fmt.Fprintf(os.Stderr, " warning: Prometheus query failed: %v\n", err) + return nil + } + + return parsePrometheusResponse(data) +} + +func queryPrometheusHTTP(ctx context.Context, 
baseURL, query string) ([]byte, error) { + u := fmt.Sprintf("%s/api/v1/query?query=%s", strings.TrimRight(baseURL, "/"), url.QueryEscape(query)) + req, err := http.NewRequestWithContext(ctx, "GET", u, nil) + if err != nil { + return nil, err + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + return io.ReadAll(resp.Body) +} + +func queryPrometheusProxy(ctx context.Context, client K8sClient, endpoint, query string) ([]byte, error) { + ns, svc, port, err := parsePrometheusEndpoint(endpoint) + if err != nil { + return nil, err + } + path := fmt.Sprintf("/api/v1/query?query=%s", url.QueryEscape(query)) + return client.ProxyGet(ctx, ns, svc, port, path) +} + +func parsePrometheusEndpoint(endpoint string) (namespace, service, port string, err error) { + slashIdx := strings.Index(endpoint, "/") + if slashIdx < 1 { + return "", "", "", fmt.Errorf("invalid endpoint format %q, expected namespace/service:port", endpoint) + } + namespace = endpoint[:slashIdx] + rest := endpoint[slashIdx+1:] + colonIdx := strings.LastIndex(rest, ":") + if colonIdx < 1 { + return "", "", "", fmt.Errorf("invalid endpoint format %q, expected namespace/service:port", endpoint) + } + service = rest[:colonIdx] + port = rest[colonIdx+1:] + return namespace, service, port, nil +} + +func parsePrometheusResponse(data []byte) map[string]float64 { + var resp struct { + Status string `json:"status"` + Data struct { + ResultType string `json:"resultType"` + Result []struct { + Metric map[string]string `json:"metric"` + Value []json.RawMessage `json:"value"` + } `json:"result"` + } `json:"data"` + } + if err := json.Unmarshal(data, &resp); err != nil { + return nil + } + if resp.Status != "success" { + return nil + } + + results := make(map[string]float64) + for _, r := range resp.Data.Result { + node := r.Metric["node"] + if node == "" || len(r.Value) < 2 { + continue + } + var valStr string + if err := json.Unmarshal(r.Value[1], &valStr); err != 
nil { + continue + } + val, err := strconv.ParseFloat(valStr, 64) + if err != nil { + continue + } + results[node] = val + } + return results +} diff --git a/internal/providers/k8s/metrics_test.go b/internal/providers/k8s/metrics_test.go index 01103cd..329d7eb 100644 --- a/internal/providers/k8s/metrics_test.go +++ b/internal/providers/k8s/metrics_test.go @@ -6,6 +6,9 @@ package k8s import ( "context" "fmt" + "net/http" + "net/http/httptest" + "strings" "testing" corev1 "k8s.io/api/core/v1" @@ -170,3 +173,142 @@ func TestParseDCGMMetrics_EmptyInput(t *testing.T) { t.Error("expected nil for empty input") } } + +func TestEnrichPrometheusMetrics_PopulatesFromDirectURL(t *testing.T) { + promResponse := `{ + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"node": "i-node1"}, "value": [1700000000, "65.5"]}, + {"metric": {"node": "i-node2"}, "value": [1700000000, "30.0"]} + ] + } + }` + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/query" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + query := r.URL.Query().Get("query") + if !strings.Contains(query, "DCGM_FI_DEV_GPU_UTIL") && !strings.Contains(query, "DCGM_FI_DEV_MEM_COPY_UTIL") { + t.Errorf("unexpected query: %s", query) + } + w.Header().Set("Content-Type", "application/json") + w.Write([]byte(promResponse)) + })) + defer server.Close() + + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode, Name: "cluster/i-node1"}, + {InstanceID: "i-node2", Source: models.SourceK8sNode, Name: "cluster/i-node2"}, + } + opts := PrometheusOptions{URL: server.URL} + + enriched := EnrichPrometheusMetrics(context.Background(), nil, instances, opts) + + if enriched != 2 { + t.Errorf("expected 2 enriched, got %d", enriched) + } + if instances[0].AvgGPUUtilization == nil || *instances[0].AvgGPUUtilization != 65.5 { + t.Errorf("expected node1 GPU util 65.5, got %v", 
instances[0].AvgGPUUtilization) + } + if instances[1].AvgGPUUtilization == nil || *instances[1].AvgGPUUtilization != 30.0 { + t.Errorf("expected node2 GPU util 30.0, got %v", instances[1].AvgGPUUtilization) + } +} + +func TestEnrichPrometheusMetrics_SkipsAlreadyEnriched(t *testing.T) { + gpuUtil := 80.0 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[]}}`)) + })) + defer server.Close() + + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode, AvgGPUUtilization: &gpuUtil}, + } + opts := PrometheusOptions{URL: server.URL} + + enriched := EnrichPrometheusMetrics(context.Background(), nil, instances, opts) + + if enriched != 0 { + t.Errorf("expected 0 enriched, got %d", enriched) + } +} + +func TestEnrichPrometheusMetrics_NoOptions(t *testing.T) { + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode}, + } + + enriched := EnrichPrometheusMetrics(context.Background(), nil, instances, PrometheusOptions{}) + + if enriched != 0 { + t.Errorf("expected 0 enriched, got %d", enriched) + } +} + +func TestEnrichPrometheusMetrics_InClusterEndpoint(t *testing.T) { + promResponse := `{ + "status": "success", + "data": { + "resultType": "vector", + "result": [ + {"metric": {"node": "i-node1"}, "value": [1700000000, "50.0"]} + ] + } + }` + instances := []models.GPUInstance{ + {InstanceID: "i-node1", Source: models.SourceK8sNode}, + } + opts := PrometheusOptions{Endpoint: "monitoring/prometheus:9090"} + + // Use a custom client that returns promResponse for any ProxyGet to monitoring/prometheus + customClient := &promMockClient{response: []byte(promResponse)} + + enriched := EnrichPrometheusMetrics(context.Background(), customClient, instances, opts) + + if enriched != 1 { + t.Errorf("expected 1 enriched, got %d", enriched) + } + if 
instances[0].AvgGPUUtilization == nil || *instances[0].AvgGPUUtilization != 50.0 { + t.Errorf("expected 50.0, got %v", instances[0].AvgGPUUtilization) + } +} + +// promMockClient is a specialized mock that always returns a fixed response for ProxyGet. +type promMockClient struct { + mockK8sClient + response []byte +} + +func (m *promMockClient) ProxyGet(ctx context.Context, namespace, podName, port, path string) ([]byte, error) { + return m.response, nil +} + +func TestParsePrometheusEndpoint(t *testing.T) { + tests := []struct { + input string + namespace string + service string + port string + wantErr bool + }{ + {"monitoring/prometheus:9090", "monitoring", "prometheus", "9090", false}, + {"kube-system/thanos-query:10902", "kube-system", "thanos-query", "10902", false}, + {"invalid", "", "", "", true}, + {"ns/svc", "", "", "", true}, + } + for _, tt := range tests { + ns, svc, port, err := parsePrometheusEndpoint(tt.input) + if (err != nil) != tt.wantErr { + t.Errorf("parsePrometheusEndpoint(%q): err=%v, wantErr=%v", tt.input, err, tt.wantErr) + continue + } + if ns != tt.namespace || svc != tt.service || port != tt.port { + t.Errorf("parsePrometheusEndpoint(%q) = (%q,%q,%q), want (%q,%q,%q)", + tt.input, ns, svc, port, tt.namespace, tt.service, tt.port) + } + } +} From 1a35f95c134aa714287c59a146bfacb10822ce6d Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 22:56:16 +0100 Subject: [PATCH 07/12] Add ruleK8sLowGPUUtil for utilization-based K8s GPU waste detection --- internal/analysis/rules.go | 30 +++++++++++++ internal/analysis/rules_test.go | 75 +++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/internal/analysis/rules.go b/internal/analysis/rules.go index f91bcbe..8b03d7b 100644 --- a/internal/analysis/rules.go +++ b/internal/analysis/rules.go @@ -28,6 +28,7 @@ func analyzeInstance(inst *models.GPUInstance) { ruleSageMakerLowUtil, ruleSageMakerOversized, ruleK8sUnallocatedGPU, + ruleK8sLowGPUUtil, } for _, rule 
:= range rules { rule(inst) @@ -347,3 +348,32 @@ func ruleK8sUnallocatedGPU(inst *models.GPUInstance) { }) } } + +// Rule 8: K8s GPU node with low GPU utilization (requires DCGM/CW/Prometheus metrics). +func ruleK8sLowGPUUtil(inst *models.GPUInstance) { + if inst.Source != models.SourceK8sNode { + return + } + if inst.AvgGPUUtilization == nil { + return + } + if *inst.AvgGPUUtilization >= 10 { + return + } + + inst.WasteSignals = append(inst.WasteSignals, models.WasteSignal{ + Type: "low_utilization", + Severity: models.SeverityCritical, + Confidence: 0.85, + Evidence: fmt.Sprintf("K8s GPU node utilization averaging %.1f%%. GPUs are allocated but barely used.", *inst.AvgGPUUtilization), + }) + inst.Recommendations = append(inst.Recommendations, models.Recommendation{ + Action: models.ActionDownsize, + Description: fmt.Sprintf("GPU utilization averaging %.1f%%. Consider bin-packing more workloads, downsizing, or removing from the node pool.", *inst.AvgGPUUtilization), + CurrentMonthlyCost: inst.MonthlyCost, + RecommendedMonthlyCost: inst.MonthlyCost * 0.2, + MonthlySavings: inst.MonthlyCost * 0.8, + SavingsPercent: 80, + Risk: models.RiskMedium, + }) +} diff --git a/internal/analysis/rules_test.go b/internal/analysis/rules_test.go index d8d264d..c1d6223 100644 --- a/internal/analysis/rules_test.go +++ b/internal/analysis/rules_test.go @@ -259,3 +259,78 @@ func TestAnalyzeAll_ComputesSavings(t *testing.T) { t.Errorf("expected no signals for healthy instance, got %d", len(instances[1].WasteSignals)) } } + +func TestRuleK8sLowGPUUtil_FlagsLowUtilization(t *testing.T) { + inst := models.GPUInstance{ + InstanceID: "i-node1", + Source: models.SourceK8sNode, + State: "ready", + InstanceType: "g5.xlarge", + GPUModel: "A10G", + GPUCount: 1, + GPUAllocated: 1, + MonthlyCost: 734, + AvgGPUUtilization: ptr(3.5), + } + + ruleK8sLowGPUUtil(&inst) + + if len(inst.WasteSignals) != 1 { + t.Fatalf("expected 1 signal, got %d", len(inst.WasteSignals)) + } + if inst.WasteSignals[0].Type 
!= "low_utilization" { + t.Errorf("expected low_utilization, got %s", inst.WasteSignals[0].Type) + } + if inst.WasteSignals[0].Severity != models.SeverityCritical { + t.Errorf("expected critical, got %s", inst.WasteSignals[0].Severity) + } + if inst.WasteSignals[0].Confidence != 0.85 { + t.Errorf("expected confidence 0.85, got %f", inst.WasteSignals[0].Confidence) + } + if len(inst.Recommendations) != 1 { + t.Fatalf("expected 1 recommendation, got %d", len(inst.Recommendations)) + } + if inst.Recommendations[0].MonthlySavings != 734*0.8 { + t.Errorf("expected savings %.0f, got %f", 734*0.8, inst.Recommendations[0].MonthlySavings) + } +} + +func TestRuleK8sLowGPUUtil_SkipsNonK8s(t *testing.T) { + inst := models.GPUInstance{ + Source: models.SourceEC2, + AvgGPUUtilization: ptr(3.5), + } + + ruleK8sLowGPUUtil(&inst) + + if len(inst.WasteSignals) != 0 { + t.Errorf("expected no signals for EC2 instance") + } +} + +func TestRuleK8sLowGPUUtil_SkipsNoMetrics(t *testing.T) { + inst := models.GPUInstance{ + Source: models.SourceK8sNode, + State: "ready", + } + + ruleK8sLowGPUUtil(&inst) + + if len(inst.WasteSignals) != 0 { + t.Errorf("expected no signals when metrics unavailable") + } +} + +func TestRuleK8sLowGPUUtil_SkipsHighUtilization(t *testing.T) { + inst := models.GPUInstance{ + Source: models.SourceK8sNode, + State: "ready", + AvgGPUUtilization: ptr(45.0), + } + + ruleK8sLowGPUUtil(&inst) + + if len(inst.WasteSignals) != 0 { + t.Errorf("expected no signals for well-utilized GPU") + } +} From 89d9cb35e58364fb965834be46be8ad41d39267a Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 23:02:04 +0100 Subject: [PATCH 08/12] Wire K8s GPU metrics fallback chain into CLI scan flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add --prom-url and --prom-endpoint flags (mutually exclusive) for Prometheus GPU metrics. 
Orchestrate the 3-source fallback chain (CloudWatch Container Insights → DCGM scrape → Prometheus) between K8s discovery and analysis. --- cmd/gpuaudit/main.go | 83 +++++++++++++++++++++++++++++-- internal/providers/k8s/scanner.go | 11 +++- 2 files changed, 87 insertions(+), 7 deletions(-) diff --git a/cmd/gpuaudit/main.go b/cmd/gpuaudit/main.go index ce8d61e..2aca4b5 100644 --- a/cmd/gpuaudit/main.go +++ b/cmd/gpuaudit/main.go @@ -13,12 +13,15 @@ import ( "github.com/spf13/cobra" - "github.com/gpuaudit/cli/internal/models" + awsconfig "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/cloudwatch" + "github.com/gpuaudit/cli/internal/analysis" - awsprovider "github.com/gpuaudit/cli/internal/providers/aws" - k8sprovider "github.com/gpuaudit/cli/internal/providers/k8s" + "github.com/gpuaudit/cli/internal/models" "github.com/gpuaudit/cli/internal/output" "github.com/gpuaudit/cli/internal/pricing" + awsprovider "github.com/gpuaudit/cli/internal/providers/aws" + k8sprovider "github.com/gpuaudit/cli/internal/providers/k8s" ) var version = "dev" @@ -49,6 +52,8 @@ var ( scanSkipCosts bool scanKubeconfig string scanKubeContext string + scanPromURL string + scanPromEndpoint string scanExcludeTags []string scanMinUptimeDays int ) @@ -71,6 +76,8 @@ func init() { scanCmd.Flags().BoolVar(&scanSkipCosts, "skip-costs", false, "Skip Cost Explorer data enrichment") scanCmd.Flags().StringVar(&scanKubeconfig, "kubeconfig", "", "Path to kubeconfig file (default: ~/.kube/config)") scanCmd.Flags().StringVar(&scanKubeContext, "kube-context", "", "Kubernetes context to use (default: current context)") + scanCmd.Flags().StringVar(&scanPromURL, "prom-url", "", "Prometheus URL for GPU metrics (e.g., https://prometheus.corp.example.com)") + scanCmd.Flags().StringVar(&scanPromEndpoint, "prom-endpoint", "", "In-cluster Prometheus service as namespace/service:port (e.g., monitoring/prometheus:9090)") scanCmd.Flags().StringSliceVar(&scanExcludeTags, "exclude-tag", nil, 
"Exclude instances matching tag (key=value, repeatable)") scanCmd.Flags().IntVar(&scanMinUptimeDays, "min-uptime-days", 0, "Only flag instances running for at least this many days") @@ -81,6 +88,10 @@ func init() { } func runScan(cmd *cobra.Command, args []string) error { + if scanPromURL != "" && scanPromEndpoint != "" { + return fmt.Errorf("--prom-url and --prom-endpoint are mutually exclusive") + } + ctx := context.Background() opts := awsprovider.DefaultScanOptions() @@ -106,13 +117,18 @@ func runScan(cmd *cobra.Command, args []string) error { // Kubernetes API scan if !scanSkipK8s { k8sOpts := k8sprovider.ScanOptions{ - Kubeconfig: scanKubeconfig, - Context: scanKubeContext, + Kubeconfig: scanKubeconfig, + Context: scanKubeContext, + PromURL: scanPromURL, + PromEndpoint: scanPromEndpoint, } k8sInstances, err := k8sprovider.Scan(ctx, k8sOpts) if err != nil { fmt.Fprintf(os.Stderr, " warning: Kubernetes scan failed: %v\n", err) } else if len(k8sInstances) > 0 { + if !scanSkipMetrics { + enrichK8sGPUMetrics(ctx, k8sInstances, k8sOpts, opts) + } analysis.AnalyzeAll(k8sInstances) result.Instances = append(result.Instances, k8sInstances...) result.Summary = awsprovider.BuildSummary(result.Instances) @@ -300,3 +316,60 @@ func parseExcludeTags(raw []string) map[string]string { } return tags } + +func enrichK8sGPUMetrics(ctx context.Context, instances []models.GPUInstance, k8sOpts k8sprovider.ScanOptions, awsOpts awsprovider.ScanOptions) { + // Source 1: CloudWatch Container Insights + if len(instances) > 0 && instances[0].ClusterName != "" { + cfgOpts := []func(*awsconfig.LoadOptions) error{} + if awsOpts.Profile != "" { + cfgOpts = append(cfgOpts, awsconfig.WithSharedConfigProfile(awsOpts.Profile)) + } + cfg, err := awsconfig.LoadDefaultConfig(ctx, cfgOpts...) 
+ if err == nil { + region := instances[0].Region + if region == "" { + region = "us-east-1" + } + cfg.Region = region + cwClient := cloudwatch.NewFromConfig(cfg) + fmt.Fprintf(os.Stderr, " Enriching K8s GPU metrics via CloudWatch Container Insights...\n") + awsprovider.EnrichK8sGPUMetrics(ctx, cwClient, instances, instances[0].ClusterName, awsprovider.DefaultMetricWindow) + } + } + + // Source 2: DCGM exporter scrape + remaining := 0 + for _, inst := range instances { + if inst.Source == models.SourceK8sNode && inst.AvgGPUUtilization == nil { + remaining++ + } + } + if remaining > 0 { + client, _, err := k8sprovider.BuildClientPublic(k8sOpts.Kubeconfig, k8sOpts.Context) + if err == nil { + k8sprovider.EnrichDCGMMetrics(ctx, client, instances) + } + } + + // Source 3: Prometheus query + remaining = 0 + for _, inst := range instances { + if inst.Source == models.SourceK8sNode && inst.AvgGPUUtilization == nil { + remaining++ + } + } + if remaining > 0 && (k8sOpts.PromURL != "" || k8sOpts.PromEndpoint != "") { + var client k8sprovider.K8sClient + if k8sOpts.PromEndpoint != "" { + c, _, err := k8sprovider.BuildClientPublic(k8sOpts.Kubeconfig, k8sOpts.Context) + if err == nil { + client = c + } + } + promOpts := k8sprovider.PrometheusOptions{ + URL: k8sOpts.PromURL, + Endpoint: k8sOpts.PromEndpoint, + } + k8sprovider.EnrichPrometheusMetrics(ctx, client, instances, promOpts) + } +} diff --git a/internal/providers/k8s/scanner.go b/internal/providers/k8s/scanner.go index edea338..c35ef88 100644 --- a/internal/providers/k8s/scanner.go +++ b/internal/providers/k8s/scanner.go @@ -19,8 +19,10 @@ import ( // ScanOptions controls Kubernetes GPU scanning. type ScanOptions struct { - Kubeconfig string - Context string + Kubeconfig string + Context string + PromURL string + PromEndpoint string } // Scan discovers GPU nodes in Kubernetes clusters accessible via kubeconfig. 
@@ -47,6 +49,11 @@ func Scan(ctx context.Context, opts ScanOptions) ([]models.GPUInstance, error) { return instances, nil } +// BuildClientPublic builds a K8s client and returns the cluster name. +func BuildClientPublic(kubeconfigPath, contextName string) (K8sClient, string, error) { + return buildClient(kubeconfigPath, contextName) +} + func buildClient(kubeconfigPath, contextName string) (K8sClient, string, error) { loadingRules := clientcmd.NewDefaultClientConfigLoadingRules() if kubeconfigPath != "" { From c4dff65d58c1f7478d7c2ac34f31e4a2997416cf Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 23:09:03 +0100 Subject: [PATCH 09/12] Fix DCGM node matching and CW error spam MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DCGM enrichment matched pods to instances by InstanceID, but pod.Spec.NodeName is the K8s hostname (e.g. ip-10-22-1-100.ec2.internal) while InstanceID is the EC2 ID (i-0671...). Add K8sNodeName field to GPUInstance and use it for DCGM matching. Also stop retrying CW queries after the first error — all nodes will get the same AccessDenied when credentials aren't available. 
--- internal/models/models.go | 1 + internal/providers/aws/cloudwatch.go | 15 +++++++++++---- internal/providers/k8s/discover.go | 1 + internal/providers/k8s/metrics.go | 6 +++++- internal/providers/k8s/metrics_test.go | 12 ++++++------ 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/internal/models/models.go b/internal/models/models.go index 0fd6557..8e99dbd 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -66,6 +66,7 @@ type GPUInstance struct { // Kubernetes (populated for k8s-node source) ClusterName string `json:"cluster_name,omitempty"` + K8sNodeName string `json:"k8s_node_name,omitempty"` GPUAllocated int `json:"gpu_allocated,omitempty"` // State diff --git a/internal/providers/aws/cloudwatch.go b/internal/providers/aws/cloudwatch.go index b9d1978..ab06d3e 100644 --- a/internal/providers/aws/cloudwatch.go +++ b/internal/providers/aws/cloudwatch.go @@ -112,6 +112,7 @@ func EnrichK8sGPUMetrics(ctx context.Context, client CloudWatchClient, instances Value: aws.String(clusterName), } + enriched := 0 for _, node := range nodes { instanceDim := cwtypes.Dimension{ Name: aws.String("InstanceId"), @@ -127,12 +128,18 @@ func EnrichK8sGPUMetrics(ctx context.Context, client CloudWatchClient, instances results, err := fetchMetrics(ctx, client, queries, start, now) if err != nil { - fmt.Fprintf(os.Stderr, " warning: Container Insights metrics unavailable for %s: %v\n", node.instanceID, err) - continue + fmt.Fprintf(os.Stderr, " warning: Container Insights metrics unavailable: %v\n", err) + break } - instances[node.index].AvgGPUUtilization = results["gpu_util_"+safeID] - instances[node.index].AvgGPUMemUtilization = results["gpu_mem_"+safeID] + if results["gpu_util_"+safeID] != nil { + instances[node.index].AvgGPUUtilization = results["gpu_util_"+safeID] + instances[node.index].AvgGPUMemUtilization = results["gpu_mem_"+safeID] + enriched++ + } + } + if enriched > 0 { + fmt.Fprintf(os.Stderr, " CloudWatch: got GPU metrics for %d of %d 
nodes\n", enriched, len(nodes)) } } diff --git a/internal/providers/k8s/discover.go b/internal/providers/k8s/discover.go index 14fe00c..e3316c0 100644 --- a/internal/providers/k8s/discover.go +++ b/internal/providers/k8s/discover.go @@ -164,6 +164,7 @@ func nodeToGPUInstance(node corev1.Node, gpuPods []corev1.Pod, clusterName strin Name: fmt.Sprintf("%s/%s", clusterName, hostname), Tags: tags, ClusterName: clusterName, + K8sNodeName: node.Name, GPUAllocated: gpuAllocated, InstanceType: instanceType, GPUModel: gpuModel, diff --git a/internal/providers/k8s/metrics.go b/internal/providers/k8s/metrics.go index ef47470..5275347 100644 --- a/internal/providers/k8s/metrics.go +++ b/internal/providers/k8s/metrics.go @@ -33,7 +33,11 @@ func EnrichDCGMMetrics(ctx context.Context, client K8sClient, instances []models if inst.Source != models.SourceK8sNode || inst.AvgGPUUtilization != nil { continue } - needsMetrics[inst.InstanceID] = i + key := inst.K8sNodeName + if key == "" { + key = inst.InstanceID + } + needsMetrics[key] = i } if len(needsMetrics) == 0 { return 0 diff --git a/internal/providers/k8s/metrics_test.go b/internal/providers/k8s/metrics_test.go index 329d7eb..4d7e851 100644 --- a/internal/providers/k8s/metrics_test.go +++ b/internal/providers/k8s/metrics_test.go @@ -50,7 +50,7 @@ func TestEnrichDCGMMetrics_PopulatesUtilization(t *testing.T) { nodes: &corev1.NodeList{}, pods: &corev1.PodList{ Items: []corev1.Pod{ - dcgmPod("dcgm-exporter-abc", "gpu-operator", "i-node1"), + dcgmPod("dcgm-exporter-abc", "gpu-operator", "ip-10-22-1-100.ec2.internal"), }, }, proxyData: map[string][]byte{ @@ -58,7 +58,7 @@ func TestEnrichDCGMMetrics_PopulatesUtilization(t *testing.T) { }, } instances := []models.GPUInstance{ - {InstanceID: "i-node1", Source: models.SourceK8sNode, Name: "cluster/i-node1"}, + {InstanceID: "i-abc123", K8sNodeName: "ip-10-22-1-100.ec2.internal", Source: models.SourceK8sNode, Name: "cluster/ip-10-22-1-100"}, } enriched := 
EnrichDCGMMetrics(context.Background(), client, instances) @@ -86,7 +86,7 @@ func TestEnrichDCGMMetrics_SkipsAlreadyEnriched(t *testing.T) { nodes: &corev1.NodeList{}, pods: &corev1.PodList{ Items: []corev1.Pod{ - dcgmPod("dcgm-exporter-abc", "gpu-operator", "i-node1"), + dcgmPod("dcgm-exporter-abc", "gpu-operator", "node1"), }, }, proxyData: map[string][]byte{ @@ -94,7 +94,7 @@ func TestEnrichDCGMMetrics_SkipsAlreadyEnriched(t *testing.T) { }, } instances := []models.GPUInstance{ - {InstanceID: "i-node1", Source: models.SourceK8sNode, AvgGPUUtilization: &gpuUtil}, + {InstanceID: "i-abc123", K8sNodeName: "node1", Source: models.SourceK8sNode, AvgGPUUtilization: &gpuUtil}, } enriched := EnrichDCGMMetrics(context.Background(), client, instances) @@ -131,13 +131,13 @@ func TestEnrichDCGMMetrics_HandlesScrapeError(t *testing.T) { nodes: &corev1.NodeList{}, pods: &corev1.PodList{ Items: []corev1.Pod{ - dcgmPod("dcgm-exporter-abc", "gpu-operator", "i-node1"), + dcgmPod("dcgm-exporter-abc", "gpu-operator", "node1"), }, }, proxyErr: fmt.Errorf("connection refused"), } instances := []models.GPUInstance{ - {InstanceID: "i-node1", Source: models.SourceK8sNode}, + {InstanceID: "node1", Source: models.SourceK8sNode}, } enriched := EnrichDCGMMetrics(context.Background(), client, instances) From d89df5fd5d092ef7328c5ebc9f8e7cd186d8a34a Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 23:20:02 +0100 Subject: [PATCH 10/12] Fix DCGM scrape spam and Prometheus node name mismatch DCGM: stop spamming per-node warnings when scrapes fail consistently (likely RBAC). Log one warning, bail after 3 consecutive failures. Prometheus: use K8sNodeName (the actual K8s hostname) in the PromQL node=~ regex instead of InstanceID (EC2 ID). The Prometheus node label matches K8s hostnames, not EC2 instance IDs. 
--- internal/providers/k8s/metrics.go | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/internal/providers/k8s/metrics.go b/internal/providers/k8s/metrics.go index 5275347..180b8e1 100644 --- a/internal/providers/k8s/metrics.go +++ b/internal/providers/k8s/metrics.go @@ -56,6 +56,7 @@ func EnrichDCGMMetrics(ctx context.Context, client K8sClient, instances []models fmt.Fprintf(os.Stderr, " Probing DCGM exporter on GPU nodes...\n") enriched := 0 + scrapeErrors := 0 for _, pod := range dcgmPods { idx, ok := needsMetrics[pod.Spec.NodeName] if !ok { @@ -64,7 +65,14 @@ func EnrichDCGMMetrics(ctx context.Context, client K8sClient, instances []models data, err := client.ProxyGet(ctx, pod.Namespace, pod.Name, "9400", "/metrics") if err != nil { - fmt.Fprintf(os.Stderr, " warning: DCGM scrape failed for %s: %v\n", pod.Spec.NodeName, err) + scrapeErrors++ + if scrapeErrors == 1 { + fmt.Fprintf(os.Stderr, " warning: DCGM scrape failed: %v\n", err) + } + if scrapeErrors >= 3 { + fmt.Fprintf(os.Stderr, " warning: DCGM scrape failing consistently, skipping remaining nodes\n") + break + } continue } @@ -73,6 +81,7 @@ func EnrichDCGMMetrics(ctx context.Context, client K8sClient, instances []models instances[idx].AvgGPUUtilization = gpuUtil instances[idx].AvgGPUMemUtilization = memUtil enriched++ + scrapeErrors = 0 } } @@ -164,7 +173,11 @@ func EnrichPrometheusMetrics(ctx context.Context, client K8sClient, instances [] if inst.Source != models.SourceK8sNode || inst.AvgGPUUtilization != nil { continue } - nodes = append(nodes, nodeRef{index: i, name: inst.InstanceID}) + name := inst.K8sNodeName + if name == "" { + name = inst.InstanceID + } + nodes = append(nodes, nodeRef{index: i, name: name}) } if len(nodes) == 0 { return 0 From fa00dff8670cddd968fd9a78e23e734efbf9ae1e Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 23:26:23 +0100 Subject: [PATCH 11/12] Include time window in low GPU utilization recommendation text --- 
internal/analysis/rules.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/analysis/rules.go b/internal/analysis/rules.go index 8b03d7b..a676f7f 100644 --- a/internal/analysis/rules.go +++ b/internal/analysis/rules.go @@ -365,11 +365,11 @@ func ruleK8sLowGPUUtil(inst *models.GPUInstance) { Type: "low_utilization", Severity: models.SeverityCritical, Confidence: 0.85, - Evidence: fmt.Sprintf("K8s GPU node utilization averaging %.1f%%. GPUs are allocated but barely used.", *inst.AvgGPUUtilization), + Evidence: fmt.Sprintf("K8s GPU node utilization averaging %.1f%% over the past 7 days. GPUs are allocated but barely used.", *inst.AvgGPUUtilization), }) inst.Recommendations = append(inst.Recommendations, models.Recommendation{ Action: models.ActionDownsize, - Description: fmt.Sprintf("GPU utilization averaging %.1f%%. Consider bin-packing more workloads, downsizing, or removing from the node pool.", *inst.AvgGPUUtilization), + Description: fmt.Sprintf("GPU utilization averaging %.1f%% over the past 7 days. 
Consider bin-packing more workloads, downsizing, or removing from the node pool.", *inst.AvgGPUUtilization), CurrentMonthlyCost: inst.MonthlyCost, RecommendedMonthlyCost: inst.MonthlyCost * 0.2, MonthlySavings: inst.MonthlyCost * 0.8, From 51db9f4d96a68db29f8aae8bb34fdeb8f1341b7f Mon Sep 17 00:00:00 2001 From: Stas Maksimov Date: Sun, 19 Apr 2026 23:41:41 +0100 Subject: [PATCH 12/12] Skip CW enrichment when AWS creds unavailable, reduce DCGM noise --- cmd/gpuaudit/main.go | 10 ++++++---- internal/providers/k8s/metrics.go | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmd/gpuaudit/main.go b/cmd/gpuaudit/main.go index 2aca4b5..08b3e3c 100644 --- a/cmd/gpuaudit/main.go +++ b/cmd/gpuaudit/main.go @@ -104,8 +104,10 @@ func runScan(cmd *cobra.Command, args []string) error { opts.ExcludeTags = parseExcludeTags(scanExcludeTags) opts.MinUptimeDays = scanMinUptimeDays + awsAvailable := true result, err := awsprovider.Scan(ctx, opts) if err != nil { + awsAvailable = false if scanSkipK8s { return fmt.Errorf("scan failed: %w", err) } @@ -127,7 +129,7 @@ func runScan(cmd *cobra.Command, args []string) error { fmt.Fprintf(os.Stderr, " warning: Kubernetes scan failed: %v\n", err) } else if len(k8sInstances) > 0 { if !scanSkipMetrics { - enrichK8sGPUMetrics(ctx, k8sInstances, k8sOpts, opts) + enrichK8sGPUMetrics(ctx, k8sInstances, k8sOpts, opts, awsAvailable) } analysis.AnalyzeAll(k8sInstances) result.Instances = append(result.Instances, k8sInstances...) 
@@ -317,9 +319,9 @@ func parseExcludeTags(raw []string) map[string]string { return tags } -func enrichK8sGPUMetrics(ctx context.Context, instances []models.GPUInstance, k8sOpts k8sprovider.ScanOptions, awsOpts awsprovider.ScanOptions) { - // Source 1: CloudWatch Container Insights - if len(instances) > 0 && instances[0].ClusterName != "" { +func enrichK8sGPUMetrics(ctx context.Context, instances []models.GPUInstance, k8sOpts k8sprovider.ScanOptions, awsOpts awsprovider.ScanOptions, awsAvailable bool) { + // Source 1: CloudWatch Container Insights (skip if AWS creds unavailable) + if awsAvailable && len(instances) > 0 && instances[0].ClusterName != "" { cfgOpts := []func(*awsconfig.LoadOptions) error{} if awsOpts.Profile != "" { cfgOpts = append(cfgOpts, awsconfig.WithSharedConfigProfile(awsOpts.Profile)) diff --git a/internal/providers/k8s/metrics.go b/internal/providers/k8s/metrics.go index 180b8e1..4a587c2 100644 --- a/internal/providers/k8s/metrics.go +++ b/internal/providers/k8s/metrics.go @@ -85,7 +85,9 @@ func EnrichDCGMMetrics(ctx context.Context, client K8sClient, instances []models } } - fmt.Fprintf(os.Stderr, " DCGM: got GPU metrics for %d of %d remaining nodes\n", enriched, len(needsMetrics)) + if enriched > 0 { + fmt.Fprintf(os.Stderr, " DCGM: got GPU metrics for %d of %d remaining nodes\n", enriched, len(needsMetrics)) + } return enriched }