diff --git a/README.md b/README.md
index 67be318..341c905 100644
--- a/README.md
+++ b/README.md
@@ -102,7 +102,7 @@ WIP TESTS
 | ✅ | Ground Truth: DCGM Metrics vs allocations | node_gpu_count | /allocation | Query DCGM directly for expected GPUs counts and IDs. Compare those to the results returned from allocations. ensure the GPU ids are all present, as well as the amounts | Medium |
 | ✅ | Data Quality: Non-zero GPU Costs | node_gpu_hourly_cost | /allocation | Query the API directly and ensure that there are >0 GPU costs for >0 allocations | Low |
 | [ ] | Exported Cloud Costs Vs Ground Truth | N/A - creating these is in project scope | /cloudCost | Configure a prometheus exporter for cloud costs. with cloud costs enabled, query the upstream for cloud costs. then, query /cloudCosts on the OpenCost install. Verify returned results against the expected items. | High |
-| [ ] | Ensure no pod restarts on OpenCost | kube_pod_container_status_running | N/A | use a kubernetes client to ensure the OpenCost pod has 0 restarts and no errors in the logs. also, query prometheus to ensure the opencost pod never left the running state after its initial boot | Low |
+| ✅ | Ensure no pod restarts on OpenCost | kube_pod_container_status_running | N/A | use a kubernetes client to ensure the OpenCost pod has 0 restarts and no errors in the logs. also, query prometheus to ensure the opencost pod never left the running state after its initial boot | Low |
 | [ ] | Implement Dev Stack for OpenCost | N/A | N/A | Deploy a dev stack for opencost. the integration tests should be run on a loop on that. Images that pass testing should get promoted to the dev env. | Medium |
 | [ ] | Implement Chaos testing on OpenCost Dev Stack | N/A| N/A | Deploy a chaos monkey to the opencost dev stack. Integration tests should be updated to allow for temporary outages when monkey killed a pod, but should expect both prom backed and promless data to be intact.| Medium |
 | [ ] | Log Inspector Test | N/A| N/A | Implement a test that scans opencost logs inside k8s. If there is a panic or an ERROR log, fail the test | Medium |
\ No newline at end of file
diff --git a/pkg/prometheus/client.go b/pkg/prometheus/client.go
index 8fa55ec..d0bdc39 100644
--- a/pkg/prometheus/client.go
+++ b/pkg/prometheus/client.go
@@ -63,6 +63,7 @@ type Metric struct {
 	Pod       string `json:"pod"`
 	Namespace string `json:"namespace"`
 	Container string `json:"container"`
+	UID       string `json:"uid"`
 	Node      string `json:"node"`
 	Instance  string `json:"instance"`
 
@@ -125,6 +126,8 @@ func (m *Metric) UnmarshalJSON(data []byte) error {
 			m.Namespace = strVal
 		case "container":
 			m.Container = strVal
+		case "uid":
+			m.UID = strVal
 		case "node":
 			m.Node = strVal
 		case "instance":
diff --git a/test/integration/prometheus/no_pod_restarts_test.go b/test/integration/prometheus/no_pod_restarts_test.go
new file mode 100644
index 0000000..2f2e68e
--- /dev/null
+++ b/test/integration/prometheus/no_pod_restarts_test.go
@@ -0,0 +1,104 @@
+package prometheus
+
+// Description - Checks for Pod Restarts
+//
+// Counts the result series Prometheus returns per (namespace, pod) when
+// kube_pod_container_status_running is grouped by uid, and fails when a pod
+// name shows up under more than one uid inside the window.
+//
+// NOTE(review): a container that restarts in place keeps its pod uid, so this
+// check presumably cannot see it; kube_pod_container_status_restarts_total may
+// be the better signal - confirm.
+
+import (
+	"sort"
+	"testing"
+	"time"
+
+	"github.com/opencost/opencost-integration-tests/pkg/prometheus"
+)
+
+const Resolution = "1m"
+const Tolerance = 0.07
+const NegligibleUsage = 0.01
+
+func TestNoPodRestart(t *testing.T) {
+	testCases := []struct {
+		name   string
+		window string
+	}{
+		{name: "Yesterday", window: "24h"},
+		{name: "Last 7 days", window: "168h"},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// End the query window at the top of the next hour (UTC).
+			queryEnd := time.Now().UTC().Truncate(time.Hour).Add(time.Hour)
+			endTime := queryEnd.Unix()
+
+			client := prometheus.NewClient()
+
+			// Query all running pod information. Intended query shape:
+			//   avg(kube_pod_container_status_running{} != 0)
+			//     by (uid, pod, namespace)[<window>:<resolution>]
+			// NOTE(review): the original author asked whether "!= 0" is
+			// needed here - confirm before dropping it.
+			promPodInfoInput := prometheus.PrometheusInput{}
+			promPodInfoInput.Metric = "kube_pod_container_status_running"
+			promPodInfoInput.MetricNotEqualTo = "0"
+			promPodInfoInput.AggregateBy = []string{"uid", "pod", "namespace"}
+			promPodInfoInput.Function = []string{"avg"}
+			promPodInfoInput.AggregateWindow = tc.window
+			promPodInfoInput.AggregateResolution = Resolution
+			promPodInfoInput.Time = &endTime
+
+			podInfo, err := client.RunPromQLQuery(promPodInfoInput)
+			if err != nil {
+				t.Fatalf("Error while calling Prometheus API %v", err)
+			}
+
+			type PodKey struct {
+				Namespace string
+				Pod       string
+			}
+
+			// Number of result series per (namespace, pod); with the
+			// grouping above each series presumably maps to one pod uid.
+			podMap := make(map[PodKey]int)
+			for _, podInfoResponseItem := range podInfo.Data.Result {
+				if podInfoResponseItem.Metric.UID == "" {
+					continue
+				}
+
+				podKey := PodKey{
+					Namespace: podInfoResponseItem.Metric.Namespace,
+					Pod:       podInfoResponseItem.Metric.Pod,
+				}
+				// A missing key reads as zero, so no existence check is needed.
+				podMap[podKey]++
+			}
+
+			// Map iteration order is random; sort the keys so the report
+			// comes out in the same order on every run.
+			podKeys := make([]PodKey, 0, len(podMap))
+			for podKey := range podMap {
+				podKeys = append(podKeys, podKey)
+			}
+			sort.Slice(podKeys, func(i, j int) bool {
+				if podKeys[i].Namespace != podKeys[j].Namespace {
+					return podKeys[i].Namespace < podKeys[j].Namespace
+				}
+				return podKeys[i].Pod < podKeys[j].Pod
+			})
+
+			for _, podKey := range podKeys {
+				if count := podMap[podKey]; count > 1 {
+					t.Errorf("[Fail] %v: Pod Restarted. %v Duplicates Found.", podKey, count)
+				} else {
+					t.Logf("[Pass] %v", podKey)
+				}
+			}
+		})
+	}
+}
diff --git a/test/integration/prometheus/test.bats b/test/integration/prometheus/test.bats
index 2b6c5bf..acbe932 100644
--- a/test/integration/prometheus/test.bats
+++ b/test/integration/prometheus/test.bats
@@ -78,10 +78,17 @@ teardown() {
 # ------------------------------------------------------
 
 
-
 # ------------------------------------------------------
 # Node Costs
 @test "prometheus: Node Hourly Cost" {
     go test ./test/integration/prometheus/node_costs_test.go
 }
+# ------------------------------------------------------
+
+
+# ------------------------------------------------------
+# Miscellaneous
+@test "prometheus: No Pod Restarts" {
+    go test ./test/integration/prometheus/no_pod_restarts_test.go
+}
 # ------------------------------------------------------
\ No newline at end of file