2 changes: 1 addition & 1 deletion README.md
@@ -102,7 +102,7 @@ WIP TESTS
| ✅ | Ground Truth: DCGM Metrics vs Allocations | node_gpu_count | /allocation | Query DCGM directly for expected GPU counts and IDs. Compare those to the results returned from allocations, ensuring the GPU IDs are all present, as well as the amounts. | Medium |
| ✅ | Data Quality: Non-zero GPU Costs | node_gpu_hourly_cost | /allocation | Query the API directly and ensure that there are >0 GPU costs for >0 allocations. | Low |
| [ ] | Exported Cloud Costs vs Ground Truth | N/A - creating these is in project scope | /cloudCost | Configure a Prometheus exporter for cloud costs. With cloud costs enabled, query the upstream for cloud costs, then query /cloudCost on the OpenCost install and verify the returned results against the expected items. | High |
| [ ] | Ensure no pod restarts on OpenCost | kube_pod_container_status_running | N/A | Use a Kubernetes client to ensure the OpenCost pod has 0 restarts and no errors in the logs. Also, query Prometheus to ensure the OpenCost pod never left the running state after its initial boot. | Low |
| | Ensure no pod restarts on OpenCost | kube_pod_container_status_running | N/A | Use a Kubernetes client to ensure the OpenCost pod has 0 restarts and no errors in the logs. Also, query Prometheus to ensure the OpenCost pod never left the running state after its initial boot. | Low |
| [ ] | Implement Dev Stack for OpenCost | N/A | N/A | Deploy a dev stack for OpenCost and run the integration tests against it on a loop. Images that pass testing should be promoted to the dev environment. | Medium |
| [ ] | Implement Chaos Testing on OpenCost Dev Stack | N/A | N/A | Deploy a chaos monkey to the OpenCost dev stack. Integration tests should be updated to allow for temporary outages when the monkey kills a pod, but should expect both Prometheus-backed and promless data to remain intact. | Medium |
| [ ] | Log Inspector Test | N/A | N/A | Implement a test that scans OpenCost logs inside Kubernetes. If there is a panic or an ERROR log, fail the test. | Medium |
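The Prometheus-based test added in this PR covers the "never left the running state" half of the "Ensure no pod restarts" row; the Kubernetes-client half could look roughly like the sketch below. This is a minimal sketch only: the `opencost` namespace, the `app.kubernetes.io/name=opencost` label selector, the kubeconfig path, and the test/package names are assumptions, not part of this change.

```go
package integration

import (
	"context"
	"testing"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// Sketch: fail if any OpenCost container reports a non-zero restart count.
func TestOpenCostHasNoRestarts(t *testing.T) {
	// Assumed kubeconfig location; an in-cluster config would also work.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		t.Fatalf("building kubeconfig: %v", err)
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		t.Fatalf("creating client: %v", err)
	}

	// Assumed namespace and label selector for the OpenCost install.
	pods, err := clientset.CoreV1().Pods("opencost").List(context.Background(), metav1.ListOptions{
		LabelSelector: "app.kubernetes.io/name=opencost",
	})
	if err != nil {
		t.Fatalf("listing pods: %v", err)
	}

	for _, pod := range pods.Items {
		for _, cs := range pod.Status.ContainerStatuses {
			if cs.RestartCount != 0 {
				t.Errorf("%s/%s container %s restarted %d times", pod.Namespace, pod.Name, cs.Name, cs.RestartCount)
			}
		}
	}
}
```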
3 changes: 3 additions & 0 deletions pkg/prometheus/client.go
@@ -63,6 +63,7 @@ type Metric struct {
Pod string `json:"pod"`
Namespace string `json:"namespace"`
Container string `json:"container"`
UID string `json:"uid"`

Node string `json:"node"`
Instance string `json:"instance"`
@@ -125,6 +126,8 @@ func (m *Metric) UnmarshalJSON(data []byte) error {
m.Namespace = strVal
case "container":
m.Container = strVal
case "uid":
m.UID = strVal
case "node":
m.Node = strVal
case "instance":
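With the `uid` label now mapped onto `Metric`, results aggregated by `(uid, pod, namespace)` keep the pod UID instead of dropping it. A minimal sketch of the effect, assuming `Metric.UnmarshalJSON` accepts a label object like the one handled by the switch above (the label values below are made up):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/opencost/opencost-integration-tests/pkg/prometheus"
)

func main() {
	// Hypothetical label set returned by a query aggregated by (uid, pod, namespace).
	raw := []byte(`{"uid": "7c9f2d1e-0000-0000-0000-000000000000", "pod": "opencost-abc123", "namespace": "opencost"}`)

	var m prometheus.Metric
	if err := json.Unmarshal(raw, &m); err != nil {
		panic(err)
	}

	// With the change above, the uid label lands on m.UID instead of being discarded.
	fmt.Println(m.Namespace, m.Pod, m.UID)
}
```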
110 changes: 110 additions & 0 deletions test/integration/prometheus/no_pod_restarts_test.go
@@ -0,0 +1,110 @@
package prometheus

// Description - Checks for Pod Restarts

import (
"github.com/opencost/opencost-integration-tests/pkg/prometheus"
"testing"
"time"
)

const Resolution = "1m"
const Tolerance = 0.07
const NegligibleUsage = 0.01

func TestNoPodRestart(t *testing.T) {

testCases := []struct {
name string
window string
aggregate string
accumulate string
}{
{
name: "Yesterday",
window: "24h",
aggregate: "namespace",
accumulate: "false",
},
{
name: "Last 7 days",
window: "168h",
aggregate: "namespace",
accumulate: "false",
},
}

t.Logf("testCases: %v", testCases)

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {

// Evaluate the query at the end of the current hour (UTC)
queryEnd := time.Now().UTC().Truncate(time.Hour).Add(time.Hour)

endTime := queryEnd.Unix()

client := prometheus.NewClient()
// Query all running-pod series, aggregated by uid, pod, and namespace:
// avg(kube_pod_container_status_running{} != 0) by (uid, pod, namespace)[<window>:1m]
// Note: the != 0 filter drops samples where the container is not running; it may not be strictly necessary for this check.
promPodInfoInput := prometheus.PrometheusInput{}

promPodInfoInput.Metric = "kube_pod_container_status_running"
promPodInfoInput.MetricNotEqualTo = "0"
promPodInfoInput.AggregateBy = []string{"uid", "pod", "namespace"}
promPodInfoInput.Function = []string{"avg"}
promPodInfoInput.AggregateWindow = tc.window
promPodInfoInput.AggregateResolution = Resolution
promPodInfoInput.Time = &endTime

podInfo, err := client.RunPromQLQuery(promPodInfoInput)
if err != nil {
t.Fatalf("Error while calling Prometheus API %v", err)
}

type PodKey struct {
Namespace string
Pod string
}

// Count the distinct UIDs seen for each namespace/pod name; more than one indicates the pod was recreated
podMap := make(map[PodKey]int)

for _, podInfoResponseItem := range podInfo.Data.Result {
pod := podInfoResponseItem.Metric.Pod
namespace := podInfoResponseItem.Metric.Namespace
uid := podInfoResponseItem.Metric.UID

if uid == "" {
continue
}

podKey := PodKey{
Namespace: namespace,
Pod: pod,
}

t.Logf("%v", podKey)
podMap[podKey]++
}

// Fail if any namespace/pod name was seen with more than one UID in the window
for pod, count := range podMap {
if count > 1 {
t.Errorf("[Fail] %v: Pod Restarted. %v Duplicates Found.", pod, count)
} else {
t.Logf("[Pass] %v", pod)
}
}
})
}
}
9 changes: 8 additions & 1 deletion test/integration/prometheus/test.bats
@@ -78,10 +78,17 @@ teardown() {
# ------------------------------------------------------



# ------------------------------------------------------
# Node Costs
@test "prometheus: Node Hourly Cost" {
go test ./test/integration/prometheus/node_costs_test.go
}
# ------------------------------------------------------


# ------------------------------------------------------
# Miscellaneous
@test "prometheus: No Pod Restarts" {
go test ./test/integration/prometheus/no_pod_restarts_test.go
}
# ------------------------------------------------------