2 changes: 1 addition & 1 deletion README.md
@@ -102,7 +102,7 @@ WIP TESTS
| ✅ | Ground Truth: DCGM Metrics vs Allocations | node_gpu_count | /allocation | Query DCGM directly for expected GPU counts and IDs. Compare those to the results returned from allocations, ensuring the GPU IDs are all present, as well as the amounts. | Medium |
| ✅ | Data Quality: Non-zero GPU Costs | node_gpu_hourly_cost | /allocation | Query the API directly and ensure that there are >0 GPU costs for >0 allocations. | Low |
| [ ] | Exported Cloud Costs vs Ground Truth | N/A - creating these is in project scope | /cloudCost | Configure a Prometheus exporter for cloud costs. With cloud costs enabled, query the upstream for cloud costs, then query /cloudCost on the OpenCost install and verify the returned results against the expected items. | High |
| [ ] | Ensure no pod restarts on OpenCost | kube_pod_container_status_running | N/A | Use a Kubernetes client to ensure the OpenCost pod has 0 restarts and no errors in the logs. Also, query Prometheus to ensure the OpenCost pod never left the running state after its initial boot. | Low |
| | Ensure no pod restarts on OpenCost | kube_pod_container_status_running | N/A | Use a Kubernetes client to ensure the OpenCost pod has 0 restarts and no errors in the logs. Also, query Prometheus to ensure the OpenCost pod never left the running state after its initial boot. | Low |
| [ ] | Implement Dev Stack for OpenCost | N/A | N/A | Deploy a dev stack for OpenCost and run the integration tests against it on a loop. Images that pass testing should be promoted to the dev environment. | Medium |
| [ ] | Implement Chaos Testing on OpenCost Dev Stack | N/A | N/A | Deploy a chaos monkey to the OpenCost dev stack. Integration tests should be updated to allow for temporary outages when the monkey kills a pod, but should expect both Prometheus-backed and promless data to remain intact. | Medium |
| [ ] | Log Inspector Test | N/A | N/A | Implement a test that scans OpenCost logs inside Kubernetes. If there is a panic or an ERROR log, fail the test. | Medium |
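The Prometheus-based test added in this PR covers the "never left the running state" half of the "Ensure no pod restarts" row; the Kubernetes-client half could look roughly like the sketch below. This is a minimal sketch only: the `opencost` namespace, the `app.kubernetes.io/name=opencost` label selector, the kubeconfig path, and the test/package names are assumptions, not part of this change.

```go
package integration

import (
	"context"
	"testing"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// Sketch: fail if any OpenCost container reports a non-zero restart count.
func TestOpenCostHasNoRestarts(t *testing.T) {
	// Assumed kubeconfig location; an in-cluster config would also work.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		t.Fatalf("building kubeconfig: %v", err)
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		t.Fatalf("creating client: %v", err)
	}

	// Assumed namespace and label selector for the OpenCost install.
	pods, err := clientset.CoreV1().Pods("opencost").List(context.Background(), metav1.ListOptions{
		LabelSelector: "app.kubernetes.io/name=opencost",
	})
	if err != nil {
		t.Fatalf("listing pods: %v", err)
	}

	for _, pod := range pods.Items {
		for _, cs := range pod.Status.ContainerStatuses {
			if cs.RestartCount != 0 {
				t.Errorf("%s/%s container %s restarted %d times", pod.Namespace, pod.Name, cs.Name, cs.RestartCount)
			}
		}
	}
}
```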
3 changes: 3 additions & 0 deletions pkg/prometheus/client.go
@@ -63,6 +63,7 @@ type Metric struct {
Pod string `json:"pod"`
Namespace string `json:"namespace"`
Container string `json:"container"`
UID string `json:"uid"`

Node string `json:"node"`
Instance string `json:"instance"`
@@ -125,6 +126,8 @@ func (m *Metric) UnmarshalJSON(data []byte) error {
m.Namespace = strVal
case "container":
m.Container = strVal
case "uid":
m.UID = strVal
case "node":
m.Node = strVal
case "instance":
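With the `uid` label now mapped onto `Metric`, results aggregated by `(uid, pod, namespace)` keep the pod UID instead of dropping it. A minimal sketch of the effect, assuming `Metric.UnmarshalJSON` accepts a label object like the one handled by the switch above (the label values below are made up):

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/opencost/opencost-integration-tests/pkg/prometheus"
)

func main() {
	// Hypothetical label set returned by a query aggregated by (uid, pod, namespace).
	raw := []byte(`{"uid": "7c9f2d1e-0000-0000-0000-000000000000", "pod": "opencost-abc123", "namespace": "opencost"}`)

	var m prometheus.Metric
	if err := json.Unmarshal(raw, &m); err != nil {
		panic(err)
	}

	// With the change above, the uid label lands on m.UID instead of being discarded.
	fmt.Println(m.Namespace, m.Pod, m.UID)
}
```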
110 changes: 110 additions & 0 deletions test/integration/prometheus/no_pod_restarts_test.go
@@ -0,0 +1,110 @@
package prometheus

// Description - Checks for Pod Restarts

import (
"github.com/opencost/opencost-integration-tests/pkg/prometheus"
"testing"
"time"
)

const Resolution = "1m"
const Tolerance = 0.07
const NegligibleUsage = 0.01

func TestNoPodRestart(t *testing.T) {

testCases := []struct {
name string
window string
aggregate string
accumulate string
}{
{
name: "Yesterday",
window: "24h",
aggregate: "namespace",
accumulate: "false",
},
{
name: "Last 7 days",
window: "168h",
aggregate: "namespace",
accumulate: "false",
},
}

t.Logf("testCases: %v", testCases)

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {

// Evaluate the query at the end of the current hour (UTC)
queryEnd := time.Now().UTC().Truncate(time.Hour).Add(time.Hour)

endTime := queryEnd.Unix()

client := prometheus.NewClient()
// Query all running-pod series, aggregated by uid, pod, and namespace:
// avg(kube_pod_container_status_running{} != 0) by (uid, pod, namespace)[<window>:1m]
// Note: the != 0 filter drops samples where the container is not running; it may not be strictly necessary for this check.
promPodInfoInput := prometheus.PrometheusInput{}

promPodInfoInput.Metric = "kube_pod_container_status_running"
promPodInfoInput.MetricNotEqualTo = "0"
promPodInfoInput.AggregateBy = []string{"uid", "pod", "namespace"}
promPodInfoInput.Function = []string{"avg"}
promPodInfoInput.AggregateWindow = tc.window
promPodInfoInput.AggregateResolution = Resolution
promPodInfoInput.Time = &endTime

podInfo, err := client.RunPromQLQuery(promPodInfoInput)
if err != nil {
t.Fatalf("Error while calling Prometheus API %v", err)
}

type PodKey struct {
Namespace string
Pod string
}

// Count the distinct UIDs seen for each namespace/pod name; more than one indicates the pod was recreated
podMap := make(map[PodKey]int)

for _, podInfoResponseItem := range podInfo.Data.Result {
pod := podInfoResponseItem.Metric.Pod
namespace := podInfoResponseItem.Metric.Namespace
uid := podInfoResponseItem.Metric.UID

if uid == "" {
continue
}

podKey := PodKey{
Namespace: namespace,
Pod: pod,
}

t.Logf("%v", podKey)
podMap[podKey]++
}

// Fail if any namespace/pod name was seen with more than one UID in the window
for pod, count := range podMap {
if count > 1 {
t.Errorf("[Fail] %v: Pod Restarted. %v Duplicates Found.", pod, count)
} else {
t.Logf("[Pass] %v", pod)
}
}
})
}
}
9 changes: 8 additions & 1 deletion test/integration/prometheus/test.bats
@@ -78,10 +78,17 @@ teardown() {
# ------------------------------------------------------



# ------------------------------------------------------
# Node Costs
@test "prometheus: Node Hourly Cost" {
go test ./test/integration/prometheus/node_costs_test.go
}
# ------------------------------------------------------


# ------------------------------------------------------
# Miscellaneous
@test "prometheus: No Pod Restarts" {
go test ./test/integration/prometheus/no_pod_restarts_test.go
}
# ------------------------------------------------------