diff --git a/README.md b/README.md index 78ffe73..ce617e8 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,8 @@ kube_pod_status_phase{namespace="default", pod="nginx", phase="Failed"} 0 We adopt the same pattern for controller Conditions, but we export only one time series per (status, reason) variant, meaning we delete all other variants in the group when we set the metric, ensuring the cardinality stays under control. +Additionally, rather than returning 1/0 to indicate whether the series is active, we set the last transition time of the +condition as the value (Unix timestamp). Example metric: @@ -146,12 +148,13 @@ operator_controller_condition{ condition="Ready", status="False", reason="FailedToProvision" -} 1 +} 1759174210 ``` - **Index**: controller, resource_kind, resource_name, resource_namespace - **Group**: condition - **Extra**: status, reason +- **Metric Value**: Unix timestamp of the last transition of the given condition ### Initialization @@ -223,10 +226,12 @@ const ( ) // SetStatusCondition utility function which replaces and wraps meta.SetStatusCondition calls -func (r *MyReconciler) SetStatusCondition(cr *v1.MyCR, condition metav1.Condition) bool { - changed := meta.SetStatusCondition(&cr.Status.Conditions, condition) +func (r *MyReconciler) SetStatusCondition(cr *v1.MyCR, cond metav1.Condition) bool { + changed := meta.SetStatusCondition(&cr.Status.Conditions, cond) if changed { - r.Recorder.RecordConditionFor(kind, cr, condition.Type, string(condition.Status), condition.Reason) + r.Recorder.RecordConditionFor( + kind, cr, cond.Type, string(cond.Status), cond.Reason, cond.LastTransitionTime.Time, + ) } return changed } diff --git a/pkg/operator_condition_metrics/operator_condition_metrics.go b/pkg/operator_condition_metrics/operator_condition_metrics.go index 00f820b..c017d1b 100644 --- a/pkg/operator_condition_metrics/operator_condition_metrics.go +++ b/pkg/operator_condition_metrics/operator_condition_metrics.go @@ -1,6 +1,8 @@ package 
operator_condition_metrics import ( + "time" + metrics "github.com/sourcehawk/go-prometheus-gaugevecset/pkg/gauge_vec_set" ) @@ -15,7 +17,8 @@ and marking exactly one as active (1) while the others are inactive (0). Example kube_pod_status_phase{namespace="default", pod="nginx", phase="Failed"} 0 We adopt the same pattern for controller Conditions, but we export one time series per (status, reason) variant -and enforce **exclusivity per condition**. +and enforce **exclusivity per condition**. The value of the metric is the Unix timestamp of the condition's +last transition time. For any given (controller, kind, name, namespace, condition) exactly one (status, reason) series is present at a time. All other variants are **deleted**. This keeps cardinality under control. @@ -33,7 +36,7 @@ Labels (order matches registration) - reason: short machine-typed reason (often "" when status="True") Value - - Always 1 for the single active (status, reason) series in the group. + - The Unix timestamp of the condition's last transition time. Examples: @@ -47,7 +50,7 @@ Examples: condition="Ready", status="True", reason="" - } 1 + } 1759174202 (Other status/reason variants for this condition are removed.) @@ -60,7 +63,7 @@ Examples: condition="Ready", status="False", reason="Failed" - } 1 + } 1759174205 3. Another condition can be active simultaneously (different group): @@ -69,7 +72,7 @@ Examples: condition="Synchronized", status="True", reason="" - } 1 + } 1759174210 Cleanup When the resource is deleted/pruned, all series for its index key @@ -157,7 +160,9 @@ type ConditionMetricRecorder struct { // RecordConditionFor sets a condition metric for a given controller and object. // // It enforces exclusivity within the same (controller, name, namespace, condition) group, -// ensuring that only the latest status (True/False/Unknown) is present for a given condition type. +// ensuring that only the latest (status, reason) is present for a given condition type. 
+// +// If the lastTransitionTime is zero, the value of the metric is set to the unix timestamp for time.Now().UTC() // // The following label values are set: // @@ -171,15 +176,20 @@ type ConditionMetricRecorder struct { // // Example: // -// r.RecordConditionFor(kind, obj, "Ready", "True", "AppReady") +// r.RecordConditionFor(kind, obj, "Ready", "True", "AppReady", lastTransitionTime) func (r *ConditionMetricRecorder) RecordConditionFor( - kind string, object ObjectLike, conditionType, conditionStatus, conditionReason string, + kind string, object ObjectLike, + conditionType, conditionStatus, conditionReason string, lastTransitionTime time.Time, ) { indexValues := []string{r.Controller, kind, object.GetName(), object.GetNamespace()} groupValues := []string{conditionType} extraValues := []string{conditionStatus, conditionReason} - r.OperatorConditionsGauge.SetGroup(1, indexValues, groupValues, extraValues...) + if lastTransitionTime.IsZero() { + lastTransitionTime = time.Now().UTC() + } + + r.OperatorConditionsGauge.SetGroup(float64(lastTransitionTime.Unix()), indexValues, groupValues, extraValues...) } // RemoveConditionsFor deletes all condition metrics for a given resource. 
diff --git a/pkg/operator_condition_metrics/operator_condition_metrics_benchmark_test.go b/pkg/operator_condition_metrics/operator_condition_metrics_benchmark_test.go index e64b92c..91344f0 100644 --- a/pkg/operator_condition_metrics/operator_condition_metrics_benchmark_test.go +++ b/pkg/operator_condition_metrics/operator_condition_metrics_benchmark_test.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "testing" + "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/expfmt" @@ -65,6 +66,7 @@ func createBenchmarkScenario(tb testing.TB, registry *prometheus.Registry) *Cond } obj := &FakeObject{} + transitionTime := time.Now().UTC() condition := &FakeCondition{ Status: "True", // doesn't matter, cardinality decided by Reason @@ -82,7 +84,7 @@ func createBenchmarkScenario(tb testing.TB, registry *prometheus.Registry) *Cond for v := 0; v < variantsPerCondition; v++ { condition.Reason = generatedName("variant", v) - rec.RecordConditionFor(kind, obj, condition.Type, condition.Reason, condition.Reason) + rec.RecordConditionFor(kind, obj, condition.Type, condition.Reason, condition.Reason, transitionTime) } } } @@ -106,6 +108,7 @@ func Benchmark_ConditionMetricsRecorder_TimePerCall(b *testing.B) { Name: "Resource0", Namespace: "namespace0", } + transitionTime := time.Now().UTC() // Two variants in the same (controller,kind,name,namespace,condition) group. 
condTrue := &FakeCondition{ @@ -126,9 +129,9 @@ func Benchmark_ConditionMetricsRecorder_TimePerCall(b *testing.B) { for i := 0; i < b.N; i++ { // Flip between two variants if (i & 1) == 0 { - rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason) + rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason, transitionTime) } else { - rec.RecordConditionFor(kind, obj, condFalse.Type, condFalse.Status, condFalse.Reason) + rec.RecordConditionFor(kind, obj, condFalse.Type, condFalse.Status, condFalse.Reason, transitionTime) } } }) @@ -140,7 +143,7 @@ func Benchmark_ConditionMetricsRecorder_TimePerCall(b *testing.B) { for i := 0; i < b.N; i++ { // Ensure there is something to remove, but do not count the set time. b.StopTimer() - rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason) + rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason, transitionTime) b.StartTimer() rec.RemoveConditionsFor(kind, obj) diff --git a/pkg/operator_condition_metrics/operator_condition_metrics_test.go b/pkg/operator_condition_metrics/operator_condition_metrics_test.go index 48303e7..8273ac2 100644 --- a/pkg/operator_condition_metrics/operator_condition_metrics_test.go +++ b/pkg/operator_condition_metrics/operator_condition_metrics_test.go @@ -3,6 +3,7 @@ package operator_condition_metrics import ( "strings" "testing" + "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" @@ -30,23 +31,25 @@ func TestConditionMetricRecorder_Record_Transition_And_SecondCondition(t *testin kind := "MyCRD" name := "cr-1" ns := "prod" + transitionTime := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC) + obj := makeObj(name, ns) // Record Ready=True - rec.RecordConditionFor(kind, obj, "Ready", "True", "") + rec.RecordConditionFor(kind, obj, "Ready", "True", "", transitionTime) // Flip Ready -> False with reason - 
rec.RecordConditionFor(kind, obj, "Ready", "False", "Failed") + rec.RecordConditionFor(kind, obj, "Ready", "False", "Failed", transitionTime) // Another condition Synchronized=True (independent group) - rec.RecordConditionFor(kind, obj, "Synchronized", "True", "") + rec.RecordConditionFor(kind, obj, "Synchronized", "True", "", transitionTime) // Expect: Ready False(reason)=1, Synchronized True=1 want := ` # HELP test_record_transition_and_second_condition_controller_condition Condition status for a custom resource; one active (status,reason) time series per (controller,kind,name,namespace,condition). # TYPE test_record_transition_and_second_condition_controller_condition gauge -test_record_transition_and_second_condition_controller_condition{condition="Ready",controller="my-controller",reason="Failed",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="False"} 1 -test_record_transition_and_second_condition_controller_condition{condition="Synchronized",controller="my-controller",reason="",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="True",} 1 +test_record_transition_and_second_condition_controller_condition{condition="Ready",controller="my-controller",reason="Failed",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="False"} 1735689600 +test_record_transition_and_second_condition_controller_condition{condition="Synchronized",controller="my-controller",reason="",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="True",} 1735689600 ` require.NoError(t, testutil.GatherAndCompare( @@ -72,10 +75,11 @@ func TestConditionMetricRecorder_RemoveConditionsFor(t *testing.T) { kind := "MyCRD" name := "cr-2" ns := "staging" + transitionTime := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC) obj := makeObj(name, ns) - rec.RecordConditionFor(kind, obj, "Ready", "True", "") - rec.RecordConditionFor(kind, obj, "Synchronized", "False", "SyncPending") + 
rec.RecordConditionFor(kind, obj, "Ready", "True", "", transitionTime) + rec.RecordConditionFor(kind, obj, "Synchronized", "False", "SyncPending", transitionTime) // Remove all condition series for this object removed := rec.RemoveConditionsFor(kind, obj) @@ -103,16 +107,17 @@ func TestConditionMetricRecorder_SetsKindLabelFromObject(t *testing.T) { kind := "FancyKind" name := "obj-1" ns := "ns-1" + transitionTime := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC) obj := makeObj(name, ns) // Record a condition - rec.RecordConditionFor(kind, obj, "Ready", "True", "") + rec.RecordConditionFor(kind, obj, "Ready", "True", "", transitionTime) // Expect the 'kind' label to reflect the object's Kind want := ` # HELP test_sets_kind_label_from_object_controller_condition Condition status for a custom resource; one active (status,reason) time series per (controller,kind,name,namespace,condition). # TYPE test_sets_kind_label_from_object_controller_condition gauge -test_sets_kind_label_from_object_controller_condition{condition="Ready",controller="my-controller",reason="",resource_kind="FancyKind",resource_name="obj-1",resource_namespace="ns-1",status="True"} 1 +test_sets_kind_label_from_object_controller_condition{condition="Ready",controller="my-controller",reason="",resource_kind="FancyKind",resource_name="obj-1",resource_namespace="ns-1",status="True"} 1735689600 ` require.NoError(t, testutil.GatherAndCompare(