diff --git a/.gitignore b/.gitignore index 86733f8..ed20272 100644 --- a/.gitignore +++ b/.gitignore @@ -28,5 +28,5 @@ go.work.sum .env # Editor/IDE - .idea/ - .vscode/ +.idea/ +.vscode/ diff --git a/README.md b/README.md index a466b54..ae7c132 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,6 @@ A flexible, memory efficient Prometheus `GaugeVec` wrapper for managing **sets** of metrics. ---- - -## GaugeVecSet - The `GaugeVecSet` is a high-performance wrapper around Prometheus `GaugeVec` that enables bulk operations on series by specified index and grouping labels. @@ -105,162 +101,13 @@ deleted := PodPhase.DeleteByIndex("prod") ### GaugeVecSet: DeleteByGroup -Delete all series that match the given (index, group) +Delete all series that match the given (index, group). The number of index and group values this method requires +coincides with the number of values the gauge was initialized with, meaning you cannot specify partial values for +deletion. ```go deleted := PodPhase.DeleteByGroup( []string{"prod"}, // index "nginx-6f4c", // group ) -``` - -## ConditionMetricsRecorder - -The `ConditionMetricsRecorder` is an implementation of `GaugeVecSet` for kubernetes operators. It enables -controllers to record metrics for it's kubernetes `metav1.Conditions` on custom resources. - -It is inspired by kube-state-metrics patterns for metrics such as `kube_pod_status_phase`. KSM exports one time series -per phase for each (namespace, pod), and marks exactly one as active (1) while the others are inactive (0). This metric -can be thought of as a `GaugeVecSet` with the index label `namespace`, the group `pod` and the `extra` labels -(i.e. variants per group) as the options for `phase`. - -Example: - -``` -kube_pod_status_phase{namespace="default", pod="nginx", phase="Running"} 1 -kube_pod_status_phase{namespace="default", pod="nginx", phase="Pending"} 0 -kube_pod_status_phase{namespace="default", pod="nginx", phase="Failed"} 0 -``` - -We adopt the same pattern for controller Conditions, but we export only one time series per (status, reason) variant, -meaning we delete all other variants in the group when we set the metric, ensuring the cardinality stays under control. -Additionally, rather than return 1/0 indicating the activeness of the metric, we set the last transition time of the -condition as the value (unix timestamp). - -Example metric: - -``` -operator_controller_condition{ - controller="my_controller", - resource_kind="MyCR", - resource_name="my-cr", - resource_namespace="default", - condition="Ready", - status="False", - reason="FailedToProvision" -} 17591743210 -``` - -- **Index**: controller, resource_kind, resource_name, resource_namespace -- **Group**: condition -- **Extra**: status, reason -- **Metric Value**: Unix timestamp of last transition of given condition - -### Initialization - -The metric should be initialized and registered once. - -You can embed the `ControllerMetricsRecorder` in your controller's recorder. - -```go -package my_metrics - -import ( - controllermetrics "sigs.k8s.io/controller-runtime/pkg/metrics" - ocg "github.com/sourcehawk/go-prometheus-gaugevecset/pkg/operator_condition_metrics" -) - -// We need this variable later to create the ConditionMetricsRecorder -var OperatorConditionsGauge *ocg.OperatorConditionsGauge - -// Initialize the operator condition gauge once -func init() { - OperatorConditionsGauge = ocg.NewOperatorConditionsGauge("my-operator") - controllermetrics.Registry.MustRegister(OperatorConditionsGauge) -} - -// Embed in existing metrics recorder -type MyControllerRecorder struct { - ocg.ConditionMetricRecorder -} -``` - -When constructing your reconciler, initialize the condition metrics recorder with the -operator conditions gauge and a unique name for each controller. - -_cmd/main.go_ -```go -package main - -import ( - mymetrics "path/to/pkg/my_metrics" - ocg "github.com/sourcehawk/go-prometheus-gaugevecset/pkg/operator_condition_metrics" -) - -func main() { - // ... - recorder := mymetrics.MyControllerRecorder{ - ConditionMetricRecorder: ocg.ConditionMetricRecorder{ - Controller: "my-controller", // unique name per reconciler - OperatorConditionsGauge: mymetrics.OperatorConditionsGauge, - }, - } - - reconciler := &MyReconciler{ - Recorder: recorder, - } - // ... -} -``` - -## Usage - -The easiest drop-in way to start using the metrics recorder is by creating a `SetStatusCondition` wrapper, which -comes instead of `meta.SetStatusCondition`. - -To delete the metrics for a given custom resource, simply call `RemoveConditionsFor` and pass the object. - -```go -const ( - kind = "MyCR" -) - -// SetStatusCondition utility function which replaces and wraps meta.SetStatusCondition calls -func (r *MyReconciler) SetStatusCondition(cr *v1.MyCR, cond metav1.Condition) bool { - changed := meta.SetStatusCondition(&cr.Status.Conditions, cond) - if changed { - // refetch the condition to get the updated version - updated := meta.FindStatusCondition(cr.Status.Conditions, cond.Type) - if updated != nil { - r.Recorder.RecordConditionFor( - kind, cr, updated.Type, string(updated.Status), updated.Reason, updated.LastTransitionTime, - ) - } - } - return changed -} - -func (r *MyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - // Get the resource we're reconciling - cr := new(v1.MyCR) - if err = r.Get(ctx, req.NamespacedName, cr); err != nil { - return ctrl.Result{}, client.IgnoreNotFound(err) - } - - // Remove the metrics when the CR is deleted - if cr.DeletionTimestamp != nil { - r.Recorder.RemoveConditionsFor(kind, cr) - } - - // ... - - // Update the status conditions using the recorder (it records the metric if changed) - if r.SetStatusCondition(cr, condition) { - if err = r.Status().Update(ctx, cr); err != nil { - return ctrl.Result{}, err - } - } - - return ctrl.Result{}, nil -} -``` +``` \ No newline at end of file diff --git a/pkg/operator_condition_metrics/operator_condition_metrics.go b/pkg/operator_condition_metrics/operator_condition_metrics.go deleted file mode 100644 index c017d1b..0000000 --- a/pkg/operator_condition_metrics/operator_condition_metrics.go +++ /dev/null @@ -1,202 +0,0 @@ -package operator_condition_metrics - -import ( - "time" - - metrics "github.com/sourcehawk/go-prometheus-gaugevecset/pkg/gauge_vec_set" -) - -/* -Inspired by kube-state-metrics enum-style patterns: - -kube-state-metrics models enum-like states (e.g., Pod phase) by exporting one time series per variant, -and marking exactly one as active (1) while the others are inactive (0). Example: - - kube_pod_status_phase{namespace="default", pod="nginx", phase="Running"} 1 - kube_pod_status_phase{namespace="default", pod="nginx", phase="Pending"} 0 - kube_pod_status_phase{namespace="default", pod="nginx", phase="Failed"} 0 - -We adopt the same pattern for controller Conditions, but we export one time series per (status, reason) variant -and enforce **exclusivity per condition**. The value of the metric we set is also the last transition time of the -condition. - -For any given (controller, kind, name, namespace, condition) exactly one (status, reason) series is present at a time. -All other variants are **deleted**. This keeps cardinality under control. - -Metric - _controller_condition - -Labels (order matches registration) - - controller: controller name (e.g., "my-operator") - - resource_kind: resource kind (e.g., "MyCRD") - - resource_name: resource name - - resource_namespace: resource namespace ("" for cluster-scoped) - - condition: condition type (e.g., "Ready", "Reconciled") - - status: "True" | "False" | "Unknown" - - reason: short machine-typed reason (often "" when status="True") - -Value - - The timestamp of last transition time for the condition - -Examples: - -1. Resource becomes Ready (True): - - my_controller_condition{ - controller="my-operator", - resource_kind="MyCRD", - resource_name="my-cr-1", - resource_namespace="prod", - condition="Ready", - status="True", - reason="" - } 1759174202 - -(Other status/reason variants for this condition are removed.) - -2. Transition: Ready to false - - // Previous series is removed - // New series becomes active: - my_controller_condition{ - ..., - condition="Ready", - status="False", - reason="Failed" - } 1759174205 - -3. Another condition can be active simultaneously (different group): - - my_controller_condition{ - ..., - condition="Synchronized", - status="True", - reason="" - } 17591743210 - -Cleanup - When the resource is deleted/pruned, all series for its index key - (controller, kind, resource_name, resource_namespace) are removed via DeleteByIndex(). - -Implementation - Backed by a GaugeVecSet with: - indexLabels = [controller, resource_kind, resource_name, resource_namespace] - groupLabels = [condition] - extraLabels = [status, reason] - Exclusivity is enforced with SetGroup(), which deletes sibling series. -*/ - -const ( - operatorConditionMetricSubsystem = "controller" - operatorConditionMetricName = "condition" - operatorConditionMetricHelp = "Condition status for a custom resource; one active (status,reason) time series per (controller,kind,name,namespace,condition)." -) - -var ( - indexLabels = []string{"controller", "resource_kind", "resource_name", "resource_namespace"} - groupLabels = []string{"condition"} - extraLabels = []string{"status", "reason"} -) - -type OperatorConditionsGauge struct { - *metrics.GaugeVecSet -} - -// NewOperatorConditionsGauge creates a new OperatorConditionsGauge for an operator. -// Initialize once (e.g., in your package init or setup) -// -// var OperatorConditionsGauge *OperatorConditionsGauge = nil -// -// func init() { -// OperatorConditionsGauge = NewOperatorConditionsGauge("my-operator") -// controllermetrics.Registry.MustRegister(OperatorConditionsGauge) -// } -func NewOperatorConditionsGauge(metricNamespace string) *OperatorConditionsGauge { - return &OperatorConditionsGauge{ - metrics.NewGaugeVecSet( - metricNamespace, - operatorConditionMetricSubsystem, - operatorConditionMetricName, - operatorConditionMetricHelp, - indexLabels, - groupLabels, - extraLabels..., - ), - } -} - -type ObjectLike interface { - GetName() string - GetNamespace() string -} - -// ConditionMetricRecorder records metrics for Kubernetes style `metav1.Condition` -// objects on custom resources, using a Prometheus gauge. -// -// Usage: -// -// # Embed in your custom recorder or reconciler -// -// type MyRecorder struct { -// gvs.ConditionMetricRecorder -// } -// -// r := MyControllerRecorder{ -// ConditionMetricRecorder: gvs.ConditionMetricRecorder{ -// Controller: "my-controller", -// OperatorConditionsGauge: my_metrics.OperatorConditionsGauge, -// }, -// } -// -// r.RecordConditionFor(kind, obj, cond.Type, string(cond.Status), cond.Reason) -// r.RemoveConditionsFor(kind, obj) -type ConditionMetricRecorder struct { - // The name of the controller the condition metrics are for - Controller string - // The OperatorConditionsGauge initialized by NewOperatorConditionsGauge - OperatorConditionsGauge *OperatorConditionsGauge -} - -// RecordConditionFor sets a condition metric for a given controller and object. -// -// It enforces exclusivity within the same (controller, name, namespace, condition) group, -// ensuring that only the latest (status, phase) is present for a given condition type. -// -// If the lastTransitionTime is zero, the value of the metric is set to the unix timestamp for time.Now().UTC() -// -// The following label values are set: -// -// - controller: the controller name reporting the condition -// - resource_kind: object kind -// - resource_name: object name -// - resource_namespace: object namespace -// - condition: condition type (e.g., "Ready", "Reconciled") -// - status: condition status ("True", "False", "Unknown") -// - reason: short reason string -// -// Example: -// -// r.RecordConditionFor(kind, obj, "Ready", "True", "AppReady", lastTransitionTime) -func (r *ConditionMetricRecorder) RecordConditionFor( - kind string, object ObjectLike, - conditionType, conditionStatus, conditionReason string, lastTransitionTime time.Time, -) { - indexValues := []string{r.Controller, kind, object.GetName(), object.GetNamespace()} - groupValues := []string{conditionType} - extraValues := []string{conditionStatus, conditionReason} - - if lastTransitionTime.IsZero() { - lastTransitionTime = time.Now().UTC() - } - - r.OperatorConditionsGauge.SetGroup(float64(lastTransitionTime.Unix()), indexValues, groupValues, extraValues...) -} - -// RemoveConditionsFor deletes all condition metrics for a given resource. -// This removes all condition types (e.g., Ready, Reconciled) for the resource in one call. -// -// Typically called when the object is deleted or no longer relevant to the controller (Deletion reconcile). -// Returns the number of time series deleted. -func (r *ConditionMetricRecorder) RemoveConditionsFor(kind string, object ObjectLike) (removed int) { - return r.OperatorConditionsGauge.DeleteByIndex(r.Controller, kind, object.GetName(), object.GetNamespace()) -} diff --git a/pkg/operator_condition_metrics/operator_condition_metrics_benchmark_test.go b/pkg/operator_condition_metrics/operator_condition_metrics_benchmark_test.go deleted file mode 100644 index 91344f0..0000000 --- a/pkg/operator_condition_metrics/operator_condition_metrics_benchmark_test.go +++ /dev/null @@ -1,176 +0,0 @@ -package operator_condition_metrics - -import ( - "bytes" - "fmt" - "testing" - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/common/expfmt" -) - -/* -Run: - go test -run '^$' -bench . -benchtime=10000x -benchmem ./pkg/operator_condition_metrics -*/ - -// Let's benchmark against a somewhat realistic high usage scenario -const ( - controllerCount = 10 - resourcesPerController = 200 - conditionsPerController = 3 - variantsPerCondition = 10 - // Maximum total time series variants: 10 * 200 * 3 * 10 = 60k - // In our configuration however, we expect only one variant per condition to be exported. - // Maximum total exported time series: 10 * 200 * 3 * 1 = 6k - maxCardinality = controllerCount * resourcesPerController * conditionsPerController * variantsPerCondition -) - -func generatedName(prefix string, i int) string { - return fmt.Sprintf("%s%d", prefix, i) -} - -type FakeObject struct { - Name string - Namespace string -} - -func (f *FakeObject) GetName() string { - return f.Name -} - -func (f *FakeObject) GetNamespace() string { - return f.Namespace -} - -type FakeCondition struct { - Type string - Status string - Reason string -} - -func createBenchmarkScenario(tb testing.TB, registry *prometheus.Registry) *ConditionMetricRecorder { - tb.Helper() - - ns := "bench_ns_" + generatedName("", tb.(*testing.B).N) - gauge := NewOperatorConditionsGauge(ns) - _ = registry.Register(gauge) - tb.Cleanup(func() { - registry.Unregister(gauge) - }) - - rec := &ConditionMetricRecorder{ - Controller: "my-controller", - OperatorConditionsGauge: gauge, - } - - obj := &FakeObject{} - transitionTime := time.Now().UTC() - - condition := &FakeCondition{ - Status: "True", // doesn't matter, cardinality decided by Reason - } - - for i := 0; i < controllerCount; i++ { - kind := generatedName("Controller", i) - - for j := 0; j < resourcesPerController; j++ { - obj.Name = generatedName("Resource", j) - obj.Namespace = generatedName("namespace", j) - - for k := 0; k < conditionsPerController; k++ { - condition.Type = generatedName("condition", k) - - for v := 0; v < variantsPerCondition; v++ { - condition.Reason = generatedName("variant", v) - rec.RecordConditionFor(kind, obj, condition.Type, condition.Reason, condition.Reason, transitionTime) - } - } - } - } - - return rec -} - -// Benchmark the average time per call on a pre-populated scenario: -// - RecordConditionFor -// - RemoveConditionsFor -// -// Reports: ns/op for each sub-benchmark. -func Benchmark_ConditionMetricsRecorder_TimePerCall(b *testing.B) { - reg := prometheus.NewRegistry() - rec := createBenchmarkScenario(b, reg) - - // Use a stable object that exists in the populated dataset. - kind := "Benchmark" - obj := &FakeObject{ - Name: "Resource0", - Namespace: "namespace0", - } - transitionTime := time.Now().UTC() - - // Two variants in the same (controller,kind,name,namespace,condition) group. - condTrue := &FakeCondition{ - Type: "condition0", - Status: "True", - Reason: "variant0", - } - condFalse := &FakeCondition{ - Type: "condition0", - Status: "False", - Reason: "variant0", - } - - b.Run("RecordConditionFor", func(b *testing.B) { - b.ReportAllocs() - b.ResetTimer() - b.ReportMetric(float64(maxCardinality), "series/op") - for i := 0; i < b.N; i++ { - // Flip between two variants - if (i & 1) == 0 { - rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason, transitionTime) - } else { - rec.RecordConditionFor(kind, obj, condFalse.Type, condFalse.Status, condFalse.Reason, transitionTime) - } - } - }) - - b.Run("RemoveConditionsFor", func(b *testing.B) { - b.ReportAllocs() - b.ResetTimer() - b.ReportMetric(float64(maxCardinality), "series/op") - for i := 0; i < b.N; i++ { - // Ensure there is something to remove, but do not count the set time. - b.StopTimer() - rec.RecordConditionFor(kind, obj, condTrue.Type, condTrue.Status, condTrue.Reason, transitionTime) - b.StartTimer() - - rec.RemoveConditionsFor(kind, obj) - } - }) -} - -// Benchmark the size of the Prometheus gather output on a pre-populated scenario. -// -// Reports: Metric size in KB retrieved from the registry. -func Benchmark_ConditionMetricsRecorder_PrometheusMemorySize(b *testing.B) { - reg := prometheus.NewRegistry() - _ = createBenchmarkScenario(b, reg) - - b.ReportAllocs() - b.ResetTimer() - b.ReportMetric(float64(maxCardinality), "series/op") - - mfs, err := reg.Gather() - if err != nil { - b.Fatalf("gather: %v", err) - } - var buf bytes.Buffer - for _, mf := range mfs { - _, _ = expfmt.MetricFamilyToText(&buf, mf) - } - sizeKB := float64(buf.Len()) / 1024.0 - - b.ReportMetric(sizeKB, "KB") -} diff --git a/pkg/operator_condition_metrics/operator_condition_metrics_test.go b/pkg/operator_condition_metrics/operator_condition_metrics_test.go deleted file mode 100644 index 8273ac2..0000000 --- a/pkg/operator_condition_metrics/operator_condition_metrics_test.go +++ /dev/null @@ -1,131 +0,0 @@ -package operator_condition_metrics - -import ( - "strings" - "testing" - "time" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/testutil" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func makeObj(name, namespace string) *FakeObject { - return &FakeObject{ - Name: name, - Namespace: namespace, - } -} - -func TestConditionMetricRecorder_Record_Transition_And_SecondCondition(t *testing.T) { - gauge := NewOperatorConditionsGauge("test_record_transition_and_second_condition") - reg := prometheus.NewRegistry() - _ = reg.Register(gauge) - - // Arrange - rec := &ConditionMetricRecorder{ - Controller: "my-controller", - OperatorConditionsGauge: gauge, - } - kind := "MyCRD" - name := "cr-1" - ns := "prod" - transitionTime := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC) - - obj := makeObj(name, ns) - - // Record Ready=True - rec.RecordConditionFor(kind, obj, "Ready", "True", "", transitionTime) - - // Flip Ready -> False with reason - rec.RecordConditionFor(kind, obj, "Ready", "False", "Failed", transitionTime) - - // Another condition Synchronized=True (independent group) - rec.RecordConditionFor(kind, obj, "Synchronized", "True", "", transitionTime) - - // Expect: Ready False(reason)=1, Synchronized True=1 - want := ` -# HELP test_record_transition_and_second_condition_controller_condition Condition status for a custom resource; one active (status,reason) time series per (controller,kind,name,namespace,condition). -# TYPE test_record_transition_and_second_condition_controller_condition gauge -test_record_transition_and_second_condition_controller_condition{condition="Ready",controller="my-controller",reason="Failed",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="False"} 1735689600 -test_record_transition_and_second_condition_controller_condition{condition="Synchronized",controller="my-controller",reason="",resource_kind="MyCRD",resource_name="cr-1",resource_namespace="prod",status="True",} 1735689600 -` - require.NoError(t, - testutil.GatherAndCompare( - reg, - strings.NewReader(want), - "test_record_transition_and_second_condition_controller_condition", - ), - ) - - removed := rec.RemoveConditionsFor(kind, obj) - assert.Equal(t, 2, removed) -} - -func TestConditionMetricRecorder_RemoveConditionsFor(t *testing.T) { - gauge := NewOperatorConditionsGauge("test_remove_conditions_for_condition") - reg := prometheus.NewRegistry() - _ = reg.Register(gauge) - // Arrange - rec := &ConditionMetricRecorder{ - Controller: "my-controller", - OperatorConditionsGauge: gauge, - } - kind := "MyCRD" - name := "cr-2" - ns := "staging" - transitionTime := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC) - obj := makeObj(name, ns) - - rec.RecordConditionFor(kind, obj, "Ready", "True", "", transitionTime) - rec.RecordConditionFor(kind, obj, "Synchronized", "False", "SyncPending", transitionTime) - - // Remove all condition series for this object - removed := rec.RemoveConditionsFor(kind, obj) - assert.Equal(t, 2, removed) - - // No series remain for this object - require.NoError(t, - testutil.GatherAndCompare( - reg, - strings.NewReader(""), - "test_remove_conditions_for_condition_controller_condition", - ), - ) -} - -func TestConditionMetricRecorder_SetsKindLabelFromObject(t *testing.T) { - gauge := NewOperatorConditionsGauge("test_sets_kind_label_from_object") - reg := prometheus.NewRegistry() - _ = reg.Register(gauge) - ctrl := "my-controller" - rec := &ConditionMetricRecorder{ - Controller: ctrl, - OperatorConditionsGauge: gauge, - } - kind := "FancyKind" - name := "obj-1" - ns := "ns-1" - transitionTime := time.Date(2025, time.January, 1, 0, 0, 0, 0, time.UTC) - obj := makeObj(name, ns) - - // Record a condition - rec.RecordConditionFor(kind, obj, "Ready", "True", "", transitionTime) - - // Expect the 'kind' label to reflect the object's Kind - want := ` -# HELP test_sets_kind_label_from_object_controller_condition Condition status for a custom resource; one active (status,reason) time series per (controller,kind,name,namespace,condition). -# TYPE test_sets_kind_label_from_object_controller_condition gauge -test_sets_kind_label_from_object_controller_condition{condition="Ready",controller="my-controller",reason="",resource_kind="FancyKind",resource_name="obj-1",resource_namespace="ns-1",status="True"} 1735689600 -` - require.NoError(t, - testutil.GatherAndCompare( - reg, - strings.NewReader(want), - "test_sets_kind_label_from_object_controller_condition", - ), - ) - - assert.Equal(t, 1, gauge.DeleteByIndex(ctrl, kind, name, ns)) -}