diff --git a/cmd/proxy/cmd/metrics.go b/cmd/proxy/cmd/metrics.go
new file mode 100644
index 000000000..206b6bb7b
--- /dev/null
+++ b/cmd/proxy/cmd/metrics.go
@@ -0,0 +1,68 @@
+// Copyright 2021 Sorint.lab
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// Prometheus metrics exposed by the proxy; init registers each of them with
+// the default registry.
+var (
+	proxyHealthGauge = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name: "stolon_proxy_health",
+			Help: "Set to 1 if proxy healthy and accepting connections",
+		},
+	)
+
+	clusterdataLastValidUpdateSeconds = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name: "stolon_proxy_clusterdata_last_valid_update_seconds",
+			Help: "Last time we received a valid clusterdata from our store as seconds since unix epoch",
+		},
+	)
+
+	proxyListenerStartedSeconds = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name: "stolon_proxy_listener_started_seconds",
+			Help: "Last time we started the proxy listener as seconds since unix epoch",
+		},
+	)
+
+	getClusterInfoErrors = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "stolon_proxy_get_cluster_info_errors",
+			Help: "Count of failed getting and parsing cluster info operations",
+		},
+	)
+
+	updateProxyInfoErrors = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "stolon_proxy_update_proxy_info_errors",
+			Help: "Count of update proxyInfo failures",
+		},
+	)
+)
+
+// init registers every proxy metric with the default Prometheus registry.
+// MustRegister panics if a registration fails.
+func init() {
+	prometheus.MustRegister(proxyHealthGauge)
+	prometheus.MustRegister(clusterdataLastValidUpdateSeconds)
+	prometheus.MustRegister(proxyListenerStartedSeconds)
+	prometheus.MustRegister(getClusterInfoErrors)
+	prometheus.MustRegister(updateProxyInfoErrors)
+}
diff --git a/cmd/proxy/cmd/proxy.go b/cmd/proxy/cmd/proxy.go
index 39f1f3b99..29711dd65 100644
--- a/cmd/proxy/cmd/proxy.go
+++ b/cmd/proxy/cmd/proxy.go
@@ -148,6 +148,8 @@ func (c *ClusterChecker) startPollonProxy() error {
 		c.endPollonProxyCh <- c.pp.Start()
 	}()
 
+	proxyHealthGauge.Set(1)
+	proxyListenerStartedSeconds.SetToCurrentTime()
 	return nil
 }
 
@@ -160,6 +162,7 @@ func (c *ClusterChecker) stopPollonProxy() {
 		c.pp = nil
 		c.listener.Close()
 		c.listener = nil
+		proxyHealthGauge.Set(0)
 	}
 }
 
@@ -190,6 +193,7 @@ func (c *ClusterChecker) SetProxyInfo(e store.Store, generation int64, proxyTime
 func (c *ClusterChecker) Check() error {
 	cd, _, err := c.e.GetClusterData(context.TODO())
 	if err != nil {
+		getClusterInfoErrors.Inc()
 		return fmt.Errorf("cannot get cluster data: %v", err)
 	}
 
@@ -206,13 +210,20 @@ func (c *ClusterChecker) Check() error {
 	}
 	if cd.FormatVersion != cluster.CurrentCDFormatVersion {
 		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
+		getClusterInfoErrors.Inc()
 		return fmt.Errorf("unsupported clusterdata format version: %d", cd.FormatVersion)
 	}
 	if err = cd.Cluster.Spec.Validate(); err != nil {
 		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
+		getClusterInfoErrors.Inc()
 		return fmt.Errorf("clusterdata validation failed: %v", err)
 	}
 
+	// Mark that the clusterdata we've received is valid. We'll use this metric to detect
+	// when our store is failing to serve a valid clusterdata, so it's important we only
+	// update the metric here.
+	clusterdataLastValidUpdateSeconds.SetToCurrentTime()
+
 	cdProxyCheckInterval := cd.Cluster.DefSpec().ProxyCheckInterval.Duration
 	cdProxyTimeout := cd.Cluster.DefSpec().ProxyTimeout.Duration
 
@@ -231,6 +242,7 @@ func (c *ClusterChecker) Check() error {
 		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
 		// ignore errors on setting proxy info
 		if err = c.SetProxyInfo(c.e, cluster.NoGeneration, proxyTimeout); err != nil {
+			updateProxyInfoErrors.Inc()
 			log.Errorw("failed to update proxyInfo", zap.Error(err))
 		} else {
 			// update proxyCheckinterval and proxyTimeout only if we successfully updated our proxy info
@@ -248,6 +260,7 @@ func (c *ClusterChecker) Check() error {
 		c.sendPollonConfData(pollon.ConfData{DestAddr: nil})
 		// ignore errors on setting proxy info
 		if err = c.SetProxyInfo(c.e, proxy.Generation, proxyTimeout); err != nil {
+			updateProxyInfoErrors.Inc()
 			log.Errorw("failed to update proxyInfo", zap.Error(err))
 		} else {
 			// update proxyCheckinterval and proxyTimeout only if we successfully updated our proxy info
@@ -271,6 +284,7 @@ func (c *ClusterChecker) Check() error {
 		// cannot ignore this error since the sentinel won't know that we exist
 		// and are sending connections to a master so, when electing a new
 		// master, it'll not wait for us to close connections to the old one.
+		updateProxyInfoErrors.Inc()
 		return fmt.Errorf("failed to update proxyInfo: %v", err)
 	} else {
 		// update proxyCheckinterval and proxyTimeout only if we successfully updated our proxy info