Skip to content

Commit 90208b4

Browse files
committed
health/alerts endpoint: brush up old PR
1 parent 4bcb91e commit 90208b4

14 files changed

+91
-66
lines changed

api/rest/client/client.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,9 @@ type Client interface {
9696
// Otherwise, it happens everywhere.
9797
RecoverAll(ctx context.Context, local bool) ([]*api.GlobalPinInfo, error)
9898

99-
// Alerts returns things that are wrong with cluster.
100-
Alerts(ctx context.Context) (map[string]api.Alert, error)
99+
// Alerts returns information health events in the cluster (expired
100+
// metrics etc.).
101+
Alerts(ctx context.Context) ([]*api.Alert, error)
101102

102103
// Version returns the ipfs-cluster peer's version.
103104
Version(context.Context) (*api.Version, error)
@@ -169,7 +170,7 @@ type Config struct {
169170
}
170171

171172
// AsTemplateFor creates client configs from resolved multiaddresses
172-
func (c *Config) AsTemplateFor(addrs []ma.Multiaddr) ([]*Config) {
173+
func (c *Config) AsTemplateFor(addrs []ma.Multiaddr) []*Config {
173174
var cfgs []*Config
174175
for _, addr := range addrs {
175176
cfg := *c
@@ -391,7 +392,7 @@ func resolveAddr(ctx context.Context, addr ma.Multiaddr) ([]ma.Multiaddr, error)
391392
if err != nil {
392393
return nil, err
393394
}
394-
395+
395396
if len(resolved) == 0 {
396397
return nil, fmt.Errorf("resolving %s returned 0 results", addr)
397398
}

api/rest/client/lbclient.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -301,8 +301,8 @@ func (lc *loadBalancingClient) RecoverAll(ctx context.Context, local bool) ([]*a
301301
}
302302

303303
// Alerts returns things that are wrong with cluster.
304-
func (lc *loadBalancingClient) Alerts(ctx context.Context) (map[string]api.Alert, error) {
305-
var alerts map[string]api.Alert
304+
func (lc *loadBalancingClient) Alerts(ctx context.Context) ([]*api.Alert, error) {
305+
var alerts []*api.Alert
306306
call := func(c Client) error {
307307
var err error
308308
alerts, err = c.Alerts(ctx)

api/rest/client/methods.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -274,13 +274,14 @@ func (c *defaultClient) RecoverAll(ctx context.Context, local bool) ([]*api.Glob
274274
return gpis, err
275275
}
276276

277-
// Alerts returns things that are wrong with cluster.
278-
func (c *defaultClient) Alerts(ctx context.Context) (map[string]api.Alert, error) {
277+
// Alerts returns information health events in the cluster (expired metrics
278+
// etc.).
279+
func (c *defaultClient) Alerts(ctx context.Context) ([]*api.Alert, error) {
279280
ctx, span := trace.StartSpan(ctx, "client/Alert")
280281
defer span.End()
281282

282-
var alerts map[string]api.Alert
283-
err := c.do(ctx, "GET", "/alerts", nil, nil, &alerts)
283+
var alerts []*api.Alert
284+
err := c.do(ctx, "GET", "/health/alerts", nil, nil, &alerts)
284285
return alerts, err
285286
}
286287

api/rest/client/methods_test.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -427,9 +427,8 @@ func TestAlerts(t *testing.T) {
427427
if len(alerts) != 1 {
428428
t.Fatal("expected 1 alert")
429429
}
430-
pID2 := peer.IDB58Encode(test.PeerID2)
431-
_, ok := alerts[pID2]
432-
if !ok {
430+
pID2 := peer.Encode(test.PeerID2)
431+
if alerts[0].Peer != test.PeerID2 {
433432
t.Errorf("expected an alert from %s", pID2)
434433
}
435434
}

api/rest/restapi.go

+7-7
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,12 @@ func (api *API) routes() []route {
466466
"/health/graph",
467467
api.graphHandler,
468468
},
469+
{
470+
"Alerts",
471+
"GET",
472+
"/health/alerts",
473+
api.alertsHandler,
474+
},
469475
{
470476
"Metrics",
471477
"GET",
@@ -478,12 +484,6 @@ func (api *API) routes() []route {
478484
"/monitor/metrics",
479485
api.metricNamesHandler,
480486
},
481-
{
482-
"Alerts",
483-
"GET",
484-
"/alerts",
485-
api.alertsHandler,
486-
},
487487
}
488488
}
489489

@@ -666,7 +666,7 @@ func (api *API) metricNamesHandler(w http.ResponseWriter, r *http.Request) {
666666
}
667667

668668
func (api *API) alertsHandler(w http.ResponseWriter, r *http.Request) {
669-
var alerts map[string]types.Alert
669+
var alerts []types.Alert
670670
err := api.rpcClient.CallContext(
671671
r.Context(),
672672
"",

api/rest/restapi_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -855,8 +855,8 @@ func TestAPIAlertsEndpoint(t *testing.T) {
855855
defer rest.Shutdown(ctx)
856856

857857
tf := func(t *testing.T, url urlF) {
858-
var resp map[string]api.Alert
859-
makeGet(t, rest, url(rest)+"/alerts", &resp)
858+
var resp []api.Alert
859+
makeGet(t, rest, url(rest)+"/health/alerts", &resp)
860860
if len(resp) != 1 {
861861
t.Error("expected one alert")
862862
}

api/types.go

+3-5
Original file line numberDiff line numberDiff line change
@@ -1069,12 +1069,10 @@ func (es MetricSlice) Less(i, j int) bool {
10691069
return es[i].Peer < es[j].Peer
10701070
}
10711071

1072-
// Alert carries alerting information about a peer. WIP.
1072+
// Alert carries alerting information about a peer.
10731073
type Alert struct {
1074-
Peer peer.ID `json:"peer" codec:"p"`
1075-
MetricName string `json:"metric_name" codec:"m"`
1076-
Expiry int64 `json:"expiry" codec:"e"`
1077-
Value string `json:"value" codec:"v"`
1074+
Metric
1075+
TriggeredAt time.Time `json:"triggered_at" codec:"r,omitempty"`
10781076
}
10791077

10801078
// Error can be used by APIs to return errors.

cluster.go

+17-15
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ const (
4242
bootstrapCount = 3
4343
reBootstrapInterval = 30 * time.Second
4444
mdnsServiceTag = "_ipfs-cluster-discovery._udp"
45+
maxAlerts = 1000
4546
)
4647

4748
var (
@@ -74,7 +75,7 @@ type Cluster struct {
7475
informers []Informer
7576
tracer Tracer
7677

77-
alerts map[string]api.Alert
78+
alerts []api.Alert
7879
alertsMux sync.Mutex
7980

8081
doneCh chan struct{}
@@ -163,7 +164,7 @@ func NewCluster(
163164
allocator: allocator,
164165
informers: informers,
165166
tracer: tracer,
166-
alerts: make(map[string]api.Alert),
167+
alerts: []api.Alert{},
167168
peerManager: peerManager,
168169
shutdownB: false,
169170
removed: false,
@@ -388,16 +389,16 @@ func (c *Cluster) pushPingMetrics(ctx context.Context) {
388389
}
389390
}
390391

391-
// Alerts returns things that are wrong with the cluster.
392-
func (c *Cluster) Alerts() map[string]api.Alert {
393-
alerts := make(map[string]api.Alert)
392+
// Alerts returns the last alerts recorded by this cluster peer with the most
393+
// recent first.
394+
func (c *Cluster) Alerts() []api.Alert {
395+
alerts := make([]api.Alert, len(c.alerts), len(c.alerts))
394396

395397
c.alertsMux.Lock()
396398
{
397-
for i, alert := range c.alerts {
398-
if time.Now().Before(time.Unix(0, alert.Expiry)) {
399-
alerts[i] = alert
400-
}
399+
total := len(alerts)
400+
for i, a := range c.alerts {
401+
alerts[total-1-i] = a
401402
}
402403
}
403404
c.alertsMux.Unlock()
@@ -418,17 +419,18 @@ func (c *Cluster) alertsHandler() {
418419
continue
419420
}
420421

421-
logger.Warnf("metric alert for %s: Peer: %s.", alrt.MetricName, alrt.Peer)
422+
logger.Warnf("metric alert for %s: Peer: %s.", alrt.Name, alrt.Peer)
422423
c.alertsMux.Lock()
423-
for pID, alert := range c.alerts {
424-
if time.Now().After(time.Unix(0, alert.Expiry)) {
425-
delete(c.alerts, pID)
424+
{
425+
if len(c.alerts) > maxAlerts {
426+
c.alerts = c.alerts[:0]
426427
}
428+
429+
c.alerts = append(c.alerts, *alrt)
427430
}
428-
c.alerts[peer.IDB58Encode(alrt.Peer)] = *alrt
429431
c.alertsMux.Unlock()
430432

431-
if alrt.MetricName != pingMetricName {
433+
if alrt.Name != pingMetricName {
432434
continue // only handle ping alerts
433435
}
434436

cmd/ipfs-cluster-ctl/formatters.go

+14-3
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ func textFormatObject(resp interface{}) {
6666
textFormatPrintError(r)
6767
case *api.Metric:
6868
textFormatPrintMetric(r)
69+
case *api.Alert:
70+
textFormatPrintAlert(r)
6971
case []*api.ID:
7072
for _, item := range r {
7173
textFormatObject(item)
@@ -96,9 +98,9 @@ func textFormatObject(resp interface{}) {
9698
for _, item := range r {
9799
textFormatObject(item)
98100
}
99-
case map[string]api.Alert:
100-
for i := range resp.(map[string]api.Alert) {
101-
fmt.Printf("peer is down: %s\n", i)
101+
case []*api.Alert:
102+
for _, item := range r {
103+
textFormatObject(item)
102104
}
103105
default:
104106
checkErr("", errors.New("unsupported type returned"))
@@ -244,6 +246,15 @@ func textFormatPrintMetric(obj *api.Metric) {
244246
fmt.Printf("%s | %s | Expires in: %s\n", peer.Encode(obj.Peer), obj.Name, humanize.Time(time.Unix(0, obj.Expire)))
245247
}
246248

249+
func textFormatPrintAlert(obj *api.Alert) {
250+
fmt.Printf("%s: %s. Expired at: %s. Triggered at: %s\n",
251+
obj.Peer,
252+
obj.Name,
253+
humanize.Time(time.Unix(0, obj.Expire)),
254+
humanize.Time(obj.TriggeredAt),
255+
)
256+
}
257+
247258
func textFormatPrintGlobalRepoGC(obj *api.GlobalRepoGC) {
248259
peers := make(sort.StringSlice, 0, len(obj.PeerMap))
249260
for peer := range obj.PeerMap {

cmd/ipfs-cluster-ctl/main.go

+18-10
Original file line numberDiff line numberDiff line change
@@ -970,16 +970,24 @@ but usually are:
970970
return nil
971971
},
972972
},
973-
},
974-
},
975-
{
976-
Name: "alerts",
977-
Usage: "Show things which are wrong with this cluster",
978-
Description: "Show things which are wrong with this cluster",
979-
Action: func(c *cli.Context) error {
980-
resp, cerr := globalClient.Alerts(ctx)
981-
formatResponse(c, resp, cerr)
982-
return nil
973+
{
974+
Name: "alerts",
975+
Usage: "List the latest expired metric alerts",
976+
Description: `
977+
This command provides a list of "alerts" that the cluster has seen.
978+
979+
An alert is triggered when one of the metrics seen for a peer expires, and no
980+
new metrics have been received.
981+
982+
Different alerts may be handled in different ways. i.e. ping alerts may
983+
trigger automatic repinnings if configured.
984+
`,
985+
Action: func(c *cli.Context) error {
986+
resp, cerr := globalClient.Alerts(ctx)
987+
formatResponse(c, resp, cerr)
988+
return nil
989+
},
990+
},
983991
},
984992
},
985993
{

monitor/metrics/checker.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,11 @@ func (mc *Checker) alert(pid peer.ID, metricName string) error {
112112
}
113113

114114
failedMetrics[metricName]++
115+
lastMetric := mc.metrics.PeerLatest(metricName, pid)
115116

116117
alrt := &api.Alert{
117-
Peer: pid,
118-
MetricName: metricName,
119-
Expiry: time.Now().Add(30 * time.Second).UnixNano(),
118+
Metric: *lastMetric,
119+
TriggeredAt: time.Now(),
120120
}
121121
select {
122122
case mc.alertCh <- alrt:

monitor/pubsubmon/pubsubmon_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ func TestPeerMonitorAlerts(t *testing.T) {
303303
case <-timeout.C:
304304
t.Fatal("should have thrown an alert by now")
305305
case alrt := <-pm.Alerts():
306-
if alrt.MetricName != "test" {
306+
if alrt.Name != "test" {
307307
t.Error("Alert should be for test")
308308
}
309309
if alrt.Peer != test.PeerID1 {

rpc_api.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ func (rpcapi *ClusterRPCAPI) SendInformersMetrics(ctx context.Context, in struct
427427
}
428428

429429
// Alerts runs Cluster.Alerts().
430-
func (rpcapi *ClusterRPCAPI) Alerts(ctx context.Context, in struct{}, out *map[string]api.Alert) error {
430+
func (rpcapi *ClusterRPCAPI) Alerts(ctx context.Context, in struct{}, out *[]api.Alert) error {
431431
alerts := rpcapi.c.Alerts()
432432
*out = alerts
433433
return nil

test/rpc_api_mock.go

+11-6
Original file line numberDiff line numberDiff line change
@@ -341,12 +341,17 @@ func (mock *mockCluster) SendInformerMetric(ctx context.Context, in struct{}, ou
341341
return nil
342342
}
343343

344-
func (mock *mockCluster) Alerts(ctx context.Context, in struct{}, out *map[string]api.Alert) error {
345-
*out = map[string]api.Alert{
346-
peer.IDB58Encode(PeerID2): api.Alert{
347-
Peer: PeerID2,
348-
MetricName: "ping",
349-
Expiry: time.Now().Add(30 * time.Second).UnixNano(),
344+
func (mock *mockCluster) Alerts(ctx context.Context, in struct{}, out *[]api.Alert) error {
345+
*out = []api.Alert{
346+
api.Alert{
347+
Metric: api.Metric{
348+
Name: "ping",
349+
Peer: PeerID2,
350+
Expire: time.Now().Add(-30 * time.Second).UnixNano(),
351+
Valid: true,
352+
ReceivedAt: time.Now().Add(-60 * time.Second).UnixNano(),
353+
},
354+
TriggeredAt: time.Now(),
350355
},
351356
}
352357
return nil

0 commit comments

Comments
 (0)