Commit c62f312

feat: add support for app metrics
1 parent 1849852 commit c62f312

6 files changed: +312 -28 lines changed
cmd/app.go (+78 -3)
@@ -17,9 +17,10 @@ type App struct {
     metricsMgr *metrics.Manager
     nseMgr     *nse.Manager

-    hardwareSvc *hardwareService
-    dbSvc       *dbService
-    networkSvc  *networkService
+    hardwareSvc    *hardwareService
+    dbSvc          *dbService
+    networkSvc     *networkService
+    applicationSvc *applicationService
 }

 type Opts struct {
@@ -43,6 +44,11 @@ type networkService struct {
     queries map[string]string
 }

+type applicationService struct {
+    hosts   []string
+    queries map[string]string
+}
+
 // fetchHWMetrics fetches hardware metrics from the Prometheus HTTP API.
 func (app *App) fetchHWMetrics() (map[string]models.HWPromResp, error) {
     hwMetrics := make(map[string]models.HWPromResp)
@@ -178,6 +184,53 @@ func (app *App) fetchNetworkMetrics() (map[string]models.NetworkPromResp, error)
     return networkMetrics, nil
 }

+// fetchApplicationMetrics fetches application metrics from the Prometheus HTTP API.
+func (app *App) fetchApplicationMetrics() (map[string]models.AppPromResp, error) {
+    appMetrics := make(map[string]models.AppPromResp)
+
+    for _, host := range app.applicationSvc.hosts {
+        appMetricsResp := models.AppPromResp{}
+        for metric, query := range app.applicationSvc.queries {
+            switch metric {
+            case "throughput":
+                // NOTE: The query doesn't have a host parameter as it aggregates across all hosts.
+                value, err := app.metricsMgr.Query(query)
+                if err != nil {
+                    app.lo.Error("Failed to query Prometheus",
+                        "host", host,
+                        "metric", metric,
+                        "error", err)
+                    continue
+                }
+                appMetricsResp.Throughput = value
+
+            case "failure_count":
+                // NOTE: The query doesn't have a host parameter as it aggregates across all hosts.
+                value, err := app.metricsMgr.Query(query)
+                if err != nil {
+                    app.lo.Error("Failed to query Prometheus",
+                        "host", host,
+                        "metric", metric,
+                        "error", err)
+                    continue
+                }
+                appMetricsResp.FailureCount = value
+
+            default:
+                app.lo.Warn("Unknown application metric queried",
+                    "host", host,
+                    "metric", metric)
+            }
+        }
+
+        // Add host metrics to the map.
+        appMetrics[host] = appMetricsResp
+        app.lo.Debug("fetched metrics", "host", host, "data", appMetricsResp)
+    }
+
+    return appMetrics, nil
+}
+
 // pushHWMetrics pushes hardware metrics to the NSE.
 func (app *App) pushHWMetrics(host string, data models.HWPromResp) error {
     for i := 0; i < app.opts.MaxRetries; i++ {
@@ -249,3 +302,25 @@ func (app *App) pushNetworkMetrics(host string, data models.NetworkPromResp) err
     }
     return nil
 }
+
+func (app *App) pushApplicationMetrics(host string, data models.AppPromResp) error {
+    for i := 0; i < app.opts.MaxRetries; i++ {
+        if err := app.nseMgr.PushAppMetrics(host, data); err != nil {
+            if i < app.opts.MaxRetries-1 {
+                app.lo.Error("Failed to push application metrics to NSE. Retrying...",
+                    "host", host,
+                    "attempt", i+1,
+                    "error", err)
+                time.Sleep(app.opts.RetryInterval)
+                continue
+            }
+            app.lo.Error("Failed to push application metrics to NSE after max retries",
+                "host", host,
+                "max_retries", app.opts.MaxRetries,
+                "error", err)
+            return err
+        }
+        break
+    }
+    return nil
+}
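
The fetch/push code above relies on two pieces defined in files of this commit that are not shown in this section: the models.AppPromResp response type and the NSE manager's PushAppMetrics method. The sketch below is for orientation only; the field names and the method shape are taken from the diff above, while the float64 type and the json tags are assumptions.

// Sketch only: the real definitions presumably live in the models and nse
// packages changed elsewhere in this commit. float64 and the json tags are
// assumptions; the field names come from the cmd/app.go diff above.
package models

// AppPromResp holds the application-level metrics fetched from Prometheus.
type AppPromResp struct {
    Throughput   float64 `json:"throughput"`
    FailureCount float64 `json:"failure_count"`
}

// Assumed shape of the NSE manager method called by pushApplicationMetrics:
//   func (m *Manager) PushAppMetrics(host string, data models.AppPromResp) error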

cmd/init.go (+28)
@@ -207,6 +207,34 @@ func initNetworkSvc(ko *koanf.Koanf) (*networkService, error) {
     }, nil
 }

+func initApplicationSvc(ko *koanf.Koanf) (*applicationService, error) {
+    var (
+        queries = map[string]string{
+            "failure_count": ko.MustString("metrics.application.failure_count"),
+            "throughput":    ko.MustString("metrics.application.throughput"),
+        }
+        hosts   = ko.Strings("metrics.application.hosts")
+        cfgPath = ko.String("prometheus.config_path")
+    )
+
+    if len(hosts) == 0 && cfgPath != "" {
+        defaultHosts, err := initDefaultHosts(ko, cfgPath)
+        if err != nil {
+            return nil, err
+        }
+        hosts = defaultHosts
+    }
+
+    if len(hosts) == 0 {
+        return nil, fmt.Errorf("no hosts found in the config")
+    }
+
+    return &applicationService{
+        hosts:   hosts,
+        queries: queries,
+    }, nil
+}
+
 // initNSEManager initialises the NSE manager.
 func initNSEManager(ko *koanf.Koanf, lo *slog.Logger) (*nse.Manager, error) {
     nseMgr, err := nse.New(lo, nse.Opts{

cmd/main.go (+54 -7)
@@ -52,6 +52,13 @@ func main() {
         exit()
     }

+    // Load queries for application metrics.
+    applicationSvc, err := initApplicationSvc(ko)
+    if err != nil {
+        lo.Error("failed to init application service", "error", err)
+        exit()
+    }
+
     // Initialise the NSE manager.
     nseMgr, err := initNSEManager(ko, lo)
     if err != nil {
@@ -61,13 +68,14 @@ func main() {

     // Init the app.
     app := &App{
-        lo:          lo,
-        opts:        initOpts(ko),
-        metricsMgr:  metricsMgr,
-        nseMgr:      nseMgr,
-        hardwareSvc: hardwareSvc,
-        dbSvc:       dbSvc,
-        networkSvc:  networkSvc,
+        lo:             lo,
+        opts:           initOpts(ko),
+        metricsMgr:     metricsMgr,
+        nseMgr:         nseMgr,
+        hardwareSvc:    hardwareSvc,
+        dbSvc:          dbSvc,
+        networkSvc:     networkSvc,
+        applicationSvc: applicationSvc,
     }

     // Create a new context which is cancelled when `SIGINT`/`SIGTERM` is received.
@@ -85,6 +93,9 @@ func main() {
     wg.Add(1)
     go app.syncNetworkMetricsWorker(ctx, wg)

+    wg.Add(1)
+    go app.syncApplicationMetricsWorker(ctx, wg)
+
     // Listen on the close channel indefinitely until a
     // `SIGINT` or `SIGTERM` is received.
     <-ctx.Done()
@@ -201,3 +212,39 @@ func (app *App) syncNetworkMetricsWorker(ctx context.Context, wg *sync.WaitGroup
         }
     }
 }
+
+// Add a new worker function for the application service.
+func (app *App) syncApplicationMetricsWorker(ctx context.Context, wg *sync.WaitGroup) {
+    defer wg.Done()
+
+    ticker := time.NewTicker(app.opts.SyncInterval)
+    defer ticker.Stop()
+
+    app.lo.Info("Starting application metrics worker", "interval", app.opts.SyncInterval)
+    for {
+        select {
+        case <-ticker.C:
+            data, err := app.fetchApplicationMetrics()
+            if err != nil {
+                app.lo.Error("Failed to fetch application metrics", "error", err)
+                continue
+            }
+
+            // Push to upstream LAMA APIs.
+            for host, hostData := range data {
+                if err := app.pushApplicationMetrics(host, hostData); err != nil {
+                    app.lo.Error("Failed to push application metrics to NSE", "host", host, "error", err)
+                    continue
+                }
+
+                // FIXME: Currently the LAMA API does not support multiple hosts.
+                // Once we've pushed the data for the first host, break the loop.
+                // Once the LAMA API supports multiple hosts, remove this.
+                break
+            }
+        case <-ctx.Done():
+            app.lo.Info("Stopping application metrics worker")
+            return
+        }
+    }
+}

config.sample.toml (+23 -18)
@@ -1,40 +1,45 @@
 [app]
-log_level = "debug" # To enable debug logging, level should be `debug`.
-sync_interval = "5m" # Interval at which the app should fetch data from metrics store.
+log_level = "debug"   # To enable debug logging, level should be `debug`.
+max_retries = 3       # Maximum number of retries for a failed request.
 retry_interval = "5s" # Interval at which the app should retry if the previous request failed.
-max_retries = 3 # Maximum number of retries for a failed request.
+sync_interval = "5m"  # Interval at which the app should fetch data from metrics store.

 [lama.nse]
+exchange_id = 1 # 1=National Stock Exchange
+idle_timeout = "5m" # Idle timeout for HTTP requests
+login_id = "redacted"
+member_id = "redacted"
+password = "redacted"
+timeout = "30s" # Timeout for HTTP requests
 url = "https://lama.nse.internal" # Endpoint for NSE LAMA API Gateway
-login_id = "redacted"
-member_id = "redacted"
-password = "redacted"
-timeout = "30s" # Timeout for HTTP requests
-idle_timeout = "5m" # Idle timeout for HTTP requests
-exchange_id = 1 # 1=National Stock Exchange

 [prometheus]
-endpoint = "http://prometheus:9090" # Endpoint for Prometheus API
-query_path = "/api/v1/query" # Endpoint for Prometheus query API
-username = "redacted" # HTTP Basic Auth username
-password = "redacted" # HTTP Basic Auth password
-timeout = "10s" # Timeout for HTTP requests
-idle_timeout = "5m" # Idle timeout for HTTP requests
-max_idle_conns = 10
 config_path = "/etc/prometheus/prometheus.yml" # Path to Prometheus config file. This is used to load a list of hosts to fetch metrics for.
+endpoint = "http://prometheus:9090" # Endpoint for Prometheus API
+idle_timeout = "5m" # Idle timeout for HTTP requests
+max_idle_conns = 10
+password = "redacted" # HTTP Basic Auth password
+query_path = "/api/v1/query" # Endpoint for Prometheus query API
+timeout = "10s" # Timeout for HTTP requests
+username = "redacted" # HTTP Basic Auth username

 [metrics.hardware] # Define Prometheus queries for hardware metrics
 # List of hosts to fetch metrics for. Keep this empty to fetch metrics for all hosts defined in `prometheus.config_path` file.
-hosts = []
 cpu = '100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle", hostname="%s"}[5m])))'
-memory = '(1 - ((node_memory_MemFree_bytes{hostname="%s"} + node_memory_Buffers_bytes{hostname="%s"} + node_memory_Cached_bytes{hostname="%s"}) / node_memory_MemTotal_bytes{hostname="%s"})) * 100'
 disk = '100 - ((node_filesystem_avail_bytes{hostname="%s",device!~"rootfs"} * 100) / node_filesystem_size_bytes{hostname="%s",device!~"rootfs"})'
+hosts = []
+memory = '(1 - ((node_memory_MemFree_bytes{hostname="%s"} + node_memory_Buffers_bytes{hostname="%s"} + node_memory_Cached_bytes{hostname="%s"}) / node_memory_MemTotal_bytes{hostname="%s"})) * 100'
 uptime = '(node_time_seconds{hostname="%s"} - node_boot_time_seconds{hostname="%s"}) / 60'

 [metrics.database] # Define Prometheus queries for db metrics
 hosts = []
 status = 'up{hostname="%s"}'

 [metrics.network]
+hosts = []
 packet_errors = 'sum(rate(node_network_receive_errs_total{hostname="%s"}[5m])) + sum(rate(node_network_transmit_errs_total{hostname="%s"}[5m]))'
+
+[metrics.application]
+failure_count = 'sum(sum without (hostname, instance, server) (rate(haproxy_server_http_responses_total{job="my-app",code="5xx",proxy="my-backend"}[5m]))) by (code)'
 hosts = []
+throughput = 'sum(sum without (hostname, instance, server) (rate(haproxy_server_http_responses_total{job="my-app",proxy="my-backend"}[5m]))) by (proxy)'