Skip to content

Commit 7727318

Browse files
authored
feat(indexer,gateway): add and unify service metrics (#517)
* indexer: add core operational metrics
  - add chain head/processed gauges and websocket reconnect counter
  - add commit and tree-sync latency/size histograms
  - add HTTP latency middleware with route/status-class tags
  - register metrics on startup and wire instrumentation through indexer flows
* indexer: remove ws reconnect metric
  Drop indexer.ws.reconnects and related reconnect counter updates. Keep chain/head, processed, tree sync, commit, and HTTP latency metrics.
* gateway: unify metrics API and HTTP latency naming
  Refactor gateway metrics emission through helper functions in metrics.rs, register metric descriptions at startup, and align HTTP latency naming/tags with the indexer approach (gateway.http.latency_ms, route, status_class).
* metrics: remove service-name prefixes
  - drop gateway./indexer. prefixes from metric keys
  - keep namespaced paths without service prefix (e.g. http.latency_ms)
  - clean up indexer middleware by removing no-op path normalizer
1 parent 2e6300c commit 7727318

File tree

18 files changed

+338
-88
lines changed

18 files changed

+338
-88
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

services/gateway/src/batch_policy.rs

Lines changed: 9 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,7 @@ use alloy::{
88
providers::{DynProvider, Provider},
99
};
1010

11-
use crate::{
12-
config::BatchPolicyConfig,
13-
metrics::{
14-
METRICS_BATCH_POLICY_COST_SCORE, METRICS_BATCH_POLICY_DEFER,
15-
METRICS_BATCH_POLICY_FORCE_SEND, METRICS_BATCH_POLICY_TARGET_SIZE,
16-
METRICS_BATCH_POLICY_URGENCY_SCORE,
17-
},
18-
};
11+
use crate::{config::BatchPolicyConfig, metrics};
1912

2013
/// Aggregated queued backlog pressure from Redis.
2114
#[derive(Debug, Clone, Copy, Default)]
@@ -272,24 +265,19 @@ impl BatchPolicyEngine {
272265

273266
/// Emits policy metrics for a decision.
274267
pub fn record_policy_metrics(batch_type: &'static str, decision: &PolicyDecision) {
275-
::metrics::histogram!(METRICS_BATCH_POLICY_COST_SCORE, "type" => batch_type)
276-
.record(decision.cost_score);
277-
::metrics::histogram!(METRICS_BATCH_POLICY_URGENCY_SCORE, "type" => batch_type)
278-
.record(decision.urgency_score);
279-
::metrics::histogram!(METRICS_BATCH_POLICY_TARGET_SIZE, "type" => batch_type)
280-
.record(decision.target_batch_size as f64);
268+
metrics::record_policy_scores(
269+
batch_type,
270+
decision.cost_score,
271+
decision.urgency_score,
272+
decision.target_batch_size,
273+
);
281274

282275
if decision.force_send {
283-
::metrics::counter!(METRICS_BATCH_POLICY_FORCE_SEND, "type" => batch_type).increment(1);
276+
metrics::increment_policy_force_send(batch_type);
284277
}
285278

286279
if !decision.should_send {
287-
::metrics::counter!(
288-
METRICS_BATCH_POLICY_DEFER,
289-
"type" => batch_type,
290-
"reason" => decision.reason.as_str()
291-
)
292-
.increment(1);
280+
metrics::increment_policy_defer(batch_type, decision.reason.as_str());
293281
}
294282
}
295283

services/gateway/src/create_batcher.rs

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,7 @@ use crate::{
55
batch_policy::BaseFeeCache,
66
config::BatchPolicyConfig,
77
error::parse_contract_error,
8-
metrics::{
9-
METRICS_BATCH_FAILURE, METRICS_BATCH_LATENCY_MS, METRICS_BATCH_SIZE,
10-
METRICS_BATCH_SUBMITTED, METRICS_BATCH_SUCCESS,
11-
},
8+
metrics,
129
policy_batcher::{PolicyBatchLoopRunner, TimedEnvelope},
1310
request_tracker::BacklogScope,
1411
};
@@ -92,8 +89,7 @@ impl CreateBatcherRunner {
9289
let batch_size = batch.len();
9390
let ids: Vec<String> = batch.iter().map(|env| env.id.clone()).collect();
9491

95-
::metrics::counter!(METRICS_BATCH_SUBMITTED, "type" => "create").increment(1);
96-
::metrics::histogram!(METRICS_BATCH_SIZE, "type" => "create").record(batch_size as f64);
92+
metrics::record_batch_submitted("create", batch_size);
9793

9894
self.tracker
9995
.set_status_batch(&ids, GatewayRequestState::Batching)
@@ -122,9 +118,7 @@ impl CreateBatcherRunner {
122118
match call.send().await {
123119
Ok(builder) => {
124120
let latency_ms = start.elapsed().as_millis() as f64;
125-
::metrics::histogram!(METRICS_BATCH_LATENCY_MS, "type" => "create")
126-
.record(latency_ms);
127-
::metrics::counter!(METRICS_BATCH_SUCCESS, "type" => "create").increment(1);
121+
metrics::record_batch_result("create", true, latency_ms);
128122

129123
let hash = format!("0x{:x}", builder.tx_hash());
130124
self.tracker
@@ -164,9 +158,7 @@ impl CreateBatcherRunner {
164158
}
165159
Err(err) => {
166160
let latency_ms = start.elapsed().as_millis() as f64;
167-
::metrics::histogram!(METRICS_BATCH_LATENCY_MS, "type" => "create")
168-
.record(latency_ms);
169-
::metrics::counter!(METRICS_BATCH_FAILURE, "type" => "create").increment(1);
161+
metrics::record_batch_result("create", false, latency_ms);
170162

171163
tracing::error!(error = %err, "create batch send failed");
172164
let error_str = err.to_string();

services/gateway/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ mod batcher;
1717
mod config;
1818
mod create_batcher;
1919
mod error;
20-
mod metrics;
20+
pub mod metrics;
2121
pub mod nonce;
2222
mod ops_batcher;
2323
pub mod orphan_sweeper;

services/gateway/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ async fn main() -> GatewayResult<()> {
77
let env_path = Path::new(env!("CARGO_MANIFEST_DIR")).join(".env"); // load env vars in the root of this service
88
let _ = dotenvy::from_path(&env_path);
99
let _guard = telemetry_batteries::init();
10+
world_id_gateway::metrics::describe_metrics();
1011

1112
let _ = dotenvy::dotenv();
1213
tracing::info!("Starting world-id-gateway");

services/gateway/src/metrics.rs

Lines changed: 157 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,162 @@
1-
//! Metrics definitions for the world-id-gateway.
1+
//! Metrics definitions and helpers for the world-id-gateway.
22
3-
// Request metrics
4-
pub const METRICS_REQUESTS_LATENCY_MS: &str = "gateway.requests.latency_ms";
3+
// HTTP metrics
4+
pub const METRICS_HTTP_LATENCY_MS: &str = "http.latency_ms";
55

66
// Root cache metrics
7-
pub const METRICS_ROOT_CACHE_HITS: &str = "gateway.root_cache.hits";
8-
pub const METRICS_ROOT_CACHE_MISSES: &str = "gateway.root_cache.misses";
7+
pub const METRICS_ROOT_CACHE_HITS: &str = "root_cache.hits";
8+
pub const METRICS_ROOT_CACHE_MISSES: &str = "root_cache.misses";
99

1010
// Batcher metrics
11-
pub const METRICS_BATCH_SUBMITTED: &str = "gateway.batch.submitted";
12-
pub const METRICS_BATCH_SIZE: &str = "gateway.batch.size";
13-
pub const METRICS_BATCH_LATENCY_MS: &str = "gateway.batch.latency_ms";
14-
pub const METRICS_BATCH_SUCCESS: &str = "gateway.batch.success";
15-
pub const METRICS_BATCH_FAILURE: &str = "gateway.batch.failure";
16-
pub const METRICS_BATCH_POLICY_COST_SCORE: &str = "gateway.batch.policy.cost_score";
17-
pub const METRICS_BATCH_POLICY_URGENCY_SCORE: &str = "gateway.batch.policy.urgency_score";
18-
pub const METRICS_BATCH_POLICY_DEFER: &str = "gateway.batch.policy.defer";
19-
pub const METRICS_BATCH_POLICY_FORCE_SEND: &str = "gateway.batch.policy.force_send";
20-
pub const METRICS_BATCH_POLICY_TARGET_SIZE: &str = "gateway.batch.policy.target_size";
11+
pub const METRICS_BATCH_SUBMITTED: &str = "batch.submitted";
12+
pub const METRICS_BATCH_SIZE: &str = "batch.size";
13+
pub const METRICS_BATCH_LATENCY_MS: &str = "batch.latency_ms";
14+
pub const METRICS_BATCH_SUCCESS: &str = "batch.success";
15+
pub const METRICS_BATCH_FAILURE: &str = "batch.failure";
16+
pub const METRICS_BATCH_POLICY_COST_SCORE: &str = "batch.policy.cost_score";
17+
pub const METRICS_BATCH_POLICY_URGENCY_SCORE: &str = "batch.policy.urgency_score";
18+
pub const METRICS_BATCH_POLICY_DEFER: &str = "batch.policy.defer";
19+
pub const METRICS_BATCH_POLICY_FORCE_SEND: &str = "batch.policy.force_send";
20+
pub const METRICS_BATCH_POLICY_TARGET_SIZE: &str = "batch.policy.target_size";
21+
22+
/// Registers a unit and a human-readable description for every metric key
/// declared in this module.
///
/// Each `METRICS_*` constant is described exactly once, using the same
/// instrument kind (counter or histogram) that the recording helpers in this
/// module emit for it.
///
/// NOTE(review): presumably this must run after the metrics recorder has been
/// installed for the descriptions to take effect — confirm against the
/// recorder in use.
pub fn describe_metrics() {
23+
// HTTP request latency.
::metrics::describe_histogram!(
24+
METRICS_HTTP_LATENCY_MS,
25+
::metrics::Unit::Milliseconds,
26+
"Gateway HTTP request latency in milliseconds."
27+
);
28+
29+
// Root cache hit/miss counters.
::metrics::describe_counter!(
30+
METRICS_ROOT_CACHE_HITS,
31+
::metrics::Unit::Count,
32+
"Number of root cache hits."
33+
);
34+
::metrics::describe_counter!(
35+
METRICS_ROOT_CACHE_MISSES,
36+
::metrics::Unit::Count,
37+
"Number of root cache misses."
38+
);
39+
40+
// Batch submission metrics.
::metrics::describe_counter!(
41+
METRICS_BATCH_SUBMITTED,
42+
::metrics::Unit::Count,
43+
"Number of submitted batches."
44+
);
45+
::metrics::describe_histogram!(
46+
METRICS_BATCH_SIZE,
47+
::metrics::Unit::Count,
48+
"Number of requests per submitted batch."
49+
);
50+
::metrics::describe_histogram!(
51+
METRICS_BATCH_LATENCY_MS,
52+
::metrics::Unit::Milliseconds,
53+
"Batch submission latency in milliseconds."
54+
);
55+
::metrics::describe_counter!(
56+
METRICS_BATCH_SUCCESS,
57+
::metrics::Unit::Count,
58+
"Number of successfully submitted batches."
59+
);
60+
::metrics::describe_counter!(
61+
METRICS_BATCH_FAILURE,
62+
::metrics::Unit::Count,
63+
"Number of failed batch submissions."
64+
);
65+
66+
// Batch policy metrics.
// NOTE(review): the two score histograms are tagged `Unit::Count`; confirm
// that is the intended unit for a score value.
::metrics::describe_histogram!(
67+
METRICS_BATCH_POLICY_COST_SCORE,
68+
::metrics::Unit::Count,
69+
"Batch policy cost score."
70+
);
71+
::metrics::describe_histogram!(
72+
METRICS_BATCH_POLICY_URGENCY_SCORE,
73+
::metrics::Unit::Count,
74+
"Batch policy urgency score."
75+
);
76+
::metrics::describe_histogram!(
77+
METRICS_BATCH_POLICY_TARGET_SIZE,
78+
::metrics::Unit::Count,
79+
"Target batch size chosen by batch policy."
80+
);
81+
::metrics::describe_counter!(
82+
METRICS_BATCH_POLICY_FORCE_SEND,
83+
::metrics::Unit::Count,
84+
"Number of forced sends triggered by policy."
85+
);
86+
::metrics::describe_counter!(
87+
METRICS_BATCH_POLICY_DEFER,
88+
::metrics::Unit::Count,
89+
"Number of policy deferrals by reason."
90+
);
91+
}
92+
93+
/// Records one HTTP request latency sample in the `METRICS_HTTP_LATENCY_MS`
/// histogram.
///
/// The sample carries two labels:
/// - `route`: `path` after it has been passed through `normalize_path`.
/// - `status_class`: `status` bucketed by its hundreds digit into `"1xx"` ..
///   `"5xx"`; any code outside 100..=599 is labelled `"other"`.
///
/// `latency_ms` is recorded as given; the metric name implies milliseconds.
pub fn record_http_latency_ms(path: &str, status: u16, latency_ms: f64) {
94+
// Integer division keeps only the hundreds digit (e.g. 404 -> 4).
let status_class = match status / 100 {
95+
1 => "1xx",
96+
2 => "2xx",
97+
3 => "3xx",
98+
4 => "4xx",
99+
5 => "5xx",
100+
_ => "other",
101+
};
102+
103+
::metrics::histogram!(
104+
METRICS_HTTP_LATENCY_MS,
105+
"route" => normalize_path(path),
106+
"status_class" => status_class
107+
)
108+
.record(latency_ms);
109+
}
110+
111+
/// Increments the unlabelled `METRICS_ROOT_CACHE_HITS` counter by one.
pub fn increment_root_cache_hit() {
112+
::metrics::counter!(METRICS_ROOT_CACHE_HITS).increment(1);
113+
}
114+
115+
/// Increments the unlabelled `METRICS_ROOT_CACHE_MISSES` counter by one.
pub fn increment_root_cache_miss() {
116+
::metrics::counter!(METRICS_ROOT_CACHE_MISSES).increment(1);
117+
}
118+
119+
/// Records that a batch has been handed off for submission.
///
/// Increments `METRICS_BATCH_SUBMITTED` by one and records `batch_size` in the
/// `METRICS_BATCH_SIZE` histogram. Both are labelled `type = batch_type`.
pub fn record_batch_submitted(batch_type: &'static str, batch_size: usize) {
120+
::metrics::counter!(METRICS_BATCH_SUBMITTED, "type" => batch_type).increment(1);
121+
// Histograms take `f64` samples, hence the cast.
::metrics::histogram!(METRICS_BATCH_SIZE, "type" => batch_type).record(batch_size as f64);
122+
}
123+
124+
/// Records the outcome of a batch submission attempt.
///
/// `latency_ms` is always recorded in `METRICS_BATCH_LATENCY_MS`, regardless of
/// outcome. Then exactly one of `METRICS_BATCH_SUCCESS` or
/// `METRICS_BATCH_FAILURE` is incremented depending on `success`. All three
/// metrics are labelled `type = batch_type`.
pub fn record_batch_result(batch_type: &'static str, success: bool, latency_ms: f64) {
125+
// Latency is sampled for both successful and failed attempts.
::metrics::histogram!(METRICS_BATCH_LATENCY_MS, "type" => batch_type).record(latency_ms);
126+
127+
if success {
128+
::metrics::counter!(METRICS_BATCH_SUCCESS, "type" => batch_type).increment(1);
129+
} else {
130+
::metrics::counter!(METRICS_BATCH_FAILURE, "type" => batch_type).increment(1);
131+
}
132+
}
133+
134+
/// Records the numeric outputs of one batch-policy decision.
///
/// Writes one sample each to the cost-score, urgency-score and
/// target-batch-size histograms, all labelled `type = batch_type`.
pub fn record_policy_scores(
135+
batch_type: &'static str,
136+
cost_score: f64,
137+
urgency_score: f64,
138+
target_batch_size: usize,
139+
) {
140+
::metrics::histogram!(METRICS_BATCH_POLICY_COST_SCORE, "type" => batch_type).record(cost_score);
141+
::metrics::histogram!(METRICS_BATCH_POLICY_URGENCY_SCORE, "type" => batch_type)
142+
.record(urgency_score);
143+
::metrics::histogram!(METRICS_BATCH_POLICY_TARGET_SIZE, "type" => batch_type)
144+
// Histograms take `f64` samples, hence the cast.
.record(target_batch_size as f64);
145+
}
146+
147+
/// Increments `METRICS_BATCH_POLICY_FORCE_SEND` by one, labelled
/// `type = batch_type`.
pub fn increment_policy_force_send(batch_type: &'static str) {
148+
::metrics::counter!(METRICS_BATCH_POLICY_FORCE_SEND, "type" => batch_type).increment(1);
149+
}
150+
151+
/// Increments `METRICS_BATCH_POLICY_DEFER` by one, labelled with
/// `type = batch_type` and `reason = reason`.
pub fn increment_policy_defer(batch_type: &'static str, reason: &'static str) {
152+
::metrics::counter!(METRICS_BATCH_POLICY_DEFER, "type" => batch_type, "reason" => reason)
153+
.increment(1);
154+
}
155+
156+
/// Maps a request path to the value used for the `route` metric label.
///
/// Any path beginning with `/status/` collapses to the fixed string
/// `"/status/:id"`; every other path is returned unchanged as an owned
/// `String`.
///
/// NOTE(review): paths other than `/status/...` are used verbatim as label
/// values — confirm no other route carries a dynamic segment, otherwise label
/// cardinality grows with the number of distinct paths seen.
fn normalize_path(path: &str) -> String {
157+
// Replace dynamic segments like /status/{id} with /status/:id
158+
if path.starts_with("/status/") {
159+
return "/status/:id".to_string();
160+
}
161+
path.to_string()
162+
}

services/gateway/src/ops_batcher.rs

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,7 @@ use crate::{
99
batch_policy::BaseFeeCache,
1010
config::BatchPolicyConfig,
1111
error::parse_contract_error,
12-
metrics::{
13-
METRICS_BATCH_FAILURE, METRICS_BATCH_LATENCY_MS, METRICS_BATCH_SIZE,
14-
METRICS_BATCH_SUBMITTED, METRICS_BATCH_SUCCESS,
15-
},
12+
metrics,
1613
policy_batcher::{PolicyBatchLoopRunner, TimedEnvelope},
1714
request_tracker::BacklogScope,
1815
};
@@ -95,8 +92,7 @@ impl OpsBatcherRunner {
9592
let batch_size = batch.len();
9693
let ids: Vec<String> = batch.iter().map(|env| env.id.clone()).collect();
9794

98-
::metrics::counter!(METRICS_BATCH_SUBMITTED, "type" => "ops").increment(1);
99-
::metrics::histogram!(METRICS_BATCH_SIZE, "type" => "ops").record(batch_size as f64);
95+
metrics::record_batch_submitted("ops", batch_size);
10096

10197
self.tracker
10298
.set_status_batch(&ids, GatewayRequestState::Batching)
@@ -116,8 +112,7 @@ impl OpsBatcherRunner {
116112
match res {
117113
Ok(builder) => {
118114
let latency_ms = start.elapsed().as_millis() as f64;
119-
::metrics::histogram!(METRICS_BATCH_LATENCY_MS, "type" => "ops").record(latency_ms);
120-
::metrics::counter!(METRICS_BATCH_SUCCESS, "type" => "ops").increment(1);
115+
metrics::record_batch_result("ops", true, latency_ms);
121116

122117
let hash = format!("0x{:x}", builder.tx_hash());
123118
self.tracker
@@ -154,8 +149,7 @@ impl OpsBatcherRunner {
154149
}
155150
Err(e) => {
156151
let latency_ms = start.elapsed().as_millis() as f64;
157-
::metrics::histogram!(METRICS_BATCH_LATENCY_MS, "type" => "ops").record(latency_ms);
158-
::metrics::counter!(METRICS_BATCH_FAILURE, "type" => "ops").increment(1);
152+
metrics::record_batch_result("ops", false, latency_ms);
159153

160154
tracing::warn!(error = %e, "multicall3 send failed");
161155
let error_str = e.to_string();

services/gateway/src/routes/is_valid_root.rs

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,4 @@
1-
use crate::{
2-
error::GatewayErrorResponse,
3-
metrics::{METRICS_ROOT_CACHE_HITS, METRICS_ROOT_CACHE_MISSES},
4-
request::Registry,
5-
types::AppState,
6-
};
1+
use crate::{error::GatewayErrorResponse, metrics, request::Registry, types::AppState};
72
use alloy::primitives::U256;
83
use axum::{Json, extract::State};
94
use std::{
@@ -90,10 +85,10 @@ pub(crate) async fn is_valid_root(
9085
) -> Result<Json<IsValidRootResponse>, GatewayErrorResponse> {
9186
let root = req_u256("root", &q.root)?;
9287
if is_cached_root(&state, root).await {
93-
::metrics::counter!(METRICS_ROOT_CACHE_HITS).increment(1);
88+
metrics::increment_root_cache_hit();
9489
return Ok(Json(IsValidRootResponse { valid: true }));
9590
}
96-
::metrics::counter!(METRICS_ROOT_CACHE_MISSES).increment(1);
91+
metrics::increment_root_cache_miss();
9792
let now = now_timestamp()?;
9893

9994
let valid = state

0 commit comments

Comments
 (0)