From ba4b96ddd0a06a390266702b7e4c9221d39319fb Mon Sep 17 00:00:00 2001 From: Toby Lawrence Date: Thu, 16 Jan 2025 12:32:05 -0500 Subject: [PATCH] experiment: expose ability to capture jemalloc heap profiles in pprof format from admin API --- .cargo/config.toml | 2 +- Cargo.lock | 92 +++++++++++++++++++++++++++++++- Cargo.toml | 1 + LICENSE-3rdparty.csv | 7 +++ bin/agent-data-plane/Cargo.toml | 10 ++-- bin/agent-data-plane/src/main.rs | 3 +- lib/saluki-app/Cargo.toml | 3 +- lib/saluki-app/src/memory.rs | 41 ++++++++++++++ 8 files changed, 148 insertions(+), 11 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index afc3c4a6..b5e3c7b3 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -2,4 +2,4 @@ rustflags = ["--cfg", "tokio_unstable"] [env] -JEMALLOC_SYS_WITH_MALLOC_CONF = "abort_conf:true,max_background_threads:1,narenas:1,tcache:false,thp:never,oversize_threshold:32768,dirty_decay_ms:1000,muzzy_decay_ms:0" +JEMALLOC_SYS_WITH_MALLOC_CONF = "abort_conf:true,prof:true,prof_active:true,lg_prof_sample:19,max_background_threads:1,narenas:1,tcache:false,thp:never,oversize_threshold:32768,dirty_decay_ms:1000,muzzy_decay_ms:0" diff --git a/Cargo.lock b/Cargo.lock index f860ab3d..5d1a3ff5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,6 +24,7 @@ dependencies = [ "async-trait", "bytesize", "chrono", + "jemalloc_pprof", "memory-accounting", "saluki-app", "saluki-components", @@ -42,7 +43,6 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-rustls", "tracing", ] @@ -1932,6 +1932,23 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +[[package]] +name = "jemalloc_pprof" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +dependencies = [ + "anyhow", + "libc", + "mappings", + "once_cell", + 
"pprof_util", + "tempfile", + "tikv-jemalloc-ctl", + "tokio", + "tracing", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -2140,6 +2157,19 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "mappings" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" +dependencies = [ + "anyhow", + "libc", + "once_cell", + "pprof_util", + "tracing", +] + [[package]] name = "matchers" version = "0.1.0" @@ -2356,6 +2386,30 @@ dependencies = [ "winapi", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + [[package]] name = "num-complex" version = "0.4.6" @@ -2380,6 +2434,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -2777,6 +2853,19 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "pprof_util" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +dependencies = [ + "anyhow", + "flate2", + "num", + "paste", + "prost 0.13.4", +] + [[package]] name = "ppv-lite86" version = "0.2.20" @@ -3402,6 +3491,7 @@ dependencies = [ "chrono-tz", "http 1.2.0", "iana-time-zone", + "jemalloc_pprof", "memory-accounting", "metrics", "metrics-util", diff --git a/Cargo.toml b/Cargo.toml index 4b8b4eaa..aa6f321e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,6 +66,7 @@ tracing = { version = "0.1", default-features = false, features = ["std"] } ahash = { version = "0.8", default-features = false, features = ["std", "runtime-rng"] } async-compression = { version = "0.4.13", default-features = false } bitmask-enum = { version = "2.2", default-features = false } +jemalloc_pprof = { version = "0.6", default-features = false } figment = { version = "0.10", default-features = false } hostname = { version = "0.4", default-features = false } http = { version = "1", default-features = false } diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index eff79ce6..68f3852e 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -140,6 +140,7 @@ is-terminal,https://github.com/sunfishcode/is-terminal,MIT,"softprops +jemalloc_pprof,https://github.com/polarsignals/rust-jemalloc-pprof,Apache-2.0,"Frederic Branczyk , Brennan Vincent " jobserver,https://github.com/rust-lang/jobserver-rs,MIT OR Apache-2.0,Alex Crichton js-sys,https://github.com/rustwasm/wasm-bindgen/tree/master/crates/js-sys,MIT OR Apache-2.0,The wasm-bindgen Developers jsonpath-rust,https://github.com/besok/jsonpath-rust,MIT,BorisZhguchev @@ -154,6 +155,7 @@ libm,https://github.com/rust-lang/libm,MIT AND (MIT OR Apache-2.0),Jorge Aparici linux-raw-sys,https://github.com/sunfishcode/linux-raw-sys,Apache-2.0 WITH 
LLVM-exception OR Apache-2.0 OR MIT,Dan Gohman litemap,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers log,https://github.com/rust-lang/log,MIT OR Apache-2.0,The Rust Project Developers +mappings,https://github.com/polarsignals/rust-jemalloc-pprof,Apache-2.0,The mappings Authors matchers,https://github.com/hawkw/matchers,MIT,Eliza Weisman matchit,https://github.com/ibraheemdev/matchit,MIT AND BSD-3-Clause,Ibraheem Ahmed matrixmultiply,https://github.com/bluss/matrixmultiply,MIT OR Apache-2.0,"bluss, R. Janis Goldschmidt" @@ -169,9 +171,13 @@ ndarray,https://github.com/rust-ndarray/ndarray,MIT OR Apache-2.0,"Ulrik Sverdru noisy_float,https://github.com/SergiusIW/noisy_float-rs,Apache-2.0,Matthew Michelotti nom,https://github.com/Geal/nom,MIT,contact@geoffroycouprie.com nu-ansi-term,https://github.com/nushell/nu-ansi-term,MIT,"ogham@bsago.me, Ryan Scheel (Havvy) , Josh Triplett , The Nushell Project Developers" +num,https://github.com/rust-num/num,MIT OR Apache-2.0,The Rust Project Developers +num-bigint,https://github.com/rust-num/num-bigint,MIT OR Apache-2.0,The Rust Project Developers num-complex,https://github.com/rust-num/num-complex,MIT OR Apache-2.0,The Rust Project Developers num-conv,https://github.com/jhpratt/num-conv,MIT OR Apache-2.0,Jacob Pratt num-integer,https://github.com/rust-num/num-integer,MIT OR Apache-2.0,The Rust Project Developers +num-iter,https://github.com/rust-num/num-iter,MIT OR Apache-2.0,The Rust Project Developers +num-rational,https://github.com/rust-num/num-rational,MIT OR Apache-2.0,The Rust Project Developers num-traits,https://github.com/rust-num/num-traits,MIT OR Apache-2.0,The Rust Project Developers num_threads,https://github.com/jhpratt/num_threads,MIT OR Apache-2.0,Jacob Pratt object,https://github.com/gimli-rs/object,Apache-2.0 OR MIT,The object Authors @@ -202,6 +208,7 @@ plotters,https://github.com/plotters-rs/plotters,MIT,Hao Hou 
+pprof_util,https://github.com/polarsignals/rust-jemalloc-pprof,Apache-2.0,The pprof_util Authors ppv-lite86,https://github.com/cryptocorrosion/cryptocorrosion,MIT OR Apache-2.0,The CryptoCorrosion Contributors prettyplease,https://github.com/dtolnay/prettyplease,MIT OR Apache-2.0,David Tolnay proc-macro2,https://github.com/dtolnay/proc-macro2,MIT OR Apache-2.0,"David Tolnay , Alex Crichton " diff --git a/bin/agent-data-plane/Cargo.toml b/bin/agent-data-plane/Cargo.toml index 87febcfb..7176efcd 100644 --- a/bin/agent-data-plane/Cargo.toml +++ b/bin/agent-data-plane/Cargo.toml @@ -12,6 +12,7 @@ fips = ["saluki-app/tls-fips"] [dependencies] async-trait = { workspace = true } bytesize = { workspace = true } +jemalloc_pprof = { workspace = true } memory-accounting = { workspace = true } saluki-app = { workspace = true, features = ["full"] } saluki-components = { workspace = true } @@ -27,19 +28,14 @@ saluki-metadata = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } stringtheory = { workspace = true } -tokio = { workspace = true, features = [ - "macros", - "rt", - "rt-multi-thread", - "signal", -] } -tokio-rustls = { workspace = true } +tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "signal"] } tracing = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tikv-jemallocator = { workspace = true, features = [ "background_threads", + "profiling", "unprefixed_malloc_on_supported_platforms", "stats", ] } diff --git a/bin/agent-data-plane/src/main.rs b/bin/agent-data-plane/src/main.rs index ba42c31e..f6cec7a5 100644 --- a/bin/agent-data-plane/src/main.rs +++ b/bin/agent-data-plane/src/main.rs @@ -11,7 +11,7 @@ use std::{ }; use memory_accounting::{ComponentBounds, ComponentRegistry}; -use saluki_app::{api::APIBuilder, logging::LoggingAPIHandler, prelude::*}; +use saluki_app::{api::APIBuilder, logging::LoggingAPIHandler, 
memory::MemoryProfilingAPIHandler, prelude::*}; use saluki_components::{ destinations::{ new_remote_agent_service, DatadogEventsServiceChecksConfiguration, DatadogMetricsConfiguration, @@ -123,6 +123,7 @@ async fn run(started: Instant, logging_api_handler: LoggingAPIHandler) -> Result .with_self_signed_tls() .with_grpc_service(new_remote_agent_service()) .with_handler(logging_api_handler) + .with_handler(MemoryProfilingAPIHandler) .with_optional_handler(env_provider.workload_api_handler()); // Run memory bounds validation to ensure that we can launch the topology with our configured memory limit, if any. diff --git a/lib/saluki-app/Cargo.toml b/lib/saluki-app/Cargo.toml index f6c1fb82..425351dc 100644 --- a/lib/saluki-app/Cargo.toml +++ b/lib/saluki-app/Cargo.toml @@ -10,7 +10,7 @@ default = [] full = ["api", "logging", "memory", "metrics", "tls"] api = ["dep:axum", "dep:saluki-api", "dep:saluki-error", "dep:saluki-io", "dep:tokio", "dep:tower", "dep:tracing"] logging = ["api", "dep:chrono", "dep:chrono-tz", "dep:iana-time-zone", "dep:serde", "dep:tracing", "dep:tracing-subscriber"] -memory = ["metrics", "dep:bytesize", "dep:memory-accounting", "dep:saluki-config", "dep:saluki-error", "dep:serde", "dep:tokio", "dep:tracing"] +memory = ["api", "metrics", "dep:bytesize", "dep:jemalloc_pprof", "dep:memory-accounting", "dep:saluki-config", "dep:saluki-error", "dep:serde", "dep:tokio", "dep:tracing"] metrics = ["dep:saluki-core", "dep:metrics", "dep:tokio"] tls = ["dep:saluki-error", "dep:saluki-tls"] tls-fips = ["saluki-tls?/fips"] @@ -22,6 +22,7 @@ chrono = { workspace = true, optional = true } chrono-tz = { workspace = true, optional = true } http = { workspace = true } iana-time-zone = { workspace = true, optional = true } +jemalloc_pprof = { workspace = true, default-features = false, optional = true } memory-accounting = { workspace = true, optional = true } metrics = { workspace = true, optional = true } metrics-util = { workspace = true, features = ["handles", 
"recency", "registry"], optional = true }
diff --git a/lib/saluki-app/src/memory.rs b/lib/saluki-app/src/memory.rs
index 69944433..7d67895e 100644
--- a/lib/saluki-app/src/memory.rs
+++ b/lib/saluki-app/src/memory.rs
@@ -13,6 +13,11 @@ use memory_accounting::{
     ComponentRegistry, MemoryGrant, MemoryLimiter, VerifiedBounds,
 };
 use metrics::{counter, gauge, Counter, Gauge, Level};
+use saluki_api::{
+    response::IntoResponse,
+    routing::{get, Router},
+    APIHandler, StatusCode,
+};
 use saluki_config::GenericConfiguration;
 use saluki_error::{generic_error, ErrorContext as _, GenericError};
 use serde::Deserialize;
@@ -316,3 +321,51 @@ impl CgroupMemoryParser {
         memory.parse::().ok()
     }
 }
+
+/// An API handler for memory profiling.
+///
+/// This handler exposes a single route -- `/debug/pprof/heap` -- which returns a jemalloc heap profile in pprof
+/// format. As one might expect, this handler should only be used when jemalloc is set as the global allocator for the
+/// application.
+pub struct MemoryProfilingAPIHandler;
+
+impl MemoryProfilingAPIHandler {
+    /// Captures a jemalloc heap profile and returns it in pprof format.
+    ///
+    /// # Errors
+    ///
+    /// Responds with `403 Forbidden` when the profiling controller is unavailable or profiling is not activated, and
+    /// with `500 Internal Server Error` when dumping the profile fails.
+    async fn pprof_handler() -> Result<impl IntoResponse, (StatusCode, String)> {
+        // `PROF_CTL` is optional; surface its absence as an error response instead of panicking inside the
+        // request handler.
+        let prof_ctl = jemalloc_pprof::PROF_CTL
+            .as_ref()
+            .ok_or_else(|| (StatusCode::FORBIDDEN, "heap profiling not available".to_string()))?;
+        let mut prof_ctl = prof_ctl.lock().await;
+        require_profiling_activated(&prof_ctl)?;
+
+        prof_ctl
+            .dump_pprof()
+            .map_err(|err| (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()))
+    }
+}
+
+impl APIHandler for MemoryProfilingAPIHandler {
+    type State = ();
+
+    fn generate_initial_state(&self) -> Self::State {}
+
+    fn generate_routes(&self) -> Router<Self::State> {
+        Router::new().route("/debug/pprof/heap", get(Self::pprof_handler))
+    }
+}
+
+/// Returns an error response unless heap profiling is currently activated.
+fn require_profiling_activated(prof_ctl: &jemalloc_pprof::JemallocProfCtl) -> Result<(), (StatusCode, String)> {
+    if prof_ctl.activated() {
+        Ok(())
+    } else {
+        Err((StatusCode::FORBIDDEN, "heap profiling not activated".into()))
+    }
+}