From 8db5ca74b53cbbb940e6a6c80862069b9c8748b1 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Tue, 29 Apr 2025 16:57:57 +0200 Subject: [PATCH 1/5] Heap profiler with leak tracking --- quickwit/Cargo.lock | 4 + quickwit/Cargo.toml | 1 + quickwit/quickwit-cli/Cargo.toml | 8 + quickwit/quickwit-cli/src/jemalloc.rs | 4 + quickwit/quickwit-common/Cargo.toml | 9 + quickwit/quickwit-common/src/alloc_tracker.rs | 232 ++++++++++++++++++ .../quickwit-common/src/jemalloc_profiled.rs | 202 +++++++++++++++ quickwit/quickwit-common/src/lib.rs | 4 + quickwit/quickwit-serve/Cargo.toml | 3 + .../src/developer_api/heap_prof.rs | 53 ++++ .../src/developer_api/heap_prof_disabled.rs | 29 +++ .../quickwit-serve/src/developer_api/mod.rs | 9 +- 12 files changed, 555 insertions(+), 3 deletions(-) create mode 100644 quickwit/quickwit-common/src/alloc_tracker.rs create mode 100644 quickwit/quickwit-common/src/jemalloc_profiled.rs create mode 100644 quickwit/quickwit-serve/src/developer_api/heap_prof.rs create mode 100644 quickwit/quickwit-serve/src/developer_api/heap_prof_disabled.rs diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index d7f4eee6c63..88e6f7ee978 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -6866,6 +6866,7 @@ dependencies = [ "anyhow", "async-speed-limit", "async-trait", + "backtrace", "bytesize", "coarsetime", "dyn-clone", @@ -6887,10 +6888,13 @@ dependencies = [ "regex", "serde", "serde_json", + "serial_test", "siphasher 0.3.11", "sysinfo", "tempfile", "thiserror 1.0.69", + "tikv-jemalloc-ctl", + "tikv-jemallocator", "tokio", "tokio-metrics", "tokio-stream", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 1f687c96d72..460dfb06756 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -85,6 +85,7 @@ assert-json-diff = "2" async-compression = { version = "0.4", features = ["tokio", "gzip"] } async-speed-limit = "0.4" async-trait = "0.1" +backtrace = "0.3" base64 = "0.22" binggan = { version = "0.14" } bytes = { version = "1", features = ["serde"] } diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml index 41a8fdce5e0..524e77fc8fd 100644 --- a/quickwit/quickwit-cli/Cargo.toml +++ b/quickwit/quickwit-cli/Cargo.toml @@ -81,6 +81,10 @@ quickwit-storage = { workspace = true, features = ["testsuite"] } [features] jemalloc = ["dep:tikv-jemalloc-ctl", "dep:tikv-jemallocator"] +jemalloc-profiled = [ + "quickwit-common/jemalloc-profiled", + "quickwit-serve/jemalloc-profiled" +] ci-test = [] pprof = ["quickwit-serve/pprof"] openssl-support = ["openssl-probe"] @@ -127,6 +131,10 @@ release-macos-feature-vendored-set = [ "quickwit-metastore/postgres", "quickwit-doc-mapper/multilang", ] +release-heap-profiled = [ + "release-feature-set", + "jemalloc-profiled" +] [package.metadata.cargo-machete] # used to enable the `multilang` feature diff --git a/quickwit/quickwit-cli/src/jemalloc.rs b/quickwit/quickwit-cli/src/jemalloc.rs index 71969f79909..e5223e5ee31 100644 --- a/quickwit/quickwit-cli/src/jemalloc.rs +++ b/quickwit/quickwit-cli/src/jemalloc.rs @@ -19,6 +19,10 @@ use tikv_jemallocator::Jemalloc; use tracing::error; #[global_allocator] +#[cfg(feature = "jemalloc-profiled")] +pub static GLOBAL: quickwit_common::jemalloc_profiled::JemallocProfiled = + quickwit_common::jemalloc_profiled::JemallocProfiled(Jemalloc); +#[cfg(not(feature = "jemalloc-profiled"))] pub static GLOBAL: Jemalloc = Jemalloc; const JEMALLOC_METRICS_POLLING_INTERVAL: Duration = Duration::from_secs(1); diff --git a/quickwit/quickwit-common/Cargo.toml 
b/quickwit/quickwit-common/Cargo.toml index 3abcd35f3d5..5f0e7f1375b 100644 --- a/quickwit/quickwit-common/Cargo.toml +++ b/quickwit/quickwit-common/Cargo.toml @@ -14,6 +14,7 @@ license.workspace = true anyhow = { workspace = true } async-speed-limit = { workspace = true } async-trait = { workspace = true } +backtrace = { workspace = true, optional = true } bytesize = { workspace = true } coarsetime = { workspace = true } dyn-clone = { workspace = true } @@ -37,6 +38,8 @@ siphasher = { workspace = true } sysinfo = { workspace = true } tempfile = { workspace = true } thiserror = { workspace = true } +tikv-jemallocator = { workspace = true, optional = true } +tikv-jemalloc-ctl = { workspace = true, optional = true } tokio = { workspace = true } tokio-metrics = { workspace = true } tokio-stream = { workspace = true } @@ -47,9 +50,15 @@ tracing = { workspace = true } [features] testsuite = [] named_tasks = ["tokio/tracing"] +jemalloc-profiled = [ + "dep:backtrace", + "dep:tikv-jemallocator", + "dep:tikv-jemalloc-ctl" +] [dev-dependencies] serde_json = { workspace = true } tempfile = { workspace = true } proptest = { workspace = true } +serial_test = { workspace = true } tokio = { workspace = true, features = ["test-util"] } diff --git a/quickwit/quickwit-common/src/alloc_tracker.rs b/quickwit/quickwit-common/src/alloc_tracker.rs new file mode 100644 index 00000000000..7c2bf3ae483 --- /dev/null +++ b/quickwit/quickwit-common/src/alloc_tracker.rs @@ -0,0 +1,232 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::collections::hash_map::Entry; +use std::sync::Mutex; + +use once_cell::sync::Lazy; + +const DEFAULT_REPORTING_INTERVAL: u64 = 1024 * 1024 * 1024; + +static ALLOCATION_TRACKER: Lazy> = + Lazy::new(|| Mutex::new(Allocations::default())); + +#[derive(Debug)] +struct Allocation { + pub callsite_hash: u64, + pub size: u64, +} + +#[derive(Debug, Copy, Clone)] +pub struct Statistic { + pub count: u64, + pub size: u64, + pub last_print: u64, +} + +/// WARN: +/// - keys and values in these maps should not allocate! 
+/// - we assume HashMaps don't allocate if their capacity is not exceeded +#[derive(Debug)] +struct Allocations { + memory_locations: HashMap, + callsite_statistics: HashMap, + is_started: bool, + reporting_interval: u64, +} + +impl Default for Allocations { + fn default() -> Self { + Self { + memory_locations: HashMap::with_capacity(128 * 1024), + callsite_statistics: HashMap::with_capacity(32 * 1024), + is_started: false, + reporting_interval: DEFAULT_REPORTING_INTERVAL, + } + } +} + +// pub fn log_dump() { +// tracing::info!(allocations=?ALLOCATION_TRACKER.lock().unwrap(), "dump"); +// } + +pub fn init(alloc_size_triggering_backtrace: Option) { + let mut guard = ALLOCATION_TRACKER.lock().unwrap(); + guard.memory_locations.clear(); + guard.callsite_statistics.clear(); + guard.is_started = true; + guard.reporting_interval = + alloc_size_triggering_backtrace.unwrap_or(DEFAULT_REPORTING_INTERVAL); +} + +pub enum AllocRecordingResponse { + ThresholdExceeded(Statistic), + ThresholdNotExceeded, + TrackerFull(&'static str), + NotStarted, +} + +/// Records an allocation and occasionally reports the cumulated allocation size +/// for the provided callsite_hash. +/// +/// Every time a the total allocated size with the same callsite_hash +/// exceeds the previous reported value by at least reporting_interval, that +/// allocated size is reported. +/// +/// WARN: this function should not allocate! +pub fn record_allocation(callsite_hash: u64, size: u64, ptr: *mut u8) -> AllocRecordingResponse { + let mut guard = ALLOCATION_TRACKER.lock().unwrap(); + if !guard.is_started { + return AllocRecordingResponse::NotStarted; + } + if guard.memory_locations.capacity() == guard.memory_locations.len() { + return AllocRecordingResponse::TrackerFull("memory_locations"); + } + if guard.callsite_statistics.capacity() == guard.callsite_statistics.len() { + return AllocRecordingResponse::TrackerFull("memory_locations"); + } + guard.memory_locations.insert( + ptr as usize, + Allocation { + callsite_hash, + size, + }, + ); + let reporting_interval = guard.reporting_interval; + let entry = guard + .callsite_statistics + .entry(callsite_hash) + .and_modify(|stat| { + stat.count += 1; + stat.size += size; + }) + .or_insert(Statistic { + count: 1, + size, + last_print: 0, + }); + let new_threshold_exceeded = entry.size > (entry.last_print + reporting_interval); + if new_threshold_exceeded { + let reported_statistic = *entry; + entry.last_print = entry.size; + AllocRecordingResponse::ThresholdExceeded(reported_statistic) + } else { + AllocRecordingResponse::ThresholdNotExceeded + } +} + +/// WARN: this function should not allocate! +pub fn record_deallocation(ptr: *mut u8) { + let mut guard = ALLOCATION_TRACKER.lock().unwrap(); + if !guard.is_started { + return; + } + let Some(Allocation { + size, + callsite_hash, + .. 
+ }) = guard.memory_locations.remove(&(ptr as usize)) + else { + // this was allocated before the tracking started + return; + }; + if let Entry::Occupied(mut content) = guard.callsite_statistics.entry(callsite_hash) { + content.get_mut().count -= 1; + content.get_mut().size -= size; + if content.get().count == 0 { + content.remove(); + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + #[serial_test::file_serial] + fn test_record_allocation_and_deallocation() { + init(Some(2000)); + let callsite_hash_1 = 777; + + let ptr_1 = 0x1 as *mut u8; + let response = record_allocation(callsite_hash_1, 1500, ptr_1); + assert!(matches!( + response, + AllocRecordingResponse::ThresholdNotExceeded + )); + + let ptr_2 = 0x2 as *mut u8; + let response = record_allocation(callsite_hash_1, 1500, ptr_2); + let AllocRecordingResponse::ThresholdExceeded(statistic) = response else { + panic!("Expected ThresholdExceeded response"); + }; + assert_eq!(statistic.count, 2); + assert_eq!(statistic.size, 3000); + assert_eq!(statistic.last_print, 0); + + record_deallocation(ptr_2); + + // the threshold was already crossed + let ptr_3 = 0x3 as *mut u8; + let response = record_allocation(callsite_hash_1, 1500, ptr_3); + assert!(matches!( + response, + AllocRecordingResponse::ThresholdNotExceeded + )); + + // this is a brand new call site with different statistics + let callsite_hash_2 = 42; + let ptr_3 = 0x3 as *mut u8; + let response = record_allocation(callsite_hash_2, 1500, ptr_3); + assert!(matches!( + response, + AllocRecordingResponse::ThresholdNotExceeded + )); + } + + #[test] + #[serial_test::file_serial] + fn test_tracker_full() { + init(Some(1024 * 1024 * 1024)); + let memory_locations_capacity = ALLOCATION_TRACKER + .lock() + .unwrap() + .memory_locations + .capacity(); + + for i in 0..memory_locations_capacity { + let ptr = (i + 1) as *mut u8; + let response = record_allocation(777, 10, ptr); + assert!(matches!( + response, + AllocRecordingResponse::ThresholdNotExceeded + )); + } + let response = record_allocation(777, 10, (memory_locations_capacity + 1) as *mut u8); + assert!(matches!( + response, + AllocRecordingResponse::TrackerFull("memory_locations") + )); + // make sure that the map didn't grow + let current_memory_locations_capacity = ALLOCATION_TRACKER + .lock() + .unwrap() + .memory_locations + .capacity(); + assert_eq!(current_memory_locations_capacity, memory_locations_capacity); + } +} diff --git a/quickwit/quickwit-common/src/jemalloc_profiled.rs b/quickwit/quickwit-common/src/jemalloc_profiled.rs new file mode 100644 index 00000000000..179234f09b5 --- /dev/null +++ b/quickwit/quickwit-common/src/jemalloc_profiled.rs @@ -0,0 +1,202 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
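
Aside, not part of the series: the tracker above and the allocator wrapper introduced in the file below cooperate roughly as sketched here. This is a minimal, hedged illustration of the intended call flow using the commit-1 API names; the module is crate-private, so it would only compile from inside quickwit-common, and the pointer and call-site hash are stand-in constants rather than real allocator output.

    // Hedged sketch of how the tracker is meant to be driven; not code from this series.
    use crate::alloc_tracker::{self, AllocRecordingResponse};

    fn tracker_flow_sketch() {
        // report a call site again every time it grows by roughly another 1 MiB
        alloc_tracker::init(Some(1024 * 1024));

        let callsite_hash: u64 = 0xdead_beef; // normally derived from the backtrace
        let ptr = 0x1000 as *mut u8;          // normally the pointer returned by jemalloc

        match alloc_tracker::record_allocation(callsite_hash, 2 * 1024 * 1024, ptr) {
            AllocRecordingResponse::ThresholdExceeded(stat) => {
                // this is the point where the allocator wrapper prints a backtrace
                println!("callsite {callsite_hash}: {} allocs, {} bytes", stat.count, stat.size);
            }
            AllocRecordingResponse::ThresholdNotExceeded
            | AllocRecordingResponse::NotStarted
            | AllocRecordingResponse::TrackerFull(_) => {}
        }

        // freeing the block removes it from the live map and decrements the call-site stats
        alloc_tracker::record_deallocation(ptr);
    }
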
+ +use std::alloc::{GlobalAlloc, Layout}; +use std::hash::Hasher; +use std::io::Write; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; + +use tikv_jemallocator::Jemalloc; +use tracing::{error, info}; + +use crate::alloc_tracker::{self, AllocRecordingResponse}; + +const DEFAULT_MIN_ALLOC_SIZE_FOR_PROFILING: usize = 256 * 1024; + +// Atomics are used to communicate configurations between the start/stop +// endpoints and the JemallocProfiled allocator wrapper. + +/// The minimum allocation size that is recorded by the tracker. +static MIN_ALLOC_SIZE_FOR_PROFILING: AtomicUsize = + AtomicUsize::new(DEFAULT_MIN_ALLOC_SIZE_FOR_PROFILING); + +/// Whether the profiling is started or not. +static ENABLED: AtomicBool = AtomicBool::new(false); + +/// Starts measuring heap allocations and logs important leaks. +/// +/// This function uses a wrapper around the global Jemalloc allocator to +/// instrument it. Each time an allocation bigger than +/// min_alloc_size_for_profiling is performed, it is recorded in a map and the +/// statistics for its call site are updated. +/// +/// During profiling, the statistics per call site are used to log when specific +/// thresholds are exceeded. For each call site, the allocated memory is +/// logged (with a backtrace) every time it exceeds the last logged allocated +/// memory by at least alloc_size_triggering_backtrace. +pub fn start_profiling( + min_alloc_size_for_profiling: Option, + alloc_size_triggering_backtrace: Option, +) { + // Call backtrace once to warmup symbolization allocations (~30MB) + backtrace::trace(|frame| { + backtrace::resolve_frame(frame, |_| {}); + true + }); + + alloc_tracker::init(alloc_size_triggering_backtrace); + + let min_alloc_size_for_profiling = + min_alloc_size_for_profiling.unwrap_or(DEFAULT_MIN_ALLOC_SIZE_FOR_PROFILING); + // Use strong ordering to make sure all threads see these changes in this order + MIN_ALLOC_SIZE_FOR_PROFILING.store(min_alloc_size_for_profiling, Ordering::SeqCst); + let previously_enabled = ENABLED.swap(true, Ordering::SeqCst); + + info!( + min_alloc_size_for_profiling, + alloc_size_triggering_backtrace, previously_enabled, "heap profiling running" + ); +} + +/// Stops measuring heap allocations. +/// +/// The allocation tracking tables and the symbol cache are not cleared. +pub fn stop_profiling() { + // Use strong ordering to make sure all threads see these changes in this order + let previously_enabled = ENABLED.swap(false, Ordering::SeqCst); + MIN_ALLOC_SIZE_FOR_PROFILING.store(DEFAULT_MIN_ALLOC_SIZE_FOR_PROFILING, Ordering::SeqCst); + + info!(previously_enabled, "heap profiling stopped"); + // alloc_tracker::log_dump(); + // backtrace::clear_symbol_cache(); +} + +/// Wraps the Jemalloc global allocator calls with tracking routines. +/// +/// The tracking routines are called only when [ENABLED] is set to true (calling +/// [start_profiling()]), but we don't enforce any synchronization (we load it with +/// Ordering::Relaxed) because it's fine to miss or record extra allocation events. 
+pub struct JemallocProfiled(pub Jemalloc); + +unsafe impl GlobalAlloc for JemallocProfiled { + #[inline] + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + let ptr = unsafe { self.0.alloc(layout) }; + if ENABLED.load(Ordering::Relaxed) { + track_alloc_call(ptr, layout); + } + ptr + } + + #[inline] + unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 { + let ptr = unsafe { self.0.alloc_zeroed(layout) }; + if ENABLED.load(Ordering::Relaxed) { + track_alloc_call(ptr, layout); + } + ptr + } + + #[inline] + unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { + if ENABLED.load(Ordering::Relaxed) { + track_dealloc_call(ptr, layout); + } + unsafe { self.0.dealloc(ptr, layout) } + } + + #[inline] + unsafe fn realloc(&self, old_ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 { + let new_ptr = unsafe { self.0.realloc(old_ptr, layout, new_size) }; + if ENABLED.load(Ordering::Relaxed) { + track_realloc_call(old_ptr, new_ptr, layout, new_size); + } + new_ptr + } +} + +#[inline] +fn print_backtrace(callsite_hash: u64, stat: alloc_tracker::Statistic) { + { + let mut lock = std::io::stdout().lock(); + let _ = writeln!( + &mut lock, + "htrk callsite={} allocs={} size={}MiB", + callsite_hash, + stat.count, + stat.size / 1024 / 1024 + ); + backtrace::trace(|frame| { + backtrace::resolve_frame(frame, |symbol| { + if let Some(symbole_name) = symbol.name() { + let _ = writeln!(&mut lock, "{}", symbole_name); + } else { + let _ = writeln!(&mut lock, "symb failed"); + } + }); + true + }); + } +} + +#[inline] +fn backtrace_hash() -> u64 { + let mut hasher = fnv::FnvHasher::default(); + backtrace::trace(|frame| { + hasher.write_usize(frame.ip() as usize); + true + }); + hasher.finish() +} + +#[cold] +fn track_alloc_call(ptr: *mut u8, layout: Layout) { + if layout.size() > MIN_ALLOC_SIZE_FOR_PROFILING.load(Ordering::Relaxed) { + let callsite_hash = backtrace_hash(); + let recording_response = + alloc_tracker::record_allocation(callsite_hash, layout.size() as u64, ptr); + + match recording_response { + AllocRecordingResponse::ThresholdExceeded(stat_for_trace) => { + print_backtrace(callsite_hash, stat_for_trace); + // Could we use tracing to caracterize the call site here? 
+ // tracing::info!(size = alloc_size_for_trace, "large alloc"); + } + AllocRecordingResponse::TrackerFull(reason) => { + // this message might be displayed multiple times but that's fine + error!("{reason} full, profiling stopped"); + ENABLED.store(false, Ordering::Relaxed); + } + AllocRecordingResponse::ThresholdNotExceeded => {} + AllocRecordingResponse::NotStarted => {} + } + } +} + +#[cold] +fn track_dealloc_call(ptr: *mut u8, layout: Layout) { + if layout.size() > MIN_ALLOC_SIZE_FOR_PROFILING.load(Ordering::Relaxed) { + alloc_tracker::record_deallocation(ptr); + } +} + +#[cold] +fn track_realloc_call( + _old_ptr: *mut u8, + _new_pointer: *mut u8, + _current_layout: Layout, + _new_size: usize, +) { + // TODO handle realloc +} diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs index 12987898b0f..2b9fa51474d 100644 --- a/quickwit/quickwit-common/src/lib.rs +++ b/quickwit/quickwit-common/src/lib.rs @@ -16,9 +16,13 @@ mod coolid; +#[cfg(feature = "jemalloc-profiled")] +pub(crate) mod alloc_tracker; pub mod binary_heap; pub mod fs; pub mod io; +#[cfg(feature = "jemalloc-profiled")] +pub mod jemalloc_profiled; mod kill_switch; pub mod metrics; pub mod net; diff --git a/quickwit/quickwit-serve/Cargo.toml b/quickwit/quickwit-serve/Cargo.toml index 88c3e4278b4..1c66ccc0e31 100644 --- a/quickwit/quickwit-serve/Cargo.toml +++ b/quickwit/quickwit-serve/Cargo.toml @@ -106,6 +106,9 @@ quickwit-storage = { workspace = true, features = ["testsuite"] } pprof = [ "dep:pprof" ] +jemalloc-profiled = [ + "quickwit-common/jemalloc-profiled" +] testsuite = [] sqs-for-tests = [ "quickwit-indexing/sqs", diff --git a/quickwit/quickwit-serve/src/developer_api/heap_prof.rs b/quickwit/quickwit-serve/src/developer_api/heap_prof.rs new file mode 100644 index 00000000000..deff7311932 --- /dev/null +++ b/quickwit/quickwit-serve/src/developer_api/heap_prof.rs @@ -0,0 +1,53 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
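
Aside on the realloc TODO a few hunks up: a hypothetical sketch, not part of this series, of how track_realloc_call could reuse the helpers already defined in this file (MIN_ALLOC_SIZE_FOR_PROFILING, backtrace_hash, alloc_tracker). A real version would also handle the ThresholdExceeded and TrackerFull responses the way track_alloc_call does.

    // Hypothetical sketch only; assumes the imports and statics of jemalloc_profiled.rs.
    #[cold]
    fn track_realloc_call(old_ptr: *mut u8, new_ptr: *mut u8, old_layout: Layout, new_size: usize) {
        if new_ptr.is_null() {
            // failed realloc: the old block is still live, keep tracking it as-is
            return;
        }
        let min_size = MIN_ALLOC_SIZE_FOR_PROFILING.load(Ordering::Relaxed);
        if old_layout.size() > min_size {
            // the old block may have been recorded, forget it
            alloc_tracker::record_deallocation(old_ptr);
        }
        if new_size > min_size {
            // re-record the (possibly moved) block under the current call site
            let callsite_hash = backtrace_hash();
            let _response = alloc_tracker::record_allocation(callsite_hash, new_size as u64, new_ptr);
        }
    }
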
+ +use quickwit_common::jemalloc_profiled::{start_profiling, stop_profiling}; +use serde::Deserialize; +use warp::Filter; +use warp::reply::Reply; + +pub fn heap_prof_handlers() +-> impl Filter + Clone { + #[derive(Deserialize)] + struct ProfilerQueryParams { + min_alloc_size: Option, + backtrace_every: Option, + } + + let start_profiler = { + warp::path!("heap-prof" / "start") + .and(warp::query::()) + .and_then(move |params: ProfilerQueryParams| start_profiler_handler(params)) + }; + + let stop_profiler = { warp::path!("heap-prof" / "stop").and_then(stop_profiler_handler) }; + + async fn start_profiler_handler( + params: ProfilerQueryParams, + ) -> Result, warp::Rejection> { + start_profiling(params.min_alloc_size, params.backtrace_every); + let resp = warp::reply::with_status("Heap profiling started", warp::http::StatusCode::OK) + .into_response(); + Ok(resp) + } + + async fn stop_profiler_handler() -> Result, warp::Rejection> { + stop_profiling(); + let resp = warp::reply::with_status("Heap profiling stopped", warp::http::StatusCode::OK) + .into_response(); + Ok(resp) + } + + start_profiler.or(stop_profiler) +} diff --git a/quickwit/quickwit-serve/src/developer_api/heap_prof_disabled.rs b/quickwit/quickwit-serve/src/developer_api/heap_prof_disabled.rs new file mode 100644 index 00000000000..a71f724ae0d --- /dev/null +++ b/quickwit/quickwit-serve/src/developer_api/heap_prof_disabled.rs @@ -0,0 +1,29 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use warp::Filter; + +fn not_implemented_handler() -> impl warp::Reply { + warp::reply::with_status( + "Quickwit was compiled without the `jemalloc-profiled` feature", + warp::http::StatusCode::NOT_IMPLEMENTED, + ) +} + +pub fn heap_prof_handlers() +-> impl Filter + Clone { + let start_profiler = { warp::path!("heap-prof" / "start").map(not_implemented_handler) }; + let stop_profiler = { warp::path!("heap-prof" / "stop").map(not_implemented_handler) }; + start_profiler.or(stop_profiler) +} diff --git a/quickwit/quickwit-serve/src/developer_api/mod.rs b/quickwit/quickwit-serve/src/developer_api/mod.rs index 4163db9c933..c7722d3a581 100644 --- a/quickwit/quickwit-serve/src/developer_api/mod.rs +++ b/quickwit/quickwit-serve/src/developer_api/mod.rs @@ -13,14 +13,16 @@ // limitations under the License. 
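
Aside, not part of the series: the heap_prof.rs handlers above can be exercised directly with warp's test helpers, independently of the prefix under which developer_api/mod.rs mounts them. A hedged sketch follows; it assumes the jemalloc-profiled feature and a tokio test runtime, and note that hitting the start route really does warm up backtrace symbolization and enable profiling.

    // Hedged sketch of a test module that could sit at the bottom of heap_prof.rs.
    #[cfg(test)]
    mod heap_prof_sketch {
        use super::heap_prof_handlers;

        #[tokio::test]
        async fn start_then_stop_profiling() {
            let handlers = heap_prof_handlers();

            // both query parameters are optional (see ProfilerQueryParams above)
            let resp = warp::test::request()
                .path("/heap-prof/start?min_alloc_size=65536&backtrace_every=1073741824")
                .reply(&handlers)
                .await;
            assert_eq!(resp.status(), 200);

            let resp = warp::test::request()
                .path("/heap-prof/stop")
                .reply(&handlers)
                .await;
            assert_eq!(resp.status(), 200);
        }
    }
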
mod debug; -mod log_level; +#[cfg_attr(not(feature = "jemalloc-profiled"), path = "heap_prof_disabled.rs")] +mod heap_prof; +mod log_level; #[cfg_attr(not(feature = "pprof"), path = "pprof_disabled.rs")] mod pprof; - mod server; use debug::debug_handler; +use heap_prof::heap_prof_handlers; use log_level::log_level_handler; use pprof::pprof_handlers; use quickwit_cluster::Cluster; @@ -42,7 +44,8 @@ pub(crate) fn developer_api_routes( .and( debug_handler(cluster.clone()) .or(log_level_handler(env_filter_reload_fn.clone()).boxed()) - .or(pprof_handlers()), + .or(pprof_handlers()) + .or(heap_prof_handlers()), ) .recover(recover_fn) } From b7062e39821c552a036a68e7bed4a1aaa31c9e9d Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Wed, 30 Apr 2025 15:59:25 +0200 Subject: [PATCH 2/5] Improve wording and types --- quickwit/quickwit-common/src/alloc_tracker.rs | 72 ++++++++++--------- .../quickwit-common/src/jemalloc_profiled.rs | 66 +++++++++-------- .../src/developer_api/heap_prof.rs | 2 +- 3 files changed, 75 insertions(+), 65 deletions(-) diff --git a/quickwit/quickwit-common/src/alloc_tracker.rs b/quickwit/quickwit-common/src/alloc_tracker.rs index 7c2bf3ae483..655e5962e7d 100644 --- a/quickwit/quickwit-common/src/alloc_tracker.rs +++ b/quickwit/quickwit-common/src/alloc_tracker.rs @@ -16,24 +16,29 @@ use std::collections::HashMap; use std::collections::hash_map::Entry; use std::sync::Mutex; +use bytesize::ByteSize; use once_cell::sync::Lazy; -const DEFAULT_REPORTING_INTERVAL: u64 = 1024 * 1024 * 1024; - static ALLOCATION_TRACKER: Lazy> = Lazy::new(|| Mutex::new(Allocations::default())); #[derive(Debug)] struct Allocation { pub callsite_hash: u64, - pub size: u64, + pub size: ByteSize, } #[derive(Debug, Copy, Clone)] pub struct Statistic { pub count: u64, - pub size: u64, - pub last_print: u64, + pub size: ByteSize, + pub last_report: ByteSize, +} + +#[derive(Debug)] +enum Status { + Started { reporting_interval: ByteSize }, + Stopped, } /// WARN: @@ -43,8 +48,7 @@ pub struct Statistic { struct Allocations { memory_locations: HashMap, callsite_statistics: HashMap, - is_started: bool, - reporting_interval: u64, + status: Status, } impl Default for Allocations { @@ -52,23 +56,18 @@ impl Default for Allocations { Self { memory_locations: HashMap::with_capacity(128 * 1024), callsite_statistics: HashMap::with_capacity(32 * 1024), - is_started: false, - reporting_interval: DEFAULT_REPORTING_INTERVAL, + status: Status::Stopped, } } } -// pub fn log_dump() { -// tracing::info!(allocations=?ALLOCATION_TRACKER.lock().unwrap(), "dump"); -// } - -pub fn init(alloc_size_triggering_backtrace: Option) { +pub fn init(reporting_interval_bytes: u64) { let mut guard = ALLOCATION_TRACKER.lock().unwrap(); guard.memory_locations.clear(); guard.callsite_statistics.clear(); - guard.is_started = true; - guard.reporting_interval = - alloc_size_triggering_backtrace.unwrap_or(DEFAULT_REPORTING_INTERVAL); + guard.status = Status::Started { + reporting_interval: ByteSize(reporting_interval_bytes), + } } pub enum AllocRecordingResponse { @@ -86,11 +85,15 @@ pub enum AllocRecordingResponse { /// allocated size is reported. /// /// WARN: this function should not allocate! 
-pub fn record_allocation(callsite_hash: u64, size: u64, ptr: *mut u8) -> AllocRecordingResponse { +pub fn record_allocation( + callsite_hash: u64, + size_bytes: u64, + ptr: *mut u8, +) -> AllocRecordingResponse { let mut guard = ALLOCATION_TRACKER.lock().unwrap(); - if !guard.is_started { + let Status::Started { reporting_interval } = guard.status else { return AllocRecordingResponse::NotStarted; - } + }; if guard.memory_locations.capacity() == guard.memory_locations.len() { return AllocRecordingResponse::TrackerFull("memory_locations"); } @@ -101,26 +104,25 @@ pub fn record_allocation(callsite_hash: u64, size: u64, ptr: *mut u8) -> AllocRe ptr as usize, Allocation { callsite_hash, - size, + size: ByteSize(size_bytes), }, ); - let reporting_interval = guard.reporting_interval; let entry = guard .callsite_statistics .entry(callsite_hash) .and_modify(|stat| { stat.count += 1; - stat.size += size; + stat.size += size_bytes; }) .or_insert(Statistic { count: 1, - size, - last_print: 0, + size: ByteSize(size_bytes), + last_report: ByteSize(0), }); - let new_threshold_exceeded = entry.size > (entry.last_print + reporting_interval); + let new_threshold_exceeded = entry.size > (entry.last_report + reporting_interval); if new_threshold_exceeded { let reported_statistic = *entry; - entry.last_print = entry.size; + entry.last_report = entry.size; AllocRecordingResponse::ThresholdExceeded(reported_statistic) } else { AllocRecordingResponse::ThresholdNotExceeded @@ -130,7 +132,7 @@ pub fn record_allocation(callsite_hash: u64, size: u64, ptr: *mut u8) -> AllocRe /// WARN: this function should not allocate! pub fn record_deallocation(ptr: *mut u8) { let mut guard = ALLOCATION_TRACKER.lock().unwrap(); - if !guard.is_started { + if let Status::Stopped = guard.status { return; } let Some(Allocation { @@ -143,8 +145,10 @@ pub fn record_deallocation(ptr: *mut u8) { return; }; if let Entry::Occupied(mut content) = guard.callsite_statistics.entry(callsite_hash) { - content.get_mut().count -= 1; - content.get_mut().size -= size; + let new_size_bytes = content.get().size.0.saturating_sub(size.0); + let new_count = content.get().count.saturating_sub(1); + content.get_mut().count = new_count; + content.get_mut().size = ByteSize(new_size_bytes); if content.get().count == 0 { content.remove(); } @@ -159,7 +163,7 @@ mod tests { #[test] #[serial_test::file_serial] fn test_record_allocation_and_deallocation() { - init(Some(2000)); + init(2000); let callsite_hash_1 = 777; let ptr_1 = 0x1 as *mut u8; @@ -175,8 +179,8 @@ mod tests { panic!("Expected ThresholdExceeded response"); }; assert_eq!(statistic.count, 2); - assert_eq!(statistic.size, 3000); - assert_eq!(statistic.last_print, 0); + assert_eq!(statistic.size, ByteSize(3000)); + assert_eq!(statistic.last_report, ByteSize(0)); record_deallocation(ptr_2); @@ -201,7 +205,7 @@ mod tests { #[test] #[serial_test::file_serial] fn test_tracker_full() { - init(Some(1024 * 1024 * 1024)); + init(1024 * 1024 * 1024); let memory_locations_capacity = ALLOCATION_TRACKER .lock() .unwrap() diff --git a/quickwit/quickwit-common/src/jemalloc_profiled.rs b/quickwit/quickwit-common/src/jemalloc_profiled.rs index 179234f09b5..090dcd42d95 100644 --- a/quickwit/quickwit-common/src/jemalloc_profiled.rs +++ b/quickwit/quickwit-common/src/jemalloc_profiled.rs @@ -15,21 +15,23 @@ use std::alloc::{GlobalAlloc, Layout}; use std::hash::Hasher; use std::io::Write; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use 
bytesize::ByteSize; use tikv_jemallocator::Jemalloc; use tracing::{error, info}; use crate::alloc_tracker::{self, AllocRecordingResponse}; -const DEFAULT_MIN_ALLOC_SIZE_FOR_PROFILING: usize = 256 * 1024; +const DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING: u64 = 256 * 1024; +const DEFAULT_REPORTING_INTERVAL_BYTES: u64 = 1024 * 1024 * 1024; // Atomics are used to communicate configurations between the start/stop // endpoints and the JemallocProfiled allocator wrapper. /// The minimum allocation size that is recorded by the tracker. -static MIN_ALLOC_SIZE_FOR_PROFILING: AtomicUsize = - AtomicUsize::new(DEFAULT_MIN_ALLOC_SIZE_FOR_PROFILING); +static MIN_ALLOC_BYTES_FOR_PROFILING: AtomicU64 = + AtomicU64::new(DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING); /// Whether the profiling is started or not. static ENABLED: AtomicBool = AtomicBool::new(false); @@ -38,16 +40,16 @@ static ENABLED: AtomicBool = AtomicBool::new(false); /// /// This function uses a wrapper around the global Jemalloc allocator to /// instrument it. Each time an allocation bigger than -/// min_alloc_size_for_profiling is performed, it is recorded in a map and the -/// statistics for its call site are updated. +/// min_alloc_bytes_for_profiling is performed, it is recorded in a map and +/// the statistics for its call site are updated. /// /// During profiling, the statistics per call site are used to log when specific -/// thresholds are exceeded. For each call site, the allocated memory is -/// logged (with a backtrace) every time it exceeds the last logged allocated -/// memory by at least alloc_size_triggering_backtrace. +/// thresholds are exceeded. For each call site, the allocated memory is logged +/// (with a backtrace) every time it exceeds the last logged allocated memory by +/// at least alloc_bytes_triggering_backtrace. 
pub fn start_profiling( - min_alloc_size_for_profiling: Option, - alloc_size_triggering_backtrace: Option, + min_alloc_bytes_for_profiling: Option, + alloc_bytes_triggering_backtrace: Option, ) { // Call backtrace once to warmup symbolization allocations (~30MB) backtrace::trace(|frame| { @@ -55,17 +57,21 @@ pub fn start_profiling( true }); - alloc_tracker::init(alloc_size_triggering_backtrace); + let alloc_bytes_triggering_backtrace = + alloc_bytes_triggering_backtrace.unwrap_or(DEFAULT_REPORTING_INTERVAL_BYTES); + alloc_tracker::init(alloc_bytes_triggering_backtrace); - let min_alloc_size_for_profiling = - min_alloc_size_for_profiling.unwrap_or(DEFAULT_MIN_ALLOC_SIZE_FOR_PROFILING); + let min_alloc_bytes_for_profiling = + min_alloc_bytes_for_profiling.unwrap_or(DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING); // Use strong ordering to make sure all threads see these changes in this order - MIN_ALLOC_SIZE_FOR_PROFILING.store(min_alloc_size_for_profiling, Ordering::SeqCst); + MIN_ALLOC_BYTES_FOR_PROFILING.store(min_alloc_bytes_for_profiling, Ordering::SeqCst); let previously_enabled = ENABLED.swap(true, Ordering::SeqCst); info!( - min_alloc_size_for_profiling, - alloc_size_triggering_backtrace, previously_enabled, "heap profiling running" + min_alloc_for_profiling = %ByteSize(min_alloc_bytes_for_profiling), + alloc_triggering_backtrace = %ByteSize(alloc_bytes_triggering_backtrace), + previously_enabled, + "heap profiling running" ); } @@ -75,11 +81,9 @@ pub fn start_profiling( pub fn stop_profiling() { // Use strong ordering to make sure all threads see these changes in this order let previously_enabled = ENABLED.swap(false, Ordering::SeqCst); - MIN_ALLOC_SIZE_FOR_PROFILING.store(DEFAULT_MIN_ALLOC_SIZE_FOR_PROFILING, Ordering::SeqCst); + MIN_ALLOC_BYTES_FOR_PROFILING.store(DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING, Ordering::SeqCst); info!(previously_enabled, "heap profiling stopped"); - // alloc_tracker::log_dump(); - // backtrace::clear_symbol_cache(); } /// Wraps the Jemalloc global allocator calls with tracking routines. @@ -126,16 +130,15 @@ unsafe impl GlobalAlloc for JemallocProfiled { } } +/// Uses backtrace::trace() which does allocate #[inline] fn print_backtrace(callsite_hash: u64, stat: alloc_tracker::Statistic) { { let mut lock = std::io::stdout().lock(); let _ = writeln!( &mut lock, - "htrk callsite={} allocs={} size={}MiB", - callsite_hash, - stat.count, - stat.size / 1024 / 1024 + "htrk callsite={} allocs={} size={}", + callsite_hash, stat.count, stat.size ); backtrace::trace(|frame| { backtrace::resolve_frame(frame, |symbol| { @@ -150,6 +153,7 @@ fn print_backtrace(callsite_hash: u64, stat: alloc_tracker::Statistic) { } } +/// Uses backtrace::trace() which does allocate #[inline] fn backtrace_hash() -> u64 { let mut hasher = fnv::FnvHasher::default(); @@ -160,22 +164,23 @@ fn backtrace_hash() -> u64 { hasher.finish() } +/// Warning: allocating inside this function can cause a deadlock. 
#[cold] fn track_alloc_call(ptr: *mut u8, layout: Layout) { - if layout.size() > MIN_ALLOC_SIZE_FOR_PROFILING.load(Ordering::Relaxed) { + if layout.size() >= MIN_ALLOC_BYTES_FOR_PROFILING.load(Ordering::Relaxed) as usize { + // warning: backtrace_hash() allocates let callsite_hash = backtrace_hash(); let recording_response = alloc_tracker::record_allocation(callsite_hash, layout.size() as u64, ptr); match recording_response { AllocRecordingResponse::ThresholdExceeded(stat_for_trace) => { + // warning: print_backtrace() allocates print_backtrace(callsite_hash, stat_for_trace); - // Could we use tracing to caracterize the call site here? - // tracing::info!(size = alloc_size_for_trace, "large alloc"); } - AllocRecordingResponse::TrackerFull(reason) => { + AllocRecordingResponse::TrackerFull(table_name) => { // this message might be displayed multiple times but that's fine - error!("{reason} full, profiling stopped"); + error!("heap profiling stopped, {table_name} full"); ENABLED.store(false, Ordering::Relaxed); } AllocRecordingResponse::ThresholdNotExceeded => {} @@ -184,9 +189,10 @@ fn track_alloc_call(ptr: *mut u8, layout: Layout) { } } +/// Warning: allocating inside this function can cause a deadlock. #[cold] fn track_dealloc_call(ptr: *mut u8, layout: Layout) { - if layout.size() > MIN_ALLOC_SIZE_FOR_PROFILING.load(Ordering::Relaxed) { + if layout.size() >= MIN_ALLOC_BYTES_FOR_PROFILING.load(Ordering::Relaxed) as usize { alloc_tracker::record_deallocation(ptr); } } diff --git a/quickwit/quickwit-serve/src/developer_api/heap_prof.rs b/quickwit/quickwit-serve/src/developer_api/heap_prof.rs index deff7311932..0e777bae3c8 100644 --- a/quickwit/quickwit-serve/src/developer_api/heap_prof.rs +++ b/quickwit/quickwit-serve/src/developer_api/heap_prof.rs @@ -21,7 +21,7 @@ pub fn heap_prof_handlers() -> impl Filter + Clone { #[derive(Deserialize)] struct ProfilerQueryParams { - min_alloc_size: Option, + min_alloc_size: Option, backtrace_every: Option, } From 8e0fe59fe400ff2bebfe73f43c2db31e3eb74ae3 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Wed, 30 Apr 2025 17:15:37 +0200 Subject: [PATCH 3/5] Better rationalization around nested allocations --- .../quickwit-common/src/jemalloc_profiled.rs | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/quickwit/quickwit-common/src/jemalloc_profiled.rs b/quickwit/quickwit-common/src/jemalloc_profiled.rs index 090dcd42d95..969c4498fcf 100644 --- a/quickwit/quickwit-common/src/jemalloc_profiled.rs +++ b/quickwit/quickwit-common/src/jemalloc_profiled.rs @@ -51,6 +51,12 @@ pub fn start_profiling( min_alloc_bytes_for_profiling: Option, alloc_bytes_triggering_backtrace: Option, ) { + #[cfg(miri)] + warn!( + "heap profiling is not supported with Miri because in that case the `backtrace` crate \ + allocates" + ); + // Call backtrace once to warmup symbolization allocations (~30MB) backtrace::trace(|frame| { backtrace::resolve_frame(frame, |_| {}); @@ -63,16 +69,19 @@ pub fn start_profiling( let min_alloc_bytes_for_profiling = min_alloc_bytes_for_profiling.unwrap_or(DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING); - // Use strong ordering to make sure all threads see these changes in this order - MIN_ALLOC_BYTES_FOR_PROFILING.store(min_alloc_bytes_for_profiling, Ordering::SeqCst); - let previously_enabled = ENABLED.swap(true, Ordering::SeqCst); + // stdout() might allocate a buffer on first use. If the first allocation + // tracked comes from stdout, it will trigger a deadlock. Logging here + // guarantees that it doesn't happen. 
info!( min_alloc_for_profiling = %ByteSize(min_alloc_bytes_for_profiling), alloc_triggering_backtrace = %ByteSize(alloc_bytes_triggering_backtrace), - previously_enabled, "heap profiling running" ); + + // Use strong ordering to make sure all threads see these changes in this order + MIN_ALLOC_BYTES_FOR_PROFILING.store(min_alloc_bytes_for_profiling, Ordering::SeqCst); + ENABLED.store(true, Ordering::SeqCst); } /// Stops measuring heap allocations. @@ -91,6 +100,8 @@ pub fn stop_profiling() { /// The tracking routines are called only when [ENABLED] is set to true (calling /// [start_profiling()]), but we don't enforce any synchronization (we load it with /// Ordering::Relaxed) because it's fine to miss or record extra allocation events. +/// +/// It's important to ensure that no allocations are performed inside the allocator! pub struct JemallocProfiled(pub Jemalloc); unsafe impl GlobalAlloc for JemallocProfiled { @@ -130,7 +141,7 @@ unsafe impl GlobalAlloc for JemallocProfiled { } } -/// Uses backtrace::trace() which does allocate +/// Warning: stdout allocates a buffer on first use. #[inline] fn print_backtrace(callsite_hash: u64, stat: alloc_tracker::Statistic) { { @@ -153,7 +164,6 @@ fn print_backtrace(callsite_hash: u64, stat: alloc_tracker::Statistic) { } } -/// Uses backtrace::trace() which does allocate #[inline] fn backtrace_hash() -> u64 { let mut hasher = fnv::FnvHasher::default(); @@ -164,22 +174,22 @@ fn backtrace_hash() -> u64 { hasher.finish() } -/// Warning: allocating inside this function can cause a deadlock. +/// Warning: allocating inside this function can cause an error (abort, panic or even deadlock). #[cold] fn track_alloc_call(ptr: *mut u8, layout: Layout) { if layout.size() >= MIN_ALLOC_BYTES_FOR_PROFILING.load(Ordering::Relaxed) as usize { - // warning: backtrace_hash() allocates let callsite_hash = backtrace_hash(); let recording_response = alloc_tracker::record_allocation(callsite_hash, layout.size() as u64, ptr); match recording_response { AllocRecordingResponse::ThresholdExceeded(stat_for_trace) => { - // warning: print_backtrace() allocates + // warning: stdout might allocate a buffer on first use print_backtrace(callsite_hash, stat_for_trace); } AllocRecordingResponse::TrackerFull(table_name) => { // this message might be displayed multiple times but that's fine + // warning: stdout might allocate a buffer on first use error!("heap profiling stopped, {table_name} full"); ENABLED.store(false, Ordering::Relaxed); } @@ -189,7 +199,7 @@ fn track_alloc_call(ptr: *mut u8, layout: Layout) { } } -/// Warning: allocating inside this function can cause a deadlock. +/// Warning: allocating inside this function can cause an error (abort, panic or even deadlock). 
#[cold] fn track_dealloc_call(ptr: *mut u8, layout: Layout) { if layout.size() >= MIN_ALLOC_BYTES_FOR_PROFILING.load(Ordering::Relaxed) as usize { From 036ecc05941799537cabb5345e425dcf68997319 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Tue, 6 May 2025 10:07:33 +0200 Subject: [PATCH 4/5] Pad the atomic flags to the cache line size --- .../quickwit-common/src/jemalloc_profiled.rs | 67 ++++++++++++------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/quickwit/quickwit-common/src/jemalloc_profiled.rs b/quickwit/quickwit-common/src/jemalloc_profiled.rs index 969c4498fcf..371026d7881 100644 --- a/quickwit/quickwit-common/src/jemalloc_profiled.rs +++ b/quickwit/quickwit-common/src/jemalloc_profiled.rs @@ -23,30 +23,45 @@ use tracing::{error, info}; use crate::alloc_tracker::{self, AllocRecordingResponse}; -const DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING: u64 = 256 * 1024; +const DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING: u64 = 64 * 1024; const DEFAULT_REPORTING_INTERVAL_BYTES: u64 = 1024 * 1024 * 1024; -// Atomics are used to communicate configurations between the start/stop -// endpoints and the JemallocProfiled allocator wrapper. - -/// The minimum allocation size that is recorded by the tracker. -static MIN_ALLOC_BYTES_FOR_PROFILING: AtomicU64 = - AtomicU64::new(DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING); +/// Atomics are used to communicate configurations between the start/stop +/// endpoints and the JemallocProfiled allocator wrapper. +/// +/// The flags are padded to avoid false sharing of the CPU cache line between +/// threads. 128 bytes is the cache line size on x86_64 and arm64. +#[repr(align(128))] +struct Flags { + /// The minimum allocation size that is recorded by the tracker. + min_alloc_bytes_for_profiling: AtomicU64, + /// Whether the profiling is started or not. + enabled: AtomicBool, +} -/// Whether the profiling is started or not. -static ENABLED: AtomicBool = AtomicBool::new(false); +static FLAGS: Flags = Flags { + min_alloc_bytes_for_profiling: AtomicU64::new(DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING), + enabled: AtomicBool::new(false), +}; /// Starts measuring heap allocations and logs important leaks. /// /// This function uses a wrapper around the global Jemalloc allocator to -/// instrument it. Each time an allocation bigger than -/// min_alloc_bytes_for_profiling is performed, it is recorded in a map and -/// the statistics for its call site are updated. +/// instrument it. +/// +/// Each time an allocation bigger than min_alloc_bytes_for_profiling is +/// performed, it is recorded in a map and the statistics for its call site are +/// updated. Tracking allocations is costly because it requires acquiring a +/// global mutex. Setting a reasonable value for min_alloc_bytes_for_profiling +/// is crucial. For instance for a search aggregation request, tracking every +/// allocations (min_alloc_bytes_for_profiling=1) is typically 100x slower than +/// using a minimum of 64kB. /// /// During profiling, the statistics per call site are used to log when specific /// thresholds are exceeded. For each call site, the allocated memory is logged /// (with a backtrace) every time it exceeds the last logged allocated memory by -/// at least alloc_bytes_triggering_backtrace. +/// at least alloc_bytes_triggering_backtrace. This logging interval should +/// usually be set to a value of at least 500MB to limit the logging verbosity. 
pub fn start_profiling( min_alloc_bytes_for_profiling: Option, alloc_bytes_triggering_backtrace: Option, @@ -80,8 +95,10 @@ pub fn start_profiling( ); // Use strong ordering to make sure all threads see these changes in this order - MIN_ALLOC_BYTES_FOR_PROFILING.store(min_alloc_bytes_for_profiling, Ordering::SeqCst); - ENABLED.store(true, Ordering::SeqCst); + FLAGS + .min_alloc_bytes_for_profiling + .store(min_alloc_bytes_for_profiling, Ordering::SeqCst); + FLAGS.enabled.store(true, Ordering::SeqCst); } /// Stops measuring heap allocations. @@ -89,8 +106,10 @@ pub fn start_profiling( /// The allocation tracking tables and the symbol cache are not cleared. pub fn stop_profiling() { // Use strong ordering to make sure all threads see these changes in this order - let previously_enabled = ENABLED.swap(false, Ordering::SeqCst); - MIN_ALLOC_BYTES_FOR_PROFILING.store(DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING, Ordering::SeqCst); + let previously_enabled = FLAGS.enabled.swap(false, Ordering::SeqCst); + FLAGS + .min_alloc_bytes_for_profiling + .store(DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING, Ordering::SeqCst); info!(previously_enabled, "heap profiling stopped"); } @@ -108,7 +127,7 @@ unsafe impl GlobalAlloc for JemallocProfiled { #[inline] unsafe fn alloc(&self, layout: Layout) -> *mut u8 { let ptr = unsafe { self.0.alloc(layout) }; - if ENABLED.load(Ordering::Relaxed) { + if FLAGS.enabled.load(Ordering::Relaxed) { track_alloc_call(ptr, layout); } ptr @@ -117,7 +136,7 @@ unsafe impl GlobalAlloc for JemallocProfiled { #[inline] unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 { let ptr = unsafe { self.0.alloc_zeroed(layout) }; - if ENABLED.load(Ordering::Relaxed) { + if FLAGS.enabled.load(Ordering::Relaxed) { track_alloc_call(ptr, layout); } ptr @@ -125,7 +144,7 @@ unsafe impl GlobalAlloc for JemallocProfiled { #[inline] unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { - if ENABLED.load(Ordering::Relaxed) { + if FLAGS.enabled.load(Ordering::Relaxed) { track_dealloc_call(ptr, layout); } unsafe { self.0.dealloc(ptr, layout) } @@ -134,7 +153,7 @@ unsafe impl GlobalAlloc for JemallocProfiled { #[inline] unsafe fn realloc(&self, old_ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 { let new_ptr = unsafe { self.0.realloc(old_ptr, layout, new_size) }; - if ENABLED.load(Ordering::Relaxed) { + if FLAGS.enabled.load(Ordering::Relaxed) { track_realloc_call(old_ptr, new_ptr, layout, new_size); } new_ptr @@ -177,7 +196,7 @@ fn backtrace_hash() -> u64 { /// Warning: allocating inside this function can cause an error (abort, panic or even deadlock). 
#[cold] fn track_alloc_call(ptr: *mut u8, layout: Layout) { - if layout.size() >= MIN_ALLOC_BYTES_FOR_PROFILING.load(Ordering::Relaxed) as usize { + if layout.size() >= FLAGS.min_alloc_bytes_for_profiling.load(Ordering::Relaxed) as usize { let callsite_hash = backtrace_hash(); let recording_response = alloc_tracker::record_allocation(callsite_hash, layout.size() as u64, ptr); @@ -191,7 +210,7 @@ fn track_alloc_call(ptr: *mut u8, layout: Layout) { // this message might be displayed multiple times but that's fine // warning: stdout might allocate a buffer on first use error!("heap profiling stopped, {table_name} full"); - ENABLED.store(false, Ordering::Relaxed); + FLAGS.enabled.store(false, Ordering::Relaxed); } AllocRecordingResponse::ThresholdNotExceeded => {} AllocRecordingResponse::NotStarted => {} @@ -202,7 +221,7 @@ fn track_alloc_call(ptr: *mut u8, layout: Layout) { /// Warning: allocating inside this function can cause an error (abort, panic or even deadlock). #[cold] fn track_dealloc_call(ptr: *mut u8, layout: Layout) { - if layout.size() >= MIN_ALLOC_BYTES_FOR_PROFILING.load(Ordering::Relaxed) as usize { + if layout.size() >= FLAGS.min_alloc_bytes_for_profiling.load(Ordering::Relaxed) as usize { alloc_tracker::record_deallocation(ptr); } } From 7fa1b07ba37b67084eb7c4cd0839c515d5fd6a59 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Tue, 6 May 2025 11:34:41 +0200 Subject: [PATCH 5/5] Add padding --- quickwit/quickwit-common/src/jemalloc_profiled.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/quickwit/quickwit-common/src/jemalloc_profiled.rs b/quickwit/quickwit-common/src/jemalloc_profiled.rs index 371026d7881..495cdcf31cc 100644 --- a/quickwit/quickwit-common/src/jemalloc_profiled.rs +++ b/quickwit/quickwit-common/src/jemalloc_profiled.rs @@ -37,11 +37,14 @@ struct Flags { min_alloc_bytes_for_profiling: AtomicU64, /// Whether the profiling is started or not. enabled: AtomicBool, + /// Padding to make sure we fill the cache line. + _padding: [u8; 119], // 128 (align) - 8 (u64) - 1 (bool) } static FLAGS: Flags = Flags { min_alloc_bytes_for_profiling: AtomicU64::new(DEFAULT_MIN_ALLOC_BYTES_FOR_PROFILING), enabled: AtomicBool::new(false), + _padding: [0; 119], }; /// Starts measuring heap allocations and logs important leaks. @@ -235,3 +238,13 @@ fn track_realloc_call( ) { // TODO handle realloc } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_size_of_flags() { + assert_eq!(std::mem::size_of::(), 128); + } +}
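
A side note on the padding in the last two patches: since Flags is already #[repr(align(128))], its size is rounded up to 128 bytes even without the explicit _padding array, so the array mostly documents the intent (and the new test pins it down). An alternative, sketched here under the assumption of an extra crossbeam-utils dependency that this series does not add, is crossbeam's CachePadded wrapper, which pads and aligns each field to its own cache line (128 bytes on x86_64 and aarch64).

    // Hedged alternative sketch, assuming a crossbeam-utils dependency.
    use std::sync::atomic::{AtomicBool, AtomicU64};
    use crossbeam_utils::CachePadded;

    struct PaddedFlags {
        // each field lands on its own cache line instead of sharing one padded struct
        min_alloc_bytes_for_profiling: CachePadded<AtomicU64>,
        enabled: CachePadded<AtomicBool>,
    }

    static PADDED_FLAGS: PaddedFlags = PaddedFlags {
        min_alloc_bytes_for_profiling: CachePadded::new(AtomicU64::new(64 * 1024)),
        enabled: CachePadded::new(AtomicBool::new(false)),
    };

The trade-off is one extra dependency versus the manual padding arithmetic kept in the comment above.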