diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04891fad8..0016eb17f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -198,6 +198,7 @@ set(DD_PROFILING_SOURCES
     src/ddprof_cmdline.cc
     src/ddres_list.cc
     src/ipc.cc
+    src/lib/address_bitset.cc
     src/lib/allocation_tracker.cc
     src/lib/dd_profiling.cc
     src/lib/elfutils.cc
diff --git a/include/ddprof_perf_event.hpp b/include/ddprof_perf_event.hpp
index ceda11982..ecd4f8c73 100644
--- a/include/ddprof_perf_event.hpp
+++ b/include/ddprof_perf_event.hpp
@@ -8,6 +8,8 @@
 #include <linux/perf_event.h>
 #include <cstdint>
 
+#include "perf.hpp"
+
 // Extend the perf event types
 // There are <30 different perf events (starting at 1000 seems safe)
 enum : uint32_t {
diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp
new file mode 100644
index 000000000..c07c7ead1
--- /dev/null
+++ b/include/lib/address_bitset.hpp
@@ -0,0 +1,48 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0. This product includes software
+// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
+// Datadog, Inc.
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+namespace ddprof {
+class AddressBitset {
+public:
+  explicit AddressBitset() {}
+  ~AddressBitset();
+  AddressBitset(AddressBitset &other) = delete;
+  AddressBitset &operator=(AddressBitset &other) = delete;
+
+  // returns true if the element was inserted
+  bool add(uintptr_t addr);
+  // returns true if the element was removed
+  bool remove(uintptr_t addr);
+  void clear();
+  int count() const { return _nb_addresses; }
+
+private:
+  static constexpr unsigned _lower_bits_ignored = 4;
+  static constexpr int _k_default_max_mid_levels = 10;
+  // element type
+  using Word_t = uint64_t;
+  constexpr static unsigned _nb_bits_per_word = sizeof(Word_t) * 8;
+  static constexpr unsigned _k_nb_words = 4096 / 64;       // (64)
+  static constexpr unsigned _nb_entries_per_level = 65536; // 2^16
+
+  struct LeafLevel {
+    std::atomic<Word_t> leaf[_k_nb_words] = {};
+  };
+
+  struct MidLevel {
+    std::atomic<LeafLevel *> mid[_nb_entries_per_level] = {};
+  };
+
+  std::atomic<MidLevel *> _top_level[_nb_entries_per_level] = {};
+  std::atomic<int> _nb_addresses = 0;
+  std::atomic<int> _nb_mid_levels = 0;
+};
+} // namespace ddprof
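The new `AddressBitset` is a three-level radix structure: bits 32-47 of an address pick a `MidLevel` slot in `_top_level`, bits 16-31 pick a `LeafLevel` slot, and the remaining low bits (minus the 4 alignment bits that `_lower_bits_ignored` drops) pick one of the 4096 bits in a leaf. A minimal sketch of the decomposition, with a made-up 64-bit address (the constants mirror the header; the `main` is only for illustration):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  constexpr unsigned kLowerBitsIgnored = 4; // allocations assumed >= 16-byte aligned
  uintptr_t addr = 0x00007f8a12345670;      // hypothetical heap address (64-bit target)
  unsigned top_index = (addr >> 32) & 0xFFFF;  // -> _top_level slot (MidLevel *)
  unsigned mid_index = (addr >> 16) & 0xFFFF;  // -> MidLevel::mid slot (LeafLevel *)
  unsigned leaf_index = (addr & 0xFFFF) >> kLowerBitsIgnored; // 12-bit bit position
  unsigned word = leaf_index / 64; // which Word_t in LeafLevel::leaf
  unsigned bit = leaf_index % 64;  // which bit inside that word
  printf("top=%#x mid=%#x word=%u bit=%u\n", top_index, mid_index, word, bit);
}
```

Bits 48-63 are discarded, which is fine for canonical user-space addresses; two live allocations whose addresses agree on bits 4-47 would collide, which is why `add` reports collisions instead of guaranteeing insertion.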
diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp
index 4ce24a3fb..5b4a9eccd 100644
--- a/include/lib/allocation_tracker.hpp
+++ b/include/lib/allocation_tracker.hpp
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include "address_bitset.hpp"
 #include "allocation_tracker_tls.hpp"
 #include "ddprof_base.hpp"
 #include "ddres_def.hpp"
@@ -83,7 +84,8 @@ class AllocationTracker {
   uint64_t next_sample_interval(std::minstd_rand &gen);
 
   DDRes init(uint64_t mem_profile_interval, bool deterministic_sampling,
-             uint32_t stack_sample_size, const RingBufferInfo &ring_buffer);
+             bool track_deallocations, uint32_t stack_sample_size,
+             const RingBufferInfo &ring_buffer);
   void free();
 
   static AllocationTracker *create_instance();
@@ -113,7 +115,8 @@ class AllocationTracker {
   uint32_t _stack_sample_size;
   PEvent _pevent;
   bool _deterministic_sampling;
-  AdressSet _address_set;
+
+  std::unique_ptr<AddressBitset> _allocated_address_set;
 
   // These can not be tied to the internal state of the instance.
   // The creation of the instance depends on this
diff --git a/include/lib/lib_logger.hpp b/include/lib/lib_logger.hpp
index 49529406d..c8c83f1e6 100644
--- a/include/lib/lib_logger.hpp
+++ b/include/lib/lib_logger.hpp
@@ -10,13 +10,37 @@
 #include <mutex>
 
 namespace ddprof {
-template <typename... Args>
-void log_once(char const *const format, Args... args) {
-#ifndef DEBUG
-  static std::once_flag flag;
-  std::call_once(flag, [&, format]() { fprintf(stderr, format, args...); });
+
+#ifdef NDEBUG
+template <typename Func>
+void log_once_helper(std::once_flag &flag, Func &&func) {
+  std::call_once(flag, std::forward<Func>(func));
 #else
-  fprintf(stderr, format, args...);
+template <typename Func> void log_once_helper(std::once_flag &, Func &&func) {
+  func();
 #endif
 }
+
+template <typename Func>
+void log_always_once_helper(std::once_flag &flag, Func &&func) {
+  std::call_once(flag, std::forward<Func>(func));
+}
+
+#define LOG_ALWAYS_ONCE(format, ...)                                          \
+  do {                                                                        \
+    static std::once_flag UNIQUE_ONCE_FLAG_##__COUNTER__;                     \
+    ddprof::log_always_once_helper(UNIQUE_ONCE_FLAG_##__COUNTER__, [&]() {    \
+      fprintf(stderr, (format), ##__VA_ARGS__);                               \
+    });                                                                       \
+  } while (0)
+
+// create a once flag for the line and file where this is called:
+#define LOG_ONCE(format, ...)                                                 \
+  do {                                                                        \
+    static std::once_flag UNIQUE_ONCE_FLAG_##__COUNTER__;                     \
+    ddprof::log_once_helper(UNIQUE_ONCE_FLAG_##__COUNTER__, [&]() {           \
+      fprintf(stderr, (format), ##__VA_ARGS__);                               \
+    });                                                                       \
+  } while (0)
+
 } // namespace ddprof
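`LOG_ONCE` replaces the variadic `log_once`: each expansion declares its own `static std::once_flag` inside a `do { } while (0)` block, so a given call site logs at most once in release builds (`NDEBUG`) and on every call in debug builds. A sketch of what one expanded call site amounts to (`warn_bad_state` is a hypothetical caller; the helper is the one from the header):

```cpp
#include <cstdio>
#include <mutex>
#include <utility>

template <typename Func>
void log_once_helper(std::once_flag &flag, Func &&func) {
  std::call_once(flag, std::forward<Func>(func));
}

void warn_bad_state(int err) {
  // roughly what LOG_ONCE("bad state: %d\n", err) expands to in release:
  do {
    static std::once_flag once_flag; // one flag per call site
    log_once_helper(once_flag,
                    [&]() { fprintf(stderr, "bad state: %d\n", err); });
  } while (0);
}

int main() {
  warn_bad_state(1); // prints "bad state: 1"
  warn_bad_state(2); // silent: this call site already logged
}
```

Note that `UNIQUE_ONCE_FLAG_##__COUNTER__` pastes the literal token `__COUNTER__` (`##` suppresses expansion of its operands), so every expansion declares the same identifier; this still works because each `do/while` block is its own scope.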
+#include "address_bitset.hpp" + +#include +#include +#include +#include +#include +#include + +namespace ddprof { + +bool AddressBitset::add(uintptr_t addr) { + // Extract top 16 bits for top-level index + unsigned top_index = (addr >> 32) & 0xFFFF; + + // If the entry at this index is null, allocate and try to atomically set it + MidLevel *expected_mid = + _top_level[top_index].load(std::memory_order_relaxed); + if (!expected_mid) { + if (_nb_mid_levels >= _k_default_max_mid_levels) { + // Every new level adds half a meg of overhead + LOG_ALWAYS_ONCE( + " ddprof: Address bitset reached maximum number of mid levels\n"); + return false; + } + expected_mid = new MidLevel(); + MidLevel *old_mid = nullptr; + if (!_top_level[top_index].compare_exchange_strong(old_mid, expected_mid)) { + delete expected_mid; + expected_mid = old_mid; + } else { + ++_nb_mid_levels; + } + assert(expected_mid); + if (!expected_mid) { + // something went wrong + return false; + } + } + + // Extract middle 16 bits for mid-level index + unsigned mid_index = (addr >> 16) & 0xFFFF; + + // If the entry at this mid-level index is null, allocate and try to + // atomically set it + LeafLevel *expected_leaf = + expected_mid->mid[mid_index].load(std::memory_order_acquire); + if (!expected_leaf) { + expected_leaf = new LeafLevel(); + LeafLevel *old_leaf = nullptr; + if (!expected_mid->mid[mid_index].compare_exchange_strong(old_leaf, + expected_leaf)) { + delete expected_leaf; + expected_leaf = old_leaf; + } + assert(expected_leaf); + if (!expected_leaf) { + // something went wrong + return false; + } + } + + // Extract lower 16 bits and ignore lower bits (12 remain) + unsigned leaf_index = (addr & 0xFFFF) >> _lower_bits_ignored; + unsigned index_array = leaf_index / _nb_bits_per_word; + assert(index_array < _k_nb_words); + unsigned bit_offset = leaf_index % _nb_bits_per_word; + Word_t bit_in_element = (1UL << bit_offset); + + if (!(expected_leaf->leaf[index_array].fetch_or(bit_in_element) & + bit_in_element)) { + ++_nb_addresses; + return true; + } + return false; // Collision +} + +bool AddressBitset::remove(uintptr_t addr) { + // Extract top 16 bits for top-level index + unsigned top_index = (addr >> 32) & 0xFFFF; + + // Try to get the mid-level pointer. If it's null, return false. + MidLevel *mid = _top_level[top_index].load(std::memory_order_acquire); + if (unlikely(!mid)) { + return false; + } + + // Extract middle 16 bits for mid-level index + unsigned mid_index = (addr >> 16) & 0xFFFF; + + // Try to get the leaf-level pointer from the mid-level. If it's null, return + // false. 
diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc
index d095445d0..53fcf2228 100644
--- a/src/lib/allocation_tracker.cc
+++ b/src/lib/allocation_tracker.cc
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-
 #include
 
 namespace ddprof {
@@ -68,7 +67,7 @@ TrackerThreadLocalState *AllocationTracker::init_tl_state() {
 
   if (res_set) {
     // should return 0
-    log_once("Error: Unable to store tl_state. error %d \n", res_set);
+    LOG_ONCE("Error: Unable to store tl_state. error %d \n", res_set);
     delete tl_state;
     tl_state = nullptr;
   }
@@ -119,9 +118,9 @@ DDRes AllocationTracker::allocation_tracking_init(
   void *volatile p = ::malloc(1);
   ::free(p);
 
-  DDRES_CHECK_FWD(instance->init(allocation_profiling_rate,
-                                 flags & kDeterministicSampling,
-                                 stack_sample_size, ring_buffer));
+  DDRES_CHECK_FWD(instance->init(
+      allocation_profiling_rate, flags & kDeterministicSampling,
+      flags & kTrackDeallocations, stack_sample_size, ring_buffer));
   _instance = instance;
 
   state.init(true, flags & kTrackDeallocations);
@@ -131,6 +130,7 @@
 
 DDRes AllocationTracker::init(uint64_t mem_profile_interval,
                               bool deterministic_sampling,
+                              bool track_deallocations,
                               uint32_t stack_sample_size,
                               const RingBufferInfo &ring_buffer) {
   _sampling_interval = mem_profile_interval;
@@ -140,6 +140,10 @@ DDRes AllocationTracker::init(uint64_t mem_profile_interval,
       static_cast<int>(RingBufferType::kMPSCRingBuffer)) {
     return ddres_error(DD_WHAT_PERFRB);
   }
+  if (track_deallocations) {
+    // 16 times oversized, as we want the probability of collision to be low
+    // enough
+    _allocated_address_set = std::make_unique<AddressBitset>();
+  }
   return ddprof::ring_buffer_attach(ring_buffer, &_pevent);
 }
@@ -165,7 +169,7 @@ void AllocationTracker::allocation_tracking_free() {
   }
   TrackerThreadLocalState *tl_state = get_tl_state();
   if (unlikely(!tl_state)) {
-    log_once("Error: Unable to find tl_state during %s\n", __FUNCTION__);
+    LOG_ONCE("Error: Unable to find tl_state during %s\n", __FUNCTION__);
     instance->free();
     return;
   }
@@ -229,26 +233,20 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size,
   tl_state.remaining_bytes = remaining_bytes;
   uint64_t total_size = nsamples * sampling_interval;
 
+  if (_state.track_deallocations) {
+    if (_allocated_address_set->count() <
+            ddprof::liveallocation::kMaxTracked &&
+        _allocated_address_set->add(addr)) {
+      // the address is now tracked for live heap profiling
+    } else {
+      // null the address to avoid using it for live heap profiling;
+      // pushing a sample is still good to keep a representative view
+      // of the allocations.
+      addr = 0;
+    }
+  }
   bool success = IsDDResOK(push_alloc_sample(addr, total_size, tl_state));
   free_on_consecutive_failures(success);
-
-  if (success && _state.track_deallocations) {
-    // \fixme{r1viollet} adjust set to be lock free
-    // We should still lock the clear of live allocations (which is unlikely)
-    std::lock_guard lock{_state.mutex};
-
-    // ensure we track this dealloc if it occurs
-    _address_set.insert(addr);
-    if (unlikely(_address_set.size() > ddprof::liveallocation::kMaxTracked)) {
-      if (IsDDResOK(push_clear_live_allocation(tl_state))) {
-        _address_set.clear();
-      } else {
-        log_once("Error: %s",
-                 "Stop allocation profiling. Unable to clear live allocation \n");
-        free();
-      }
-    }
+  if (unlikely(!success) && _state.track_deallocations && addr) {
+    _allocated_address_set->remove(addr);
   }
 }
@@ -262,11 +260,8 @@ void AllocationTracker::track_deallocation(uintptr_t addr,
     return;
   }
 
-  { // Grab the lock to check if this allocation was stored in the set
-    std::lock_guard lock{_state.mutex};
-    if (!_state.track_deallocations || !_address_set.erase(addr)) {
-      return;
-    }
+  if (!_state.track_deallocations || !_allocated_address_set->remove(addr)) {
+    return;
   }
 
   bool success = IsDDResOK(push_dealloc_sample(addr, tl_state));
@@ -494,7 +489,7 @@ void AllocationTracker::notify_thread_start() {
   if (unlikely(!tl_state)) {
     tl_state = init_tl_state();
     if (!tl_state) {
-      log_once("Error: Unable to start allocation profiling on thread %d",
+      LOG_ONCE("Error: Unable to start allocation profiling on thread %d",
               ddprof::gettid());
      return;
    }
@@ -514,7 +509,7 @@ void AllocationTracker::notify_fork() {
   if (unlikely(!tl_state)) {
     // The state should already exist if we forked.
     // This would mean that we were not able to create the state before forking
-    log_once("Error: Unable to retrieve tl state after fork thread %d",
+    LOG_ONCE("Error: Unable to retrieve tl state after fork thread %d",
              ddprof::gettid());
     return;
   } else {
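`track_allocation` and `track_deallocation` rely on `add()`/`remove()` reporting whether the bit actually changed, and that exactness comes from `fetch_or`/`fetch_and` returning the word's previous value. A compact, standalone illustration of the idiom (not ddprof code):

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
  std::atomic<uint64_t> word{0};
  const uint64_t bit = uint64_t{1} << 7;

  // fetch_or returns the value *before* the or: if the bit was clear,
  // this thread is the unique inserter.
  bool inserted = !(word.fetch_or(bit) & bit);
  assert(inserted);

  // fetch_and returns the value *before* the and: if the bit was set,
  // this thread is the unique remover.
  bool removed = (word.fetch_and(~bit) & bit) != 0;
  assert(removed);

  // a second remove sees the bit already clear and reports false,
  // mirroring the triple_remove unit test below.
  assert((word.fetch_and(~bit) & bit) == 0);
}
```

This is what lets `track_allocation` safely undo an insert with `remove(addr)` when pushing the sample fails, without any mutex around the set.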
diff --git a/src/lib/dd_profiling.cc b/src/lib/dd_profiling.cc
index b66544621..880c276d3 100644
--- a/src/lib/dd_profiling.cc
+++ b/src/lib/dd_profiling.cc
@@ -277,8 +277,7 @@ int ddprof_start_profiling_internal() {
       //  fails ?
       g_state.allocation_profiling_started = true;
     } else {
-      ddprof::log_once("Error: %s",
-                       "Failure to start allocation profiling\n");
+      LOG_ONCE("Error: %s", "Failure to start allocation profiling\n");
     }
   }
 } catch (const ddprof::DDException &e) { return -1; }
@@ -286,7 +285,7 @@
   if (g_state.allocation_profiling_started) {
     int res = pthread_atfork(nullptr, nullptr, notify_fork);
     if (res) {
-      ddprof::log_once("Error:%s", "Unable to setup notify fork");
+      LOG_ONCE("Error:%s", "Unable to setup notify fork");
       assert(0);
     }
   }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a1aad2f94..1ffc85d6c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -255,6 +255,7 @@ add_unit_test(
   allocation_tracker-ut
   allocation_tracker-ut.cc
   ../src/lib/allocation_tracker.cc
+  ../src/lib/address_bitset.cc
   ../src/base_frame_symbol_lookup.cc
   ../src/build_id.cc
   ../src/container_id.cc
@@ -296,7 +297,7 @@ add_unit_test(
   ../src/unwind_helpers.cc
   ../src/unwind_metrics.cc
   LIBRARIES ${ELFUTILS_LIBRARIES} llvm-demangle DDProf::Parser
-  DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=100)
+  DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=16384)
 
 add_unit_test(sys_utils-ut sys_utils-ut.cc ../src/sys_utils.cc)
 
@@ -352,6 +353,10 @@
 
 add_unit_test(pthread_tls-ut pthread_tls-ut.cc)
 
+add_unit_test(address_bitset-ut address_bitset-ut.cc ../src/lib/address_bitset.cc)
+
+add_unit_test(lib_logger-ut ./lib_logger-ut.cc)
+
 add_benchmark(savecontext-bench savecontext-bench.cc ../src/lib/pthread_fixes.cc
               ../src/lib/savecontext.cc ../src/lib/saveregisters.cc)
 
@@ -360,6 +365,7 @@
 add_benchmark(timer-bench timer-bench.cc ../src/timer.cc ../src/perf.cc)
 
 add_benchmark(
   allocation_tracker-bench
   allocation_tracker-bench.cc
+  ../src/lib/address_bitset.cc
   ../src/lib/allocation_tracker.cc
   ../src/pevent_lib.cc
   ../src/perf.cc
@@ -373,7 +379,7 @@
   ../src/user_override.cc
   ../src/sys_utils.cc
   LIBRARIES ${ELFUTILS_LIBRARIES} llvm-demangle DDProf::Parser
-  DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=10000)
+  DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=16384)
 
 if(NOT CMAKE_BUILD_TYPE STREQUAL "SanitizedDebug")
   add_exe(
diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc
new file mode 100644
index 000000000..aa7259a99
--- /dev/null
+++ b/test/address_bitset-ut.cc
@@ -0,0 +1,81 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0. This product includes software
+// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
+// Datadog, Inc.
+#include <gtest/gtest.h>
+
+#include "address_bitset.hpp"
+#include <cstdint>
+#include <limits>
+#include <random>
+#include <set>
+
+namespace ddprof {
+
+TEST(address_bitset, simple) {
+  AddressBitset address_bitset{};
+  EXPECT_TRUE(address_bitset.add(0xbadbeef));
+  EXPECT_FALSE(address_bitset.add(0xbadbeef));
+  EXPECT_TRUE(address_bitset.remove(0xbadbeef));
+}
+
+TEST(address_bitset, triple_remove) {
+  AddressBitset address_bitset{};
+  EXPECT_TRUE(address_bitset.add(0xbadbeef));
+  EXPECT_TRUE(address_bitset.remove(0xbadbeef));
+  EXPECT_FALSE(address_bitset.remove(0xbadbeef));
+  EXPECT_FALSE(address_bitset.remove(0xbadbeef));
+}
+
+TEST(address_bitset, many_addresses) {
+  AddressBitset address_bitset{};
+  std::random_device rd;
+  std::mt19937 gen(rd());
+// #define WORST_CASE
+#ifdef WORST_CASE
+  // if addresses are too spread out, we just cannot
+  // keep track of everything
+  std::uniform_int_distribution<uintptr_t> dis(
+      0, std::numeric_limits<uintptr_t>::max());
+#else
+  // with more reasonable ranges, we are OK
+  std::uniform_int_distribution<uintptr_t> dis(0x1000, 0x1000000);
+#endif
+  std::set<uintptr_t> addresses;
+  unsigned nb_elements = 100000;
+  for (unsigned i = 0; i < nb_elements; ++i) {
+    // avoid allocations that are too close to each other
+    uintptr_t addr = (dis(gen) << 4);
+    auto res = addresses.insert(addr);
+    if (res.second) {
+      EXPECT_TRUE(address_bitset.add(addr));
+    }
+  }
+  for (auto addr : addresses) {
+    EXPECT_TRUE(address_bitset.remove(addr));
+  }
+  EXPECT_EQ(0, address_bitset.count());
+  for (auto addr : addresses) {
+    EXPECT_FALSE(address_bitset.remove(addr));
+  }
+}
+
+// the aim of this test is just to not blow up
+TEST(address_bitset, sparse_addressing) {
+  AddressBitset address_bitset{};
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  // if addresses are too spread out, we just cannot
+  // keep track of everything
+  std::uniform_int_distribution<uintptr_t> dis(
+      0, std::numeric_limits<uintptr_t>::max());
+  unsigned nb_elements = 300000;
+  for (unsigned i = 0; i < nb_elements; ++i) {
+    // avoid allocations that are too close to each other
+    uintptr_t addr = (dis(gen) << 4);
+    address_bitset.add(addr);
+  }
+}
+
+} // namespace ddprof
diff --git a/test/allocation_tracker-bench.cc b/test/allocation_tracker-bench.cc
index 5fa708737..f98477695 100644
--- a/test/allocation_tracker-bench.cc
+++ b/test/allocation_tracker-bench.cc
@@ -8,31 +8,58 @@
 #include <benchmark/benchmark.h>
 
 #include "allocation_tracker.hpp"
+#include "ddprof_perf_event.hpp"
 #include "loghandle.hpp"
 #include "ringbuffer_holder.hpp"
 
 // Global bench settings
 // Activate live heap tracking
-// #define LIVE_HEAP
+#define LIVE_HEAP
 // Sampling rate: default rate is 524288
-static constexpr uint64_t k_rate = 20000;
+static constexpr uint64_t k_rate = 200000;
+// The reader thread is interesting, though it can start dominating the CPU;
+// the benchmark focuses on the capture of allocation events.
 #define READER_THREAD
 
 std::atomic<bool> reader_continue{true};
+std::atomic<bool> error_in_reader{false};
 
 // Reader worker thread function
 void read_buffer(ddprof::RingBufferHolder &holder) {
-  int nb_samples = 0;
+  int nb_alloc_samples = 0;
+  int nb_dealloc_samples = 0;
+  int nb_unknown_samples = 0;
+
+  error_in_reader = false;
   while (reader_continue) {
     ddprof::MPSCRingBufferReader reader(holder.get_ring_buffer());
+    // fprintf(stderr, "size = %lu! \n", reader.available_size());
     auto buf = reader.read_sample();
     if (!buf.empty()) {
-      ++nb_samples;
-      // fprintf(stderr, "Yep, got sample ! \n");
+      const perf_event_header *hdr =
+          reinterpret_cast<const perf_event_header *>(buf.data());
+      if (hdr->type == PERF_RECORD_SAMPLE) {
+        ++nb_alloc_samples;
+      } else if (hdr->type == PERF_CUSTOM_EVENT_DEALLOCATION) {
+        ++nb_dealloc_samples;
+      } else {
+        ++nb_unknown_samples;
+      }
     }
     std::chrono::microseconds(10000);
   }
-  fprintf(stderr, "Reader thread exit, nb_samples=%d\n", nb_samples);
+  fprintf(stderr,
+          "Reader thread exit,"
+          "nb_alloc_samples=%d,"
+          "nb_dealloc_samples=%d,"
+          "nb_unknown_samples=%d\n",
+          nb_alloc_samples, nb_dealloc_samples, nb_unknown_samples);
+  if (nb_alloc_samples == 0) {
+    error_in_reader = true;
+  }
 }
 
@@ -40,8 +67,8 @@ DDPROF_NOINLINE void my_malloc(size_t size, uintptr_t addr = 0xdeadbeef) {
   ddprof::TrackerThreadLocalState *tl_state =
       ddprof::AllocationTracker::get_tl_state();
   if (tl_state) { // tl_state is null if we are not tracking allocations
     ddprof::AllocationTracker::track_allocation_s(addr, size, *tl_state);
-    DDPROF_BLOCK_TAIL_CALL_OPTIMIZATION();
   }
+  DDPROF_BLOCK_TAIL_CALL_OPTIMIZATION();
 }
 
 DDPROF_NOINLINE void my_free(uintptr_t addr) {
@@ -62,28 +89,26 @@
       ddprof::AllocationTracker::kTrackDeallocations;
 #endif
 
-  ddprof::RingBufferHolder ring_buffer{buf_size_order,
-                                       RingBufferType::kMPSCRingBuffer};
-
-  if (track_allocations) {
-    ddprof::AllocationTracker::allocation_tracking_init(
-        rate, flags, k_default_perf_stack_sample_size,
-        ring_buffer.get_buffer_info());
-  }
-
   int nb_threads = 4;
   std::vector<std::thread> threads;
   int num_allocations = 1000;
   size_t page_size = 0x1000;
   std::random_device rd;
   std::mt19937 gen(rd());
-
+  ddprof::RingBufferHolder ring_buffer{buf_size_order,
+                                       RingBufferType::kMPSCRingBuffer};
 #ifdef READER_THREAD
   // create reader worker thread
   reader_continue = true;
   std::thread reader_thread{read_buffer, std::ref(ring_buffer)};
 #endif
 
+  if (track_allocations) {
+    ddprof::AllocationTracker::allocation_tracking_init(
+        rate, flags, k_default_perf_stack_sample_size,
+        ring_buffer.get_buffer_info());
+  }
+
   for (auto _ : state) {
     // Initialize threads and clear addresses
     threads.clear();
@@ -91,11 +116,14 @@
 
     for (int i = 0; i < nb_threads; ++i) {
       threads.emplace_back([&, i] {
-        std::uniform_int_distribution<> dis(i * page_size,
-                                            (i + 1) * page_size - 1);
+        // in theory thread creation is hooked automatically,
+        // though in the benchmark we cannot rely on this.
+        ddprof::AllocationTracker::init_tl_state();
+        std::uniform_int_distribution<uintptr_t> dis(i * page_size,
+                                                     (i + 1) * page_size - 1);
 
         for (int j = 0; j < num_allocations; ++j) {
-          uintptr_t addr = dis(gen);
+          uintptr_t addr = dis(gen) << 4;
           my_malloc(1024, addr);
           thread_addresses[i].push_back(addr);
         }
@@ -109,6 +137,7 @@
     threads.clear();
     for (int i = 0; i < nb_threads; ++i) {
       threads.emplace_back([&, i] {
+        ddprof::AllocationTracker::init_tl_state();
         for (auto addr : thread_addresses[i]) {
           my_free(addr);
         }
@@ -119,11 +148,20 @@
       t.join();
     }
   }
+
+  // wait for the reader thread to receive all samples
+  std::this_thread::sleep_for(std::chrono::microseconds(5000));
+
 #ifdef READER_THREAD
   reader_continue = false;
   reader_thread.join();
 #endif
-  ddprof::AllocationTracker::allocation_tracking_free();
+  if (track_allocations) {
+    ddprof::AllocationTracker::allocation_tracking_free();
+    if (error_in_reader) {
+      exit(1);
+    }
+  }
 }
 
 // Benchmark without allocation tracking
@@ -143,6 +181,7 @@
 class WorkerThread {
 
   WorkerThread() : stop(false), perform_task(false) {
     worker_thread = std::thread([this] {
+      ddprof::AllocationTracker::init_tl_state();
       while (!stop) {
         std::unique_lock lock(mutex);
         cv.wait(lock, [this] { return perform_task || stop; });
@@ -225,9 +264,10 @@
   std::mt19937 gen(rd());
 
   for (int i = 0; i < nb_threads; ++i) {
-    std::uniform_int_distribution<> dis(i * page_size, (i + 1) * page_size - 1);
+    std::uniform_int_distribution<uintptr_t> dis(i * page_size,
+                                                 (i + 1) * page_size - 1);
     for (int j = 0; j < num_allocations; ++j) {
-      uintptr_t addr = dis(gen);
+      uintptr_t addr = dis(gen) << 4;
       thread_addresses[i].push_back(addr);
     }
   }
@@ -248,6 +288,9 @@
     std::this_thread::sleep_for(std::chrono::microseconds(100));
   }
 
+  // wait for the reader thread to receive all samples
+  std::this_thread::sleep_for(std::chrono::microseconds(1000));
+
 #ifdef READER_THREAD
   reader_continue = false;
   reader_thread.join();
diff --git a/test/allocation_tracker-ut.cc b/test/allocation_tracker-ut.cc
index 6bca2f83e..fc1282e9f 100644
--- a/test/allocation_tracker-ut.cc
+++ b/test/allocation_tracker-ut.cc
@@ -149,44 +149,35 @@ TEST(allocation_tracker, max_tracked_allocs) {
       k_default_perf_stack_sample_size, ring_buffer.get_buffer_info());
 
   ASSERT_TRUE(ddprof::AllocationTracker::is_active());
-
-  for (int i = 0; i <= ddprof::liveallocation::kMaxTracked + 1; ++i) {
-    my_malloc(1, 0x1000 + i);
+  bool max_reached = false;
+  uint64_t nb_samples = 0;
+  for (int i = 0; i <= ddprof::liveallocation::kMaxTracked +
+                           ddprof::liveallocation::kMaxTracked / 10;
+       ++i) {
+    uintptr_t addr = 0x1000 + (i * 16);
+    my_malloc(1, addr);
     ddprof::MPSCRingBufferReader reader{ring_buffer.get_ring_buffer()};
-    if (i <= ddprof::liveallocation::kMaxTracked) { // check that we get the
-                                                    // relevant info for this
-                                                    // allocation
-      ASSERT_GT(reader.available_size(), 0);
+    while (reader.available_size() > 0) {
       auto buf = reader.read_sample();
       ASSERT_FALSE(buf.empty());
       const perf_event_header *hdr =
           reinterpret_cast<const perf_event_header *>(buf.data());
-      ASSERT_EQ(hdr->type, PERF_RECORD_SAMPLE);
-
-      perf_event_sample *sample =
-          hdr2samp(hdr, perf_event_default_sample_type() | PERF_SAMPLE_ADDR);
-
-      ASSERT_EQ(sample->period, 1);
-      ASSERT_EQ(sample->pid, getpid());
-      ASSERT_EQ(sample->tid, ddprof::gettid());
-      ASSERT_EQ(sample->addr, 0x1000 + i);
-    } else {
-      bool clear_found = false;
-      int nb_read = 0;
-      ddprof::ConstBuffer buf;
-      do {
-        buf = reader.read_sample();
-        ++nb_read;
-        if (buf.empty())
-          break;
-        const perf_event_header *hdr =
-            reinterpret_cast<const perf_event_header *>(buf.data());
-        if (hdr->type == PERF_CUSTOM_EVENT_CLEAR_LIVE_ALLOCATION) {
-          clear_found = true;
+      if (hdr->type == PERF_RECORD_SAMPLE) {
+        ++nb_samples;
+        perf_event_sample *sample =
+            hdr2samp(hdr, perf_event_default_sample_type() | PERF_SAMPLE_ADDR);
+        ASSERT_EQ(sample->period, 1);
+        ASSERT_EQ(sample->pid, getpid());
+        ASSERT_EQ(sample->tid, ddprof::gettid());
+        if (sample->addr == 0) {
+          max_reached = true;
+        } else {
+          EXPECT_EQ(sample->addr, addr);
         }
-      } while (true);
-      EXPECT_EQ(nb_read, 3);
-      EXPECT_TRUE(clear_found);
+      }
     }
   }
+  fprintf(stderr, "Number of found samples %lu (vs max = %d) \n", nb_samples,
+          ddprof::liveallocation::kMaxTracked);
+  EXPECT_TRUE(max_reached);
 }
diff --git a/test/lib_logger-ut.cc b/test/lib_logger-ut.cc
new file mode 100644
index 000000000..1fee22033
--- /dev/null
+++ b/test/lib_logger-ut.cc
@@ -0,0 +1,30 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0. This product includes software
+// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
+// Datadog, Inc.
+
+#include <gtest/gtest.h>
+
+#include "lib/lib_logger.hpp"
+
+namespace ddprof {
+
+void foo_log(const std::string &str) { LOG_ONCE("%s", str.c_str()); }
+
+TEST(LibLogger, simple) {
+  testing::internal::CaptureStderr();
+  LOG_ONCE("something ");
+  LOG_ONCE("else ");
+  foo_log("one "); // this shows
+  foo_log("two "); // this does not show in release (same call site as "one")
+  const char *some_string = "some_string";
+  LOG_ONCE("three %s\n", some_string);
+  std::string output = testing::internal::GetCapturedStderr();
+  fprintf(stderr, "%s", output.c_str());
+#ifdef NDEBUG
+  EXPECT_EQ(output, "something else one three some_string\n");
+#else
+  EXPECT_EQ(output, "something else one two three some_string\n");
+#endif
+}
+} // namespace ddprof
diff --git a/test/reentry_guard-ut.cc b/test/reentry_guard-ut.cc
index 4b733581e..1c0f6f0aa 100644
--- a/test/reentry_guard-ut.cc
+++ b/test/reentry_guard-ut.cc
@@ -26,8 +26,8 @@ TEST(ReentryGuardTest, null_init) {
   guard.register_guard(&reentry_guard);
   EXPECT_TRUE(guard);
   {
-    ddprof::ReentryGuard guard(&reentry_guard);
-    EXPECT_FALSE(guard);
+    ddprof::ReentryGuard guard2(&reentry_guard);
+    EXPECT_FALSE(guard2);
   }
 }
 }