diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04891fad8..0016eb17f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -198,6 +198,7 @@ set(DD_PROFILING_SOURCES
     src/ddprof_cmdline.cc
     src/ddres_list.cc
     src/ipc.cc
+    src/lib/address_bitset.cc
     src/lib/allocation_tracker.cc
     src/lib/dd_profiling.cc
     src/lib/elfutils.cc
diff --git a/include/ddprof_perf_event.hpp b/include/ddprof_perf_event.hpp
index ceda11982..ecd4f8c73 100644
--- a/include/ddprof_perf_event.hpp
+++ b/include/ddprof_perf_event.hpp
@@ -8,6 +8,8 @@
 #include <linux/perf_event.h>
 #include <cstdint>
 
+#include "perf.hpp"
+
 // Extend the perf event types
 // There are <30 different perf events (starting at 1000 seems safe)
 enum : uint32_t {
diff --git a/include/lib/address_bitset.hpp b/include/lib/address_bitset.hpp
new file mode 100644
index 000000000..c07c7ead1
--- /dev/null
+++ b/include/lib/address_bitset.hpp
@@ -0,0 +1,48 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0. This product includes software
+// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
+// Datadog, Inc.
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+namespace ddprof {
+class AddressBitset {
+public:
+  explicit AddressBitset() {}
+  ~AddressBitset();
+  AddressBitset(AddressBitset &other) = delete;
+  AddressBitset &operator=(AddressBitset &other) = delete;
+
+  // returns true if the element was inserted
+  bool add(uintptr_t addr);
+  // returns true if the element was removed
+  bool remove(uintptr_t addr);
+  void clear();
+  int count() const { return _nb_addresses; }
+
+private:
+  static constexpr unsigned _lower_bits_ignored = 4;
+  static constexpr int _k_default_max_mid_levels = 10;
+  // element type
+  using Word_t = uint64_t;
+  constexpr static unsigned _nb_bits_per_word = sizeof(Word_t) * 8;
+  static constexpr unsigned _k_nb_words = 4096 / 64;       // (64)
+  static constexpr unsigned _nb_entries_per_level = 65536; // 2^16
+
+  struct LeafLevel {
+    std::atomic<Word_t> leaf[_k_nb_words] = {};
+  };
+
+  struct MidLevel {
+    std::atomic<LeafLevel *> mid[_nb_entries_per_level] = {};
+  };
+
+  std::atomic<MidLevel *> _top_level[_nb_entries_per_level] = {};
+  std::atomic<int> _nb_addresses = 0;
+  std::atomic<int> _nb_mid_levels = 0;
+};
+} // namespace ddprof
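The new `AddressBitset` is a three-level radix structure: bits 32-47 of an address pick a `MidLevel` slot in `_top_level`, bits 16-31 pick a `LeafLevel` slot, and the remaining low bits (minus the 4 alignment bits that `_lower_bits_ignored` drops) pick one of the 4096 bits in a leaf. A minimal sketch of the decomposition, with a made-up 64-bit address (the constants mirror the header; the `main` is only for illustration):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  constexpr unsigned kLowerBitsIgnored = 4; // allocations assumed >= 16-byte aligned
  uintptr_t addr = 0x00007f8a12345670;      // hypothetical heap address (64-bit target)
  unsigned top_index = (addr >> 32) & 0xFFFF;  // -> _top_level slot (MidLevel *)
  unsigned mid_index = (addr >> 16) & 0xFFFF;  // -> MidLevel::mid slot (LeafLevel *)
  unsigned leaf_index = (addr & 0xFFFF) >> kLowerBitsIgnored; // 12-bit bit position
  unsigned word = leaf_index / 64; // which Word_t in LeafLevel::leaf
  unsigned bit = leaf_index % 64;  // which bit inside that word
  printf("top=%#x mid=%#x word=%u bit=%u\n", top_index, mid_index, word, bit);
}
```

Bits 48-63 are discarded, which is fine for canonical user-space addresses; two live allocations whose addresses agree on bits 4-47 would collide, which is why `add` reports collisions instead of guaranteeing insertion.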
diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp
index 4ce24a3fb..5b4a9eccd 100644
--- a/include/lib/allocation_tracker.hpp
+++ b/include/lib/allocation_tracker.hpp
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include "address_bitset.hpp"
 #include "allocation_tracker_tls.hpp"
 #include "ddprof_base.hpp"
 #include "ddres_def.hpp"
@@ -83,7 +84,8 @@ class AllocationTracker {
   uint64_t next_sample_interval(std::minstd_rand &gen);
 
   DDRes init(uint64_t mem_profile_interval, bool deterministic_sampling,
-             uint32_t stack_sample_size, const RingBufferInfo &ring_buffer);
+             bool track_deallocations, uint32_t stack_sample_size,
+             const RingBufferInfo &ring_buffer);
   void free();
 
   static AllocationTracker *create_instance();
@@ -113,7 +115,8 @@ class AllocationTracker {
   uint32_t _stack_sample_size;
   PEvent _pevent;
   bool _deterministic_sampling;
-  AdressSet _address_set;
+
+  std::unique_ptr<AddressBitset> _allocated_address_set;
 
   // These can not be tied to the internal state of the instance.
   // The creation of the instance depends on this
diff --git a/include/lib/lib_logger.hpp b/include/lib/lib_logger.hpp
index 49529406d..c8c83f1e6 100644
--- a/include/lib/lib_logger.hpp
+++ b/include/lib/lib_logger.hpp
@@ -10,13 +10,37 @@
 #include <mutex>
 
 namespace ddprof {
-template <typename... Args>
-void log_once(char const *const format, Args... args) {
-#ifndef DEBUG
-  static std::once_flag flag;
-  std::call_once(flag, [&, format]() { fprintf(stderr, format, args...); });
+
+#ifdef NDEBUG
+template <typename Func>
+void log_once_helper(std::once_flag &flag, Func &&func) {
+  std::call_once(flag, std::forward<Func>(func));
 #else
-  fprintf(stderr, format, args...);
+template <typename Func> void log_once_helper(std::once_flag &, Func &&func) {
+  func();
 #endif
 }
+
+template <typename Func>
+void log_always_once_helper(std::once_flag &flag, Func &&func) {
+  std::call_once(flag, std::forward<Func>(func));
+}
+
+#define LOG_ALWAYS_ONCE(format, ...)                                          \
+  do {                                                                        \
+    static std::once_flag UNIQUE_ONCE_FLAG_##__COUNTER__;                     \
+    ddprof::log_always_once_helper(UNIQUE_ONCE_FLAG_##__COUNTER__, [&]() {    \
+      fprintf(stderr, (format), ##__VA_ARGS__);                               \
+    });                                                                       \
+  } while (0)
+
+// create a once flag for the line and file where this is called:
+#define LOG_ONCE(format, ...)                                                 \
+  do {                                                                        \
+    static std::once_flag UNIQUE_ONCE_FLAG_##__COUNTER__;                     \
+    ddprof::log_once_helper(UNIQUE_ONCE_FLAG_##__COUNTER__, [&]() {           \
+      fprintf(stderr, (format), ##__VA_ARGS__);                               \
+    });                                                                       \
+  } while (0)
+
 } // namespace ddprof
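`LOG_ONCE` replaces the variadic `log_once`: each expansion declares its own `static std::once_flag` inside a `do { } while (0)` block, so a given call site logs at most once in release builds (`NDEBUG`) and on every call in debug builds. A sketch of what one expanded call site amounts to (`warn_bad_state` is a hypothetical caller; the helper is the one from the header):

```cpp
#include <cstdio>
#include <mutex>
#include <utility>

template <typename Func>
void log_once_helper(std::once_flag &flag, Func &&func) {
  std::call_once(flag, std::forward<Func>(func));
}

void warn_bad_state(int err) {
  // roughly what LOG_ONCE("bad state: %d\n", err) expands to in release:
  do {
    static std::once_flag once_flag; // one flag per call site
    log_once_helper(once_flag,
                    [&]() { fprintf(stderr, "bad state: %d\n", err); });
  } while (0);
}

int main() {
  warn_bad_state(1); // prints "bad state: 1"
  warn_bad_state(2); // silent: this call site already logged
}
```

Note that `UNIQUE_ONCE_FLAG_##__COUNTER__` pastes the literal token `__COUNTER__` (`##` suppresses expansion of its operands), so every expansion declares the same identifier; this still works because each `do/while` block is its own scope.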
+#include "address_bitset.hpp" + +#include +#include +#include +#include +#include +#include + +namespace ddprof { + +bool AddressBitset::add(uintptr_t addr) { + // Extract top 16 bits for top-level index + unsigned top_index = (addr >> 32) & 0xFFFF; + + // If the entry at this index is null, allocate and try to atomically set it + MidLevel *expected_mid = + _top_level[top_index].load(std::memory_order_relaxed); + if (!expected_mid) { + if (_nb_mid_levels >= _k_default_max_mid_levels) { + // Every new level adds half a meg of overhead + LOG_ALWAYS_ONCE( + " ddprof: Address bitset reached maximum number of mid levels\n"); + return false; + } + expected_mid = new MidLevel(); + MidLevel *old_mid = nullptr; + if (!_top_level[top_index].compare_exchange_strong(old_mid, expected_mid)) { + delete expected_mid; + expected_mid = old_mid; + } else { + ++_nb_mid_levels; + } + assert(expected_mid); + if (!expected_mid) { + // something went wrong + return false; + } + } + + // Extract middle 16 bits for mid-level index + unsigned mid_index = (addr >> 16) & 0xFFFF; + + // If the entry at this mid-level index is null, allocate and try to + // atomically set it + LeafLevel *expected_leaf = + expected_mid->mid[mid_index].load(std::memory_order_acquire); + if (!expected_leaf) { + expected_leaf = new LeafLevel(); + LeafLevel *old_leaf = nullptr; + if (!expected_mid->mid[mid_index].compare_exchange_strong(old_leaf, + expected_leaf)) { + delete expected_leaf; + expected_leaf = old_leaf; + } + assert(expected_leaf); + if (!expected_leaf) { + // something went wrong + return false; + } + } + + // Extract lower 16 bits and ignore lower bits (12 remain) + unsigned leaf_index = (addr & 0xFFFF) >> _lower_bits_ignored; + unsigned index_array = leaf_index / _nb_bits_per_word; + assert(index_array < _k_nb_words); + unsigned bit_offset = leaf_index % _nb_bits_per_word; + Word_t bit_in_element = (1UL << bit_offset); + + if (!(expected_leaf->leaf[index_array].fetch_or(bit_in_element) & + bit_in_element)) { + ++_nb_addresses; + return true; + } + return false; // Collision +} + +bool AddressBitset::remove(uintptr_t addr) { + // Extract top 16 bits for top-level index + unsigned top_index = (addr >> 32) & 0xFFFF; + + // Try to get the mid-level pointer. If it's null, return false. + MidLevel *mid = _top_level[top_index].load(std::memory_order_acquire); + if (unlikely(!mid)) { + return false; + } + + // Extract middle 16 bits for mid-level index + unsigned mid_index = (addr >> 16) & 0xFFFF; + + // Try to get the leaf-level pointer from the mid-level. If it's null, return + // false. 
diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc
index d095445d0..53fcf2228 100644
--- a/src/lib/allocation_tracker.cc
+++ b/src/lib/allocation_tracker.cc
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-
 #include
 
 namespace ddprof {
@@ -68,7 +67,7 @@ TrackerThreadLocalState *AllocationTracker::init_tl_state() {
 
   if (res_set) {
     // should return 0
-    log_once("Error: Unable to store tl_state. error %d \n", res_set);
+    LOG_ONCE("Error: Unable to store tl_state. error %d \n", res_set);
     delete tl_state;
     tl_state = nullptr;
   }
@@ -119,9 +118,9 @@ DDRes AllocationTracker::allocation_tracking_init(
   void *volatile p = ::malloc(1);
   ::free(p);
 
-  DDRES_CHECK_FWD(instance->init(allocation_profiling_rate,
-                                 flags & kDeterministicSampling,
-                                 stack_sample_size, ring_buffer));
+  DDRES_CHECK_FWD(instance->init(
+      allocation_profiling_rate, flags & kDeterministicSampling,
+      flags & kTrackDeallocations, stack_sample_size, ring_buffer));
   _instance = instance;
 
   state.init(true, flags & kTrackDeallocations);
@@ -131,6 +130,7 @@
 
 DDRes AllocationTracker::init(uint64_t mem_profile_interval,
                               bool deterministic_sampling,
+                              bool track_deallocations,
                               uint32_t stack_sample_size,
                               const RingBufferInfo &ring_buffer) {
   _sampling_interval = mem_profile_interval;
@@ -140,6 +140,10 @@ DDRes AllocationTracker::init(uint64_t mem_profile_interval,
       static_cast<int>(RingBufferType::kMPSCRingBuffer)) {
     return ddres_error(DD_WHAT_PERFRB);
   }
+  if (track_deallocations) {
+    // 16 times oversized, as we want the probability of collision to be low
+    // enough
+    _allocated_address_set = std::make_unique<AddressBitset>();
+  }
   return ddprof::ring_buffer_attach(ring_buffer, &_pevent);
 }
@@ -165,7 +169,7 @@ void AllocationTracker::allocation_tracking_free() {
   }
   TrackerThreadLocalState *tl_state = get_tl_state();
   if (unlikely(!tl_state)) {
-    log_once("Error: Unable to find tl_state during %s\n", __FUNCTION__);
+    LOG_ONCE("Error: Unable to find tl_state during %s\n", __FUNCTION__);
     instance->free();
     return;
   }
@@ -229,26 +233,20 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size,
   tl_state.remaining_bytes = remaining_bytes;
   uint64_t total_size = nsamples * sampling_interval;
 
+  if (_state.track_deallocations) {
+    if (_allocated_address_set->count() <
+            ddprof::liveallocation::kMaxTracked &&
+        _allocated_address_set->add(addr)) {
+      // the address is now tracked for live heap profiling
+    } else {
+      // null the address to avoid using it for live heap profiling;
+      // pushing a sample is still good to keep a representative view
+      // of the allocations.
+      addr = 0;
+    }
+  }
   bool success = IsDDResOK(push_alloc_sample(addr, total_size, tl_state));
   free_on_consecutive_failures(success);
-
-  if (success && _state.track_deallocations) {
-    // \fixme{r1viollet} adjust set to be lock free
-    // We should still lock the clear of live allocations (which is unlikely)
-    std::lock_guard lock{_state.mutex};
-
-    // ensure we track this dealloc if it occurs
-    _address_set.insert(addr);
-    if (unlikely(_address_set.size() > ddprof::liveallocation::kMaxTracked)) {
-      if (IsDDResOK(push_clear_live_allocation(tl_state))) {
-        _address_set.clear();
-      } else {
-        log_once("Error: %s",
-                 "Stop allocation profiling. Unable to clear live allocation \n");
-        free();
-      }
-    }
+  if (unlikely(!success) && _state.track_deallocations && addr) {
+    _allocated_address_set->remove(addr);
   }
 }
@@ -262,11 +260,8 @@ void AllocationTracker::track_deallocation(uintptr_t addr,
     return;
   }
 
-  { // Grab the lock to check if this allocation was stored in the set
-    std::lock_guard lock{_state.mutex};
-    if (!_state.track_deallocations || !_address_set.erase(addr)) {
-      return;
-    }
+  if (!_state.track_deallocations || !_allocated_address_set->remove(addr)) {
+    return;
   }
 
   bool success = IsDDResOK(push_dealloc_sample(addr, tl_state));
@@ -494,7 +489,7 @@ void AllocationTracker::notify_thread_start() {
   if (unlikely(!tl_state)) {
     tl_state = init_tl_state();
     if (!tl_state) {
-      log_once("Error: Unable to start allocation profiling on thread %d",
+      LOG_ONCE("Error: Unable to start allocation profiling on thread %d",
               ddprof::gettid());
      return;
    }
@@ -514,7 +509,7 @@ void AllocationTracker::notify_fork() {
   if (unlikely(!tl_state)) {
     // The state should already exist if we forked.
     // This would mean that we were not able to create the state before forking
-    log_once("Error: Unable to retrieve tl state after fork thread %d",
+    LOG_ONCE("Error: Unable to retrieve tl state after fork thread %d",
              ddprof::gettid());
     return;
   } else {
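`track_allocation` and `track_deallocation` rely on `add()`/`remove()` reporting whether the bit actually changed, and that exactness comes from `fetch_or`/`fetch_and` returning the word's previous value. A compact, standalone illustration of the idiom (not ddprof code):

```cpp
#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
  std::atomic<uint64_t> word{0};
  const uint64_t bit = uint64_t{1} << 7;

  // fetch_or returns the value *before* the or: if the bit was clear,
  // this thread is the unique inserter.
  bool inserted = !(word.fetch_or(bit) & bit);
  assert(inserted);

  // fetch_and returns the value *before* the and: if the bit was set,
  // this thread is the unique remover.
  bool removed = (word.fetch_and(~bit) & bit) != 0;
  assert(removed);

  // a second remove sees the bit already clear and reports false,
  // mirroring the triple_remove unit test below.
  assert((word.fetch_and(~bit) & bit) == 0);
}
```

This is what lets `track_allocation` safely undo an insert with `remove(addr)` when pushing the sample fails, without any mutex around the set.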
diff --git a/src/lib/dd_profiling.cc b/src/lib/dd_profiling.cc
index b66544621..880c276d3 100644
--- a/src/lib/dd_profiling.cc
+++ b/src/lib/dd_profiling.cc
@@ -277,8 +277,7 @@ int ddprof_start_profiling_internal() {
       //  fails ?
       g_state.allocation_profiling_started = true;
     } else {
-      ddprof::log_once("Error: %s",
-                       "Failure to start allocation profiling\n");
+      LOG_ONCE("Error: %s", "Failure to start allocation profiling\n");
     }
   }
 } catch (const ddprof::DDException &e) { return -1; }
@@ -286,7 +285,7 @@
   if (g_state.allocation_profiling_started) {
     int res = pthread_atfork(nullptr, nullptr, notify_fork);
     if (res) {
-      ddprof::log_once("Error:%s", "Unable to setup notify fork");
+      LOG_ONCE("Error:%s", "Unable to setup notify fork");
       assert(0);
     }
   }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a1aad2f94..1ffc85d6c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -255,6 +255,7 @@ add_unit_test(
   allocation_tracker-ut
   allocation_tracker-ut.cc
   ../src/lib/allocation_tracker.cc
+  ../src/lib/address_bitset.cc
   ../src/base_frame_symbol_lookup.cc
   ../src/build_id.cc
   ../src/container_id.cc
@@ -296,7 +297,7 @@ add_unit_test(
   ../src/unwind_helpers.cc
   ../src/unwind_metrics.cc
   LIBRARIES ${ELFUTILS_LIBRARIES} llvm-demangle DDProf::Parser
-  DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=100)
+  DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=16384)
 
 add_unit_test(sys_utils-ut sys_utils-ut.cc ../src/sys_utils.cc)
 
@@ -352,6 +353,10 @@
 
 add_unit_test(pthread_tls-ut pthread_tls-ut.cc)
 
+add_unit_test(address_bitset-ut address_bitset-ut.cc ../src/lib/address_bitset.cc)
+
+add_unit_test(lib_logger-ut ./lib_logger-ut.cc)
+
 add_benchmark(savecontext-bench savecontext-bench.cc ../src/lib/pthread_fixes.cc
               ../src/lib/savecontext.cc ../src/lib/saveregisters.cc)
 
@@ -360,6 +365,7 @@
 add_benchmark(timer-bench timer-bench.cc ../src/timer.cc ../src/perf.cc)
 
 add_benchmark(
   allocation_tracker-bench
   allocation_tracker-bench.cc
+  ../src/lib/address_bitset.cc
   ../src/lib/allocation_tracker.cc
   ../src/pevent_lib.cc
   ../src/perf.cc
@@ -373,7 +379,7 @@
   ../src/user_override.cc
   ../src/sys_utils.cc
   LIBRARIES ${ELFUTILS_LIBRARIES} llvm-demangle DDProf::Parser
-  DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=10000)
+  DEFINITIONS ${DDPROF_DEFINITION_LIST} KMAX_TRACKED_ALLOCATIONS=16384)
 
 if(NOT CMAKE_BUILD_TYPE STREQUAL "SanitizedDebug")
   add_exe(
diff --git a/test/address_bitset-ut.cc b/test/address_bitset-ut.cc
new file mode 100644
index 000000000..aa7259a99
--- /dev/null
+++ b/test/address_bitset-ut.cc
@@ -0,0 +1,81 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0. This product includes software
+// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
+// Datadog, Inc.
+#include <gtest/gtest.h>
+
+#include "address_bitset.hpp"
+#include <cstdint>
+#include <limits>
+#include <random>
+#include <set>
+
+namespace ddprof {
+
+TEST(address_bitset, simple) {
+  AddressBitset address_bitset{};
+  EXPECT_TRUE(address_bitset.add(0xbadbeef));
+  EXPECT_FALSE(address_bitset.add(0xbadbeef));
+  EXPECT_TRUE(address_bitset.remove(0xbadbeef));
+}
+
+TEST(address_bitset, triple_remove) {
+  AddressBitset address_bitset{};
+  EXPECT_TRUE(address_bitset.add(0xbadbeef));
+  EXPECT_TRUE(address_bitset.remove(0xbadbeef));
+  EXPECT_FALSE(address_bitset.remove(0xbadbeef));
+  EXPECT_FALSE(address_bitset.remove(0xbadbeef));
+}
+
+TEST(address_bitset, many_addresses) {
+  AddressBitset address_bitset{};
+  std::random_device rd;
+  std::mt19937 gen(rd());
+// #define WORST_CASE
+#ifdef WORST_CASE
+  // if addresses are too spread out, we just cannot
+  // keep track of everything
+  std::uniform_int_distribution<uintptr_t> dis(
+      0, std::numeric_limits<uintptr_t>::max());
+#else
+  // with more reasonable ranges, we are OK
+  std::uniform_int_distribution<uintptr_t> dis(0x1000, 0x1000000);
+#endif
+  std::set<uintptr_t> addresses;
+  unsigned nb_elements = 100000;
+  for (unsigned i = 0; i < nb_elements; ++i) {
+    // avoid allocations that are too close to each other
+    uintptr_t addr = (dis(gen) << 4);
+    auto res = addresses.insert(addr);
+    if (res.second) {
+      EXPECT_TRUE(address_bitset.add(addr));
+    }
+  }
+  for (auto addr : addresses) {
+    EXPECT_TRUE(address_bitset.remove(addr));
+  }
+  EXPECT_EQ(0, address_bitset.count());
+  for (auto addr : addresses) {
+    EXPECT_FALSE(address_bitset.remove(addr));
+  }
+}
+
+// the aim of this test is just to not blow up
+TEST(address_bitset, sparse_addressing) {
+  AddressBitset address_bitset{};
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  // if addresses are too spread out, we just cannot
+  // keep track of everything
+  std::uniform_int_distribution<uintptr_t> dis(
+      0, std::numeric_limits<uintptr_t>::max());
+  unsigned nb_elements = 300000;
+  for (unsigned i = 0; i < nb_elements; ++i) {
+    // avoid allocations that are too close to each other
+    uintptr_t addr = (dis(gen) << 4);
+    address_bitset.add(addr);
+  }
+}
+
+} // namespace ddprof
diff --git a/test/allocation_tracker-bench.cc b/test/allocation_tracker-bench.cc
index 5fa708737..f98477695 100644
--- a/test/allocation_tracker-bench.cc
+++ b/test/allocation_tracker-bench.cc
@@ -8,31 +8,58 @@
 #include <benchmark/benchmark.h>
 
 #include "allocation_tracker.hpp"
+#include "ddprof_perf_event.hpp"
 #include "loghandle.hpp"
 #include "ringbuffer_holder.hpp"
 
 // Global bench settings
 // Activate live heap tracking
-// #define LIVE_HEAP
+#define LIVE_HEAP
 // Sampling rate: default rate is 524288
-static constexpr uint64_t k_rate = 20000;
+static constexpr uint64_t k_rate = 200000;
+// The reader thread is interesting, though it can start dominating the CPU;
+// the benchmark focuses on the capture of allocation events.
 #define READER_THREAD
 
 std::atomic<bool> reader_continue{true};
+std::atomic<bool> error_in_reader{false};
 
 // Reader worker thread function
 void read_buffer(ddprof::RingBufferHolder &holder) {
-  int nb_samples = 0;
+  int nb_alloc_samples = 0;
+  int nb_dealloc_samples = 0;
+  int nb_unknown_samples = 0;
+
+  error_in_reader = false;
   while (reader_continue) {
     ddprof::MPSCRingBufferReader reader(holder.get_ring_buffer());
+    // fprintf(stderr, "size = %lu! \n", reader.available_size());
     auto buf = reader.read_sample();
     if (!buf.empty()) {
-      ++nb_samples;
-      // fprintf(stderr, "Yep, got sample ! \n");
+      const perf_event_header *hdr =
+          reinterpret_cast<const perf_event_header *>(buf.data());
+      if (hdr->type == PERF_RECORD_SAMPLE) {
+        ++nb_alloc_samples;
+      } else if (hdr->type == PERF_CUSTOM_EVENT_DEALLOCATION) {
+        ++nb_dealloc_samples;
+      } else {
+        ++nb_unknown_samples;
+      }
     }
     std::chrono::microseconds(10000);
   }
-  fprintf(stderr, "Reader thread exit, nb_samples=%d\n", nb_samples);
+  fprintf(stderr,
+          "Reader thread exit,"
+          "nb_alloc_samples=%d,"
+          "nb_dealloc_samples=%d,"
+          "nb_unknown_samples=%d\n",
+          nb_alloc_samples, nb_dealloc_samples, nb_unknown_samples);
+  if (nb_alloc_samples == 0) {
+    error_in_reader = true;
+  }
 }
 
@@ -40,8 +67,8 @@ DDPROF_NOINLINE void my_malloc(size_t size, uintptr_t addr = 0xdeadbeef) {
   ddprof::TrackerThreadLocalState *tl_state =
       ddprof::AllocationTracker::get_tl_state();
   if (tl_state) { // tl_state is null if we are not tracking allocations
     ddprof::AllocationTracker::track_allocation_s(addr, size, *tl_state);
-    DDPROF_BLOCK_TAIL_CALL_OPTIMIZATION();
   }
+  DDPROF_BLOCK_TAIL_CALL_OPTIMIZATION();
 }
 
 DDPROF_NOINLINE void my_free(uintptr_t addr) {
@@ -62,28 +89,26 @@
       ddprof::AllocationTracker::kTrackDeallocations;
 #endif
 
-  ddprof::RingBufferHolder ring_buffer{buf_size_order,
-                                       RingBufferType::kMPSCRingBuffer};
-
-  if (track_allocations) {
-    ddprof::AllocationTracker::allocation_tracking_init(
-        rate, flags, k_default_perf_stack_sample_size,
-        ring_buffer.get_buffer_info());
-  }
-
   int nb_threads = 4;
   std::vector<std::thread> threads;
   int num_allocations = 1000;
   size_t page_size = 0x1000;
   std::random_device rd;
   std::mt19937 gen(rd());
-
+  ddprof::RingBufferHolder ring_buffer{buf_size_order,
+                                       RingBufferType::kMPSCRingBuffer};
 #ifdef READER_THREAD
   // create reader worker thread
   reader_continue = true;
   std::thread reader_thread{read_buffer, std::ref(ring_buffer)};
 #endif
 
+  if (track_allocations) {
+    ddprof::AllocationTracker::allocation_tracking_init(
+        rate, flags, k_default_perf_stack_sample_size,
+        ring_buffer.get_buffer_info());
+  }
+
   for (auto _ : state) {
     // Initialize threads and clear addresses
     threads.clear();
@@ -91,11 +116,14 @@
 
     for (int i = 0; i < nb_threads; ++i) {
       threads.emplace_back([&, i] {
-        std::uniform_int_distribution<> dis(i * page_size,
-                                            (i + 1) * page_size - 1);
+        // in theory thread creation is hooked automatically,
+        // though in the benchmark we cannot rely on this.
+        ddprof::AllocationTracker::init_tl_state();
+        std::uniform_int_distribution<uintptr_t> dis(i * page_size,
+                                                     (i + 1) * page_size - 1);
 
         for (int j = 0; j < num_allocations; ++j) {
-          uintptr_t addr = dis(gen);
+          uintptr_t addr = dis(gen) << 4;
           my_malloc(1024, addr);
           thread_addresses[i].push_back(addr);
         }
@@ -109,6 +137,7 @@
     threads.clear();
     for (int i = 0; i < nb_threads; ++i) {
       threads.emplace_back([&, i] {
+        ddprof::AllocationTracker::init_tl_state();
         for (auto addr : thread_addresses[i]) {
           my_free(addr);
         }
@@ -119,11 +148,20 @@
       t.join();
     }
   }
+
+  // wait for the reader thread to receive all samples
+  std::this_thread::sleep_for(std::chrono::microseconds(5000));
+
 #ifdef READER_THREAD
   reader_continue = false;
   reader_thread.join();
 #endif
-  ddprof::AllocationTracker::allocation_tracking_free();
+  if (track_allocations) {
+    ddprof::AllocationTracker::allocation_tracking_free();
+    if (error_in_reader) {
+      exit(1);
+    }
+  }
 }
 
 // Benchmark without allocation tracking
@@ -143,6 +181,7 @@
 class WorkerThread {
 
   WorkerThread() : stop(false), perform_task(false) {
     worker_thread = std::thread([this] {
+      ddprof::AllocationTracker::init_tl_state();
       while (!stop) {
         std::unique_lock lock(mutex);
         cv.wait(lock, [this] { return perform_task || stop; });
@@ -225,9 +264,10 @@
   std::mt19937 gen(rd());
 
   for (int i = 0; i < nb_threads; ++i) {
-    std::uniform_int_distribution<> dis(i * page_size, (i + 1) * page_size - 1);
+    std::uniform_int_distribution<uintptr_t> dis(i * page_size,
+                                                 (i + 1) * page_size - 1);
     for (int j = 0; j < num_allocations; ++j) {
-      uintptr_t addr = dis(gen);
+      uintptr_t addr = dis(gen) << 4;
       thread_addresses[i].push_back(addr);
     }
   }
@@ -248,6 +288,9 @@
     std::this_thread::sleep_for(std::chrono::microseconds(100));
   }
 
+  // wait for the reader thread to receive all samples
+  std::this_thread::sleep_for(std::chrono::microseconds(1000));
+
 #ifdef READER_THREAD
   reader_continue = false;
   reader_thread.join();
diff --git a/test/allocation_tracker-ut.cc b/test/allocation_tracker-ut.cc
index 6bca2f83e..fc1282e9f 100644
--- a/test/allocation_tracker-ut.cc
+++ b/test/allocation_tracker-ut.cc
@@ -149,44 +149,35 @@ TEST(allocation_tracker, max_tracked_allocs) {
       k_default_perf_stack_sample_size, ring_buffer.get_buffer_info());
 
   ASSERT_TRUE(ddprof::AllocationTracker::is_active());
-
-  for (int i = 0; i <= ddprof::liveallocation::kMaxTracked + 1; ++i) {
-    my_malloc(1, 0x1000 + i);
+  bool max_reached = false;
+  uint64_t nb_samples = 0;
+  for (int i = 0; i <= ddprof::liveallocation::kMaxTracked +
+                           ddprof::liveallocation::kMaxTracked / 10;
+       ++i) {
+    uintptr_t addr = 0x1000 + (i * 16);
+    my_malloc(1, addr);
     ddprof::MPSCRingBufferReader reader{ring_buffer.get_ring_buffer()};
-    if (i <= ddprof::liveallocation::kMaxTracked) { // check that we get the
-                                                    // relevant info for this
-                                                    // allocation
-      ASSERT_GT(reader.available_size(), 0);
+    while (reader.available_size() > 0) {
       auto buf = reader.read_sample();
       ASSERT_FALSE(buf.empty());
       const perf_event_header *hdr =
           reinterpret_cast<const perf_event_header *>(buf.data());
-      ASSERT_EQ(hdr->type, PERF_RECORD_SAMPLE);
-
-      perf_event_sample *sample =
-          hdr2samp(hdr, perf_event_default_sample_type() | PERF_SAMPLE_ADDR);
-
-      ASSERT_EQ(sample->period, 1);
-      ASSERT_EQ(sample->pid, getpid());
-      ASSERT_EQ(sample->tid, ddprof::gettid());
-      ASSERT_EQ(sample->addr, 0x1000 + i);
-    } else {
-      bool clear_found = false;
-      int nb_read = 0;
-      ddprof::ConstBuffer buf;
-      do {
-        buf = reader.read_sample();
-        ++nb_read;
-        if (buf.empty())
-          break;
-        const perf_event_header *hdr =
-            reinterpret_cast<const perf_event_header *>(buf.data());
-        if (hdr->type == PERF_CUSTOM_EVENT_CLEAR_LIVE_ALLOCATION) {
-          clear_found = true;
+      if (hdr->type == PERF_RECORD_SAMPLE) {
+        ++nb_samples;
+        perf_event_sample *sample =
+            hdr2samp(hdr, perf_event_default_sample_type() | PERF_SAMPLE_ADDR);
+        ASSERT_EQ(sample->period, 1);
+        ASSERT_EQ(sample->pid, getpid());
+        ASSERT_EQ(sample->tid, ddprof::gettid());
+        if (sample->addr == 0) {
+          max_reached = true;
+        } else {
+          EXPECT_EQ(sample->addr, addr);
         }
-      } while (true);
-      EXPECT_EQ(nb_read, 3);
-      EXPECT_TRUE(clear_found);
+      }
     }
   }
+  fprintf(stderr, "Number of found samples %lu (vs max = %d) \n", nb_samples,
+          ddprof::liveallocation::kMaxTracked);
+  EXPECT_TRUE(max_reached);
 }
diff --git a/test/lib_logger-ut.cc b/test/lib_logger-ut.cc
new file mode 100644
index 000000000..1fee22033
--- /dev/null
+++ b/test/lib_logger-ut.cc
@@ -0,0 +1,30 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0. This product includes software
+// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
+// Datadog, Inc.
+
+#include <gtest/gtest.h>
+
+#include "lib/lib_logger.hpp"
+
+namespace ddprof {
+
+void foo_log(const std::string &str) { LOG_ONCE("%s", str.c_str()); }
+
+TEST(LibLogger, simple) {
+  testing::internal::CaptureStderr();
+  LOG_ONCE("something ");
+  LOG_ONCE("else ");
+  foo_log("one "); // this shows
+  foo_log("two "); // this does not show in release (same call site as "one")
+  const char *some_string = "some_string";
+  LOG_ONCE("three %s\n", some_string);
+  std::string output = testing::internal::GetCapturedStderr();
+  fprintf(stderr, "%s", output.c_str());
+#ifdef NDEBUG
+  EXPECT_EQ(output, "something else one three some_string\n");
+#else
+  EXPECT_EQ(output, "something else one two three some_string\n");
+#endif
+}
+} // namespace ddprof
diff --git a/test/reentry_guard-ut.cc b/test/reentry_guard-ut.cc
index 4b733581e..1c0f6f0aa 100644
--- a/test/reentry_guard-ut.cc
+++ b/test/reentry_guard-ut.cc
@@ -26,8 +26,8 @@ TEST(ReentryGuardTest, null_init) {
   guard.register_guard(&reentry_guard);
   EXPECT_TRUE(guard);
   {
-    ddprof::ReentryGuard guard(&reentry_guard);
-    EXPECT_FALSE(guard);
+    ddprof::ReentryGuard guard2(&reentry_guard);
+    EXPECT_FALSE(guard2);
   }
 }
 }