diff --git a/include/perf_watcher_lookup.hpp b/include/perf_watcher_lookup.hpp
new file mode 100644
index 000000000..31467d036
--- /dev/null
+++ b/include/perf_watcher_lookup.hpp
@@ -0,0 +1,285 @@
+// Unless explicitly stated otherwise all files in this repository are licensed
+// under the Apache License Version 2.0. This product includes software
+// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
+// Datadog, Inc.
+
+#pragma once
+
+#include "perf_ringbuffer.hpp"
+#include "perf_watcher.hpp"
+#include "pevent.hpp"
+
+#include <cerrno>
+#include <cstdint>
+#include <cstring>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace ddprof {
+
+/// One opened event source (perf event or custom ring buffer).
+struct PEvent {
+  PerfWatcher *watcher; // watcher the event was opened for (not owned)
+  int fd; // Underlying perf event FD for perf_events, otherwise an eventfd that
+          // signals data is available in ring buffer
+  int mapfd; // FD for ring buffer: `fd`, or the FD of the event whose ring
+             // buffer is shared when the output is redirected
+  int cpu; // CPU id (-1 for custom events)
+  int attr_idx; // matching perf_event_attr (-1 for custom events)
+  size_t ring_buffer_size; // size of the ring buffer
+  RingBufferType ring_buffer_type;
+  RingBuffer rb; // metadata and buffers for processing perf ringbuffer
+};
+
+/// Process-wide registry of the event sources opened for the watchers.
+///
+/// Perf events are indexed by the sample id the kernel reports for them
+/// (PERF_EVENT_IOC_ID). Custom events have no such id and are kept in a
+/// separate list.
+class PEventTable {
+private:
+  PEventTable() = default;
+
+  // Lookups
+  std::unordered_map<uint64_t, PEvent> id_to_pevent; // sample id -> event
+  std::unordered_map<int, int> cpu_to_fd; // cpu -> FD owning its ring buffer
+
+  // Events backed by a custom ring buffer (no perf sample id)
+  std::vector<PEvent> custom_events;
+
+  // Stashed attrs
+  std::vector<perf_event_attr> attrs;
+
+  /// Identifies the freshly opened perf event `fd`, attaches it to the ring
+  /// buffer of `cpu` (created for the first event of that CPU) and stores it.
+  /// Returns false, storing nothing and leaving `fd` open, on failure.
+  bool register_perf_event(PerfWatcher &watcher, int fd, int cpu,
+                           const perf_event_attr &attr,
+                           int buffer_size_order) {
+    // Get the ID
+    uint64_t sample_id = 0;
+    if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &sample_id)) {
+      // If I can't get the sample, then I can't use this event.
+      LG_WRN("Error getting perf sample ID");
+      return false;
+    }
+
+    // Not stored yet: attaching the ring buffer may still fail
+    PEvent event{};
+    event.watcher = &watcher;
+    event.fd = fd;
+    event.mapfd = fd;
+    event.cpu = cpu;
+    event.attr_idx = static_cast<int>(attrs.size());
+    event.ring_buffer_size = perf_mmap_size(buffer_size_order);
+    event.ring_buffer_type = RingBufferType::kPerfRingBuffer;
+
+    // We have enough information to configure the ringbuffer. If this CPU
+    // already has a perf event on it, then multiplex the ringbuffer
+    auto fd_it = cpu_to_fd.find(cpu);
+    if (fd_it != cpu_to_fd.end()) {
+      // This CPU already has a perf_event ringbuffer, so just use that
+      if (-1 == ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, fd_it->second)) {
+        LG_WRN("Error redirecting output of perf event (cpu %d): %s", cpu,
+               strerror(errno));
+        return false;
+      }
+      event.mapfd = fd_it->second;
+    } else {
+      // This CPU does not have a perf_event ringbuffer, so make one
+      if (!IsDDResOK(pevent_mmap_event(&event))) {
+        return false;
+      }
+      cpu_to_fd[cpu] = event.mapfd;
+    }
+
+    attrs.push_back(attr);
+    id_to_pevent.emplace(sample_id, std::move(event));
+    return true;
+  }
+
+  /// Unmaps and closes `event`, keeping in `ret` the last error encountered.
+  // NOTE(review): confirm pevent_close_event never closes a `mapfd` borrowed
+  // from another event, otherwise shared ring buffer FDs are closed twice.
+  static void release_event(PEvent &event, DDRes &ret) {
+    if (DDRes const ret_tmp = pevent_munmap_event(&event);
+        !IsDDResOK(ret_tmp)) {
+      ret = ret_tmp;
+    }
+    if (DDRes const ret_tmp = pevent_close_event(&event);
+        !IsDDResOK(ret_tmp)) {
+      ret = ret_tmp;
+    }
+  }
+
+public:
+  PEventTable(const PEventTable &) = delete;
+  PEventTable &operator=(const PEventTable &) = delete;
+
+  /// Returns the single table of the process.
+  static PEventTable &get_instance() {
+    static PEventTable instance;
+    return instance;
+  }
+
+  /// Number of registered events (perf and custom), ie. the number of entries
+  /// written by pollfd_setup.
+  size_t size() const { return id_to_pevent.size() + custom_events.size(); }
+
+  /// Returns the perf event registered under sample id `id`, nullptr if none.
+  PEvent *pevent_from_id(uint64_t id) {
+    auto it = id_to_pevent.find(id);
+    return (it != id_to_pevent.end()) ? &it->second : nullptr;
+  }
+
+  /// Creates and registers the MPSC ring buffer of a custom watcher (eg.
+  /// allocation profiling). `pid` and `perf_clock_source` are not used.
+  /// Returns false if the ring buffer could not be created.
+  bool open_custom_watcher(PerfWatcher &watcher, pid_t pid,
+                           PerfClockSource perf_clock_source) {
+    (void)pid;
+    (void)perf_clock_source;
+
+    PEvent event{};
+    event.watcher = &watcher;
+    event.fd = -1;
+    event.mapfd = -1;
+    event.cpu = -1;      // not bound to a CPU
+    event.attr_idx = -1; // no perf_event_attr
+    int const order = pevent_compute_min_mmap_order(
+        k_mpsc_buffer_size_shift, watcher.options.stack_sample_size,
+        k_min_number_samples_per_ring_buffer);
+    if (!IsDDResOK(ring_buffer_create(order, RingBufferType::kMPSCRingBuffer,
+                                      true, &event))) {
+      return false;
+    }
+    custom_events.push_back(std::move(event));
+    return true;
+  }
+
+  /// Opens `watcher` with perf_event_open on every CPU in [0, num_cpu).
+  /// Returns false if no event at all could be opened.
+  bool open_perf_watcher(PerfWatcher &watcher, pid_t pid, int num_cpu,
+                         PerfClockSource perf_clock_source) {
+    std::vector<perf_event_attr> possible_attrs =
+        all_perf_configs_from_watcher(&watcher, true, perf_clock_source);
+
+    // Figure out which buffer size to use: it only depends on the watcher
+    static bool log_once = true;
+    int const buffer_size_order = pevent_compute_min_mmap_order(
+        k_default_buffer_size_shift, watcher.options.stack_sample_size,
+        k_min_number_samples_per_ring_buffer);
+    if (buffer_size_order > k_default_buffer_size_shift && log_once) {
+      LG_NTC("Increasing size order of the ring buffer to %d (from %d)",
+             buffer_size_order, k_default_buffer_size_shift);
+      log_once = false; // avoid flooding for all watchers
+    }
+
+    // We have a number of configurations and we need to try them on all CPUs.
+    // We prefer the earlier configurations, but can failover to the later
+    // ones. If a configuration fails, it should not be used again. Generally,
+    // either all or none of a configuration will work. If we fail midway
+    // through, we take what we can get. We return false if no configs succeed
+    bool opened = false;
+    for (int cpu = 0; cpu < num_cpu; ++cpu) {
+      auto it = possible_attrs.begin();
+      while (it != possible_attrs.end()) {
+        int const fd =
+            perf_event_open(&*it, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
+        if (fd == -1) {
+          LG_NFO("Expected failure (we retry with different settings) "
+                 "perf_event_open for watcher: %s - with attr.type=%s, "
+                 "exclude_kernel=%d",
+                 watcher.desc.c_str(), perf_type_str(it->type),
+                 static_cast<int>(it->exclude_kernel));
+          it = possible_attrs.erase(it); // Don't retry this config
+          continue;
+        }
+
+        if (register_perf_event(watcher, fd, cpu, *it, buffer_size_order)) {
+          opened = true;
+        } else {
+          close(fd); // unusable, don't leak it
+        }
+        // Whatever the outcome, another config would not do better on this
+        // CPU: don't retry anymore!
+        break;
+      } // try to open
+    }   // cpu
+    return opened;
+  }
+
+  /// Opens `watcher` with the method matching its type. Returns false if
+  /// nothing could be opened.
+  bool open_watcher(PerfWatcher &watcher, pid_t pid, int num_cpu,
+                    PerfClockSource perf_clock_source) {
+    if (watcher.type < kDDPROF_TYPE_CUSTOM) {
+      return open_perf_watcher(watcher, pid, num_cpu, perf_clock_source);
+    }
+    return open_custom_watcher(watcher, pid, perf_clock_source);
+  }
+
+  /// Enables every registered perf event (custom events have nothing to
+  /// enable). An ioctl failure is reported through DDRES_CHECK_INT.
+  DDRes enable_all() {
+    // Just before we enter the main loop, force the enablement of the perf
+    // contexts
+    for (const auto &[id, event] : id_to_pevent) {
+      (void)id;
+      DDRES_CHECK_INT(ioctl(event.fd, PERF_EVENT_IOC_ENABLE), DD_WHAT_IOCTL,
+                      "Error ioctl fd=%d", event.fd);
+    }
+    return {};
+  }
+
+  /// Unmaps and closes every registered event, then empties the table.
+  /// Returns the last error encountered, if any.
+  DDRes cleanup() {
+    DDRes ret = ddres_init();
+
+    // Cleanup both, storing the error if one was generated
+    for (auto &[id, event] : id_to_pevent) {
+      (void)id;
+      release_event(event, ret);
+    }
+    for (auto &event : custom_events) {
+      release_event(event, ret);
+    }
+
+    // Now let's reset the storage
+    id_to_pevent.clear();
+    cpu_to_fd.clear();
+    custom_events.clear();
+    attrs.clear();
+
+    return ret;
+  }
+
+  /// Fills `pfd` with one entry per registered event and stores the number of
+  /// entries in `*pfd_len`. `pfd` must have room for size() entries.
+  void pollfd_setup(struct pollfd *pfd, int *pfd_len) {
+    int nb_fds = 0;
+    auto const watch = [&](const PEvent &event) {
+      // NOTE: if fd is negative, it will be ignored
+      pfd[nb_fds].fd = event.fd;
+      pfd[nb_fds].events = POLLIN | POLLERR | POLLHUP;
+      ++nb_fds;
+    };
+
+    // Setup poll() to watch perf_event file descriptors
+    for (const auto &[id, event] : id_to_pevent) {
+      (void)id;
+      watch(event);
+    }
+    for (const auto &event : custom_events) {
+      watch(event);
+    }
+    *pfd_len = nb_fds;
+  }
+};
+
+} // namespace ddprof
diff --git a/src/ddprof.cc b/src/ddprof.cc
index e6a9c51cd..19dbd3899 100644
--- a/src/ddprof.cc
+++ b/src/ddprof.cc
@@ -68,17 +68,19 @@ void display_system_info() {
 } // namespace
 
 DDRes ddprof_setup(DDProfContext &ctx) {
-  PEventHdr *pevent_hdr = &ctx.worker_ctx.pevent_hdr;
   try {
-    pevent_init(pevent_hdr);
-
     display_system_info();
 
     // Open perf events and mmap events right now to start receiving events
-    // mmaps from perf fds will be lost after fork, that why we mmap them again
-    // in worker (but kernel only accounts for the pinned memory once).
-    DDRES_CHECK_FWD(
-        pevent_setup(ctx, ctx.params.pid, ctx.params.num_cpu, pevent_hdr));
+    auto &pe_table = PEventTable::get_instance();
+    for (auto &watcher : ctx.watchers) {
+      if (!pe_table.open_watcher(watcher, ctx.params.pid, ctx.params.num_cpu,
+                                 ctx.perf_clock_source)) {
+        DDRES_RETURN_ERROR_LOG(DD_WHAT_PERFOPEN,
+                               "Error opening perf events for watcher %s",
+                               watcher.desc.c_str());
+      }
+    }
 
     // Setup signal handler if defined
     if (ctx.params.fault_info) {
@@ -100,17 +102,17 @@ DDRes ddprof_setup(DDProfContext &ctx) {
 
     DDRES_CHECK_FWD(ddprof_stats_init());
 
-    DDRES_CHECK_FWD(pevent_enable(pevent_hdr));
+    DDRES_CHECK_FWD(pe_table.enable_all());
   }
   CatchExcept2DDRes();
   return {};
 }
 
-DDRes ddprof_teardown(DDProfContext &ctx) {
-  PEventHdr *pevent_hdr = &ctx.worker_ctx.pevent_hdr;
-
-  if (IsDDResNotOK(pevent_cleanup(pevent_hdr))) {
-    LG_WRN("Error when calling pevent_cleanup.");
+DDRes ddprof_teardown() {
+
+  auto &pe_table = PEventTable::get_instance();
+  if (IsDDResNotOK(pe_table.cleanup())) {
+    LG_WRN("Error when calling pe_table.cleanup.");
   }
 
   if (IsDDResNotOK(ddprof_stats_free())) {
diff --git a/src/ddprof_worker.cc b/src/ddprof_worker.cc
index d28e97321..95f641a25 100644
--- a/src/ddprof_worker.cc
+++ b/src/ddprof_worker.cc
@@ -337,7 +337,7 @@ DDRes worker_library_init(DDProfContext &ctx,
     // register the existing persistent storage for the state
     ctx.worker_ctx.persistent_worker_state = persistent_worker_state;
 
-    PEventHdr *pevent_hdr = &ctx.worker_ctx.pevent_hdr;
+    auto &pe_table = PEventTable::get_instance();
 
     // If we're here, then we are a child spawned during the startup operation.
     // That means we need to iterate through the perf_event_open() handles and
diff --git a/src/perf_mainloop.cc b/src/perf_mainloop.cc
index ca72f63ea..5244ed7a6 100644
--- a/src/perf_mainloop.cc
+++ b/src/perf_mainloop.cc
@@ -109,18 +109,6 @@ DDRes spawn_workers(PersistentWorkerState *persistent_worker_state,
   return {};
 }
 
-void pollfd_setup(const PEventHdr *pevent_hdr, struct pollfd *pfd,
-                  int *pfd_len) {
-  *pfd_len = pevent_hdr->size;
-  const PEvent *pes = pevent_hdr->pes;
-  // Setup poll() to watch perf_event file descriptors
-  for (int i = 0; i < *pfd_len; ++i) {
-    // NOTE: if fd is negative, it will be ignored
-    pfd[i].fd = pes[i].fd;
-    pfd[i].events = POLLIN | POLLERR | POLLHUP;
-  }
-}
-
 DDRes signalfd_setup(pollfd *pfd) {
   sigset_t mask;
 
diff --git a/src/perf_watcher.cc b/src/perf_watcher.cc
index d2eb6e82c..5128822df 100644
--- a/src/perf_watcher.cc
+++ b/src/perf_watcher.cc
@@ -13,9 +13,9 @@
 
 namespace ddprof {
 
-#define BASE_STYPES                                                            \
-  (PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER | PERF_SAMPLE_TID |          \
-   PERF_SAMPLE_TIME | PERF_SAMPLE_PERIOD)
+#define BASE_STYPES                                                            \
+  (PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER |   \
+   PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_PERIOD)
 
 uint64_t perf_event_default_sample_type() { return BASE_STYPES; }
 
diff --git a/src/pevent_lib.cc b/src/pevent_lib.cc
index 976d4ceed..e693aafa8 100644
--- a/src/pevent_lib.cc
+++ b/src/pevent_lib.cc
@@ -27,18 +27,6 @@ namespace ddprof {
 
 namespace {
 
-DDRes pevent_create(PEventHdr *pevent_hdr, int watcher_idx,
-                    size_t *pevent_idx) {
-  if (pevent_hdr->size >= pevent_hdr->max_size) {
-    DDRES_RETURN_ERROR_LOG(DD_WHAT_PERFOPEN,
-                           "Reached max number of watchers (%lu)",
-                           pevent_hdr->max_size);
-  }
-  *pevent_idx = pevent_hdr->size++;
-  pevent_hdr->pes[*pevent_idx].watcher_pos = watcher_idx;
-  return {};
-}
-
 void display_system_config() {
   int val;
   DDRes const res = sys_perf_event_paranoid(val);
@@ -48,108 +36,8 @@ void display_system_config() {
     LG_WRN("Unable to access system configuration");
   }
 }
-
-// set info for a perf_event_open type of buffer
-void pevent_set_info(int fd, int attr_idx, PEvent &pevent,
-                     uint32_t stack_sample_size) {
-  static bool log_once = true;
-  pevent.fd = fd;
-  pevent.mapfd = fd;
-  int const buffer_size_order = pevent_compute_min_mmap_order(
-      k_default_buffer_size_shift, stack_sample_size,
-      k_min_number_samples_per_ring_buffer);
-  if (buffer_size_order > k_default_buffer_size_shift && log_once) {
-    LG_NTC("Increasing size order of the ring buffer to %d (from %d)",
-           buffer_size_order, k_default_buffer_size_shift);
-    log_once = false; // avoid flooding for all CPUs
-  }
-  pevent.ring_buffer_size = perf_mmap_size(buffer_size_order);
-  pevent.custom_event = false;
-  pevent.ring_buffer_type = RingBufferType::kPerfRingBuffer;
-  pevent.attr_idx = attr_idx;
-}
-
-DDRes pevent_register_cpu_0(const PerfWatcher *watcher, int watcher_idx,
-                            pid_t pid, PerfClockSource perf_clock_source,
-                            PEventHdr *pevent_hdr, size_t &pevent_idx) {
-  // register cpu 0 and find a working config
-  PEvent *pes = pevent_hdr->pes;
-  std::vector<perf_event_attr> perf_event_data =
-      all_perf_configs_from_watcher(watcher, true, perf_clock_source);
-  DDRES_CHECK_FWD(pevent_create(pevent_hdr, watcher_idx, &pevent_idx));
-
-  // attempt with different configs
-  for (auto &attr : perf_event_data) {
-    // register cpu 0
-    int const fd = perf_event_open(&attr, pid, 0, -1, PERF_FLAG_FD_CLOEXEC);
-    if (fd != -1) {
-      // Copy the successful config
-      pevent_hdr->attrs[pevent_hdr->nb_attrs] = attr;
-      pevent_set_info(fd, pevent_hdr->nb_attrs, pes[pevent_idx],
-                      watcher->options.stack_sample_size);
-      ++pevent_hdr->nb_attrs;
-      assert(pevent_hdr->nb_attrs <= kMaxTypeWatcher);
-      break;
-    }
-    LG_NFO("Expected failure (we retry with different settings) "
-           "perf_event_open for watcher: %s - with attr.type=%s, "
-           "exclude_kernel=%d",
-           watcher->desc.c_str(), perf_type_str(attr.type),
-           static_cast<int>(attr.exclude_kernel));
-  }
-  // check if one of the configs was successful
-  if (pes[pevent_idx].attr_idx == -1) {
-    display_system_config();
-    DDRES_RETURN_ERROR_LOG(DD_WHAT_PERFOPEN,
-                           "Error calling perf_event_open on watcher %d.0 (%s)",
-                           watcher_idx, strerror(errno));
-  }
-
-  return {};
-}
-
-DDRes pevent_open_all_cpus(const PerfWatcher *watcher, int watcher_idx,
-                           pid_t pid, int num_cpu,
-                           PerfClockSource perf_clock_source,
-                           PEventHdr *pevent_hdr) {
-  PEvent *pes = pevent_hdr->pes;
-
-  size_t template_pevent_idx = -1;
-  DDRES_CHECK_FWD(pevent_register_cpu_0(watcher, watcher_idx, pid,
-                                        perf_clock_source, pevent_hdr,
-                                        template_pevent_idx));
-  int const template_attr_idx = pes[template_pevent_idx].attr_idx;
-  perf_event_attr *attr = &pevent_hdr->attrs[template_attr_idx];
-
-  // used the fixed attr for the others
-  for (int cpu_idx = 1; cpu_idx < num_cpu; ++cpu_idx) {
-    size_t pevent_idx = -1;
-    DDRES_CHECK_FWD(pevent_create(pevent_hdr, watcher_idx, &pevent_idx));
-    int const fd =
-        perf_event_open(attr, pid, cpu_idx, -1, PERF_FLAG_FD_CLOEXEC);
-    if (fd == -1) {
-      DDRES_RETURN_ERROR_LOG(DD_WHAT_PERFOPEN,
-                             "Error calling perfopen on watcher %d.%d (%s)",
-                             watcher_idx, cpu_idx, strerror(errno));
-    }
-    pevent_set_info(fd, pes[template_pevent_idx].attr_idx, pes[pevent_idx],
-                    watcher->options.stack_sample_size);
-  }
-  return {};
-}
-
 } // namespace
 
-void pevent_init(PEventHdr *pevent_hdr) {
-  memset(pevent_hdr, 0, sizeof(PEventHdr));
-  pevent_hdr->max_size = k_max_nb_perf_event_open;
-  for (size_t k = 0; k < pevent_hdr->max_size; ++k) {
-    pevent_hdr->pes[k].fd = -1;
-    pevent_hdr->pes[k].mapfd = -1;
-    pevent_hdr->pes[k].attr_idx = -1;
-  }
-}
-
 int pevent_compute_min_mmap_order(int min_buffer_size_order,
                                   uint32_t stack_sample_size,
                                   unsigned min_number_samples) {
@@ -164,103 +52,41 @@ int pevent_compute_min_mmap_order(int min_buffer_size_order,
   return ret_order;
 }
 
-DDRes pevent_open(DDProfContext &ctx, pid_t pid, int num_cpu,
-                  PEventHdr *pevent_hdr) {
-  assert(pevent_hdr->size == 0); // check for previous init
-  for (unsigned long watcher_idx = 0; watcher_idx < ctx.watchers.size();
-       ++watcher_idx) {
-    PerfWatcher *watcher = &ctx.watchers[watcher_idx];
-    if (watcher->type < kDDPROF_TYPE_CUSTOM) {
-      DDRES_CHECK_FWD(pevent_open_all_cpus(watcher, watcher_idx, pid, num_cpu,
-                                           ctx.perf_clock_source, pevent_hdr));
-    } else {
-      // custom event, eg.allocation profiling
-      size_t pevent_idx = 0;
-      DDRES_CHECK_FWD(pevent_create(pevent_hdr, watcher_idx, &pevent_idx));
-      int const order = pevent_compute_min_mmap_order(
-          k_mpsc_buffer_size_shift, watcher->options.stack_sample_size,
-          k_min_number_samples_per_ring_buffer);
-      DDRES_CHECK_FWD(ring_buffer_create(order, RingBufferType::kMPSCRingBuffer,
-                                         true, &pevent_hdr->pes[pevent_idx]));
-    }
-  }
-  return {};
-}
-
 DDRes pevent_mmap_event(PEvent *event) {
-  if (event->mapfd != -1) {
-    void *region = perfown_sz(event->mapfd, event->ring_buffer_size);
-    if (!region) {
-      DDRES_RETURN_ERROR_LOG(
-          DD_WHAT_PERFMMAP,
-          "Could not mmap memory for watcher #%d: %s. "
-          "Please increase kernel limits on pinned memory (ulimit -l). "
-          "OR associate the IPC_LOCK capability to this process.",
-          event->watcher_pos, strerror(errno));
-    }
-    if (!rb_init(&event->rb, region, event->ring_buffer_size,
-                 event->ring_buffer_type)) {
-      DDRES_RETURN_ERROR_LOG(DD_WHAT_PERFMMAP,
-                             "Could not initialize ring buffer for watcher #%d",
-                             event->watcher_pos);
-    }
-  }
-  return {};
-}
+  if (event->mapfd == -1)
+    return {};
 
-DDRes pevent_mmap(PEventHdr *pevent_hdr, bool use_override) {
-  // Switch user if needed (when root switch to nobody user)
-  // Pinned memory is accounted by the kernel by (real) uid across containers
-  // (uid 1000 in the host and in containers will share the same count).
-  // Sometimes root allowance (when no CAP_IPC_LOCK/CAP_SYS_ADMIN in a
-  // container) is already exhausted, hence we switch to a different user.
   UIDInfo info;
-  if (use_override) {
-    /* perf_event_mlock_kb is accounted per real user id */
+  void *region = perfown_sz(event->mapfd, event->ring_buffer_size);
+  if (!region) {
+    // Switch user if needed (when root switch to nobody user)
+    // Pinned memory is accounted by the kernel by (real) uid across containers
+    // (uid 1000 in the host and in containers will share the same count).
+    // Sometimes root allowance (when no CAP_IPC_LOCK/CAP_SYS_ADMIN in a
+    // container) is already exhausted, hence we switch to a different user.
     DDRES_CHECK_FWD(user_override_to_nobody_if_root(&info));
-  }
-
-  defer {
-    if (use_override) {
-      user_override(info.uid, info.gid);
-    }
-  };
-
-  auto defer_munmap = make_defer([&] { pevent_munmap(pevent_hdr); });
+    // Restore the original user as soon as the mapping attempt is over
+    defer { user_override(info.uid, info.gid); };
 
-  PEvent *pes = pevent_hdr->pes;
-  for (size_t k = 0; k < pevent_hdr->size; ++k) {
-    DDRES_CHECK_FWD(pevent_mmap_event(&pes[k]));
+    region = perfown_sz(event->mapfd, event->ring_buffer_size);
+    if (!region) {
+      DDRES_RETURN_ERROR_LOG(
+          DD_WHAT_PERFMMAP,
+          "Could not mmap memory for watcher: %s. "
+          "Please increase kernel limits on pinned memory (ulimit -l). "
+          "OR associate the IPC_LOCK capability to this process.",
+          strerror(errno));
+    }
   }
 
-  defer_munmap.release();
-
-  return {};
-}
-
-DDRes pevent_setup(DDProfContext &ctx, pid_t pid, int num_cpu,
-                   PEventHdr *pevent_hdr) {
-  DDRES_CHECK_FWD(pevent_open(ctx, pid, num_cpu, pevent_hdr));
-  if (!IsDDResOK(pevent_mmap(pevent_hdr, true))) {
-    LG_NTC("Retrying attachment without user override");
-    DDRES_CHECK_FWD(pevent_mmap(pevent_hdr, false));
+  if (!rb_init(&event->rb, region, event->ring_buffer_size,
+               event->ring_buffer_type)) {
+    DDRES_RETURN_ERROR_LOG(DD_WHAT_PERFMMAP,
+                           "Could not initialize ring buffer for watcher");
   }
-
   return {};
 }
 
-DDRes pevent_enable(PEventHdr *pevent_hdr) {
-  // Just before we enter the main loop, force the enablement of the perf
-  // contexts
-  for (size_t i = 0; i < pevent_hdr->size; ++i) {
-    if (!pevent_hdr->pes[i].custom_event) {
-      DDRES_CHECK_INT(ioctl(pevent_hdr->pes[i].fd, PERF_EVENT_IOC_ENABLE),
-                      DD_WHAT_IOCTL, "Error ioctl fd=%d (idx#%zu)",
-                      pevent_hdr->pes[i].fd, i);
-    }
-  }
-  return {};
-}
 
 DDRes pevent_munmap_event(PEvent *event) {
   if (event->rb.base) {
@@ -276,20 +102,6 @@ DDRes pevent_munmap_event(PEvent *event) {
 }
 
 /// Clean the mmap buffer
-DDRes pevent_munmap(PEventHdr *pevent_hdr) {
-  PEvent *pes = pevent_hdr->pes;
-  DDRes res{};
-
-  for (size_t k = 0; k < pevent_hdr->size; ++k) {
-    DDRes const local_res = pevent_munmap_event(&pes[k]);
-    if (!IsDDResOK(local_res)) {
-      res = local_res;
-    }
-  }
-
-  return res;
-}
-
 DDRes pevent_close_event(PEvent *event) {
   if (event->fd != -1) {
     if (close(event->fd) == -1) {
@@ -309,38 +121,4 @@ DDRes pevent_close_event(PEvent *event) {
   return {};
 }
 
-DDRes pevent_close(PEventHdr *pevent_hdr) {
-  PEvent *pes = pevent_hdr->pes;
-  DDRes res{};
-  for (size_t k = 0; k < pevent_hdr->size; ++k) {
-    DDRes const local_res = pevent_close_event(&pes[k]);
-    if (!IsDDResOK(local_res)) {
-      res = local_res;
-    }
-  }
-  pevent_hdr->size = 0;
-  return res;
-}
-
-bool pevent_include_kernel_events(const PEventHdr *pevent_hdr) {
-  for (size_t i = 0; i < pevent_hdr->nb_attrs; ++i) {
-    if (pevent_hdr->attrs[i].exclude_kernel == 0) {
-      return true;
-    }
-  }
-  return false;
-}
-
-DDRes pevent_cleanup(PEventHdr *pevent_hdr) {
-  DDRes ret = ddres_init();
-
-  // Cleanup both, storing the error if one was generated
-  if (DDRes const ret_tmp = pevent_munmap(pevent_hdr); !IsDDResOK((ret_tmp))) {
-    ret = ret_tmp;
-  }
-  if (DDRes const ret_tmp = pevent_close(pevent_hdr); !IsDDResOK((ret_tmp))) {
-    ret = ret_tmp;
-  }
-  return ret;
-}
 } // namespace ddprof
diff --git a/src/ringbuffer_utils.cc b/src/ringbuffer_utils.cc
index 15f32f22f..fb2f7cc2b 100644
--- a/src/ringbuffer_utils.cc
+++ b/src/ringbuffer_utils.cc
@@ -72,7 +72,6 @@ DDRes ring_buffer_create(size_t buffer_size_page_order,
                            "Error calling evenfd on watcher %d (%s)",
                            pevent->watcher_pos, strerror(errno));
   }
-  pevent->custom_event = custom_event;
   pevent->ring_buffer_type = ring_buffer_type;
   pevent->ring_buffer_size = buffer_size;
 