Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions include/perf_watcher_lookup.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0. This product includes software
// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
// Datadog, Inc.

#pragma once

#include "perf_ringbuffer.hpp"
#include "perf_watcher.hpp"
#include "pevent.hpp"

#include <sys/types.h>
#include <unordered_map>

namespace ddprof {

// Describes one event source the profiler polls: either a kernel perf_event
// (with its mmap'd ring buffer) or a profiler-internal (custom) ring buffer.
struct PEvent {
PerfWatcher *watcher; // non-owning; the watcher configuration for this event
int fd; // Underlying perf event FD for perf_events, otherwise an eventfd that
// signals data is available in ring buffer
int mapfd; // FD for ring buffer, same as `fd` for perf events
int cpu; // CPU id
int attr_idx; // matching perf_event_attr
size_t ring_buffer_size; // size of the ring buffer
RingBufferType ring_buffer_type; // perf-owned vs MPSC (custom) ring buffer
RingBuffer rb; // metadata and buffers for processing perf ringbuffer
};

class PEventTable {
private:
PEventTable() {}

// Lookups
std::unordered_map<uint64_t, PEvent> id_to_pevent;
std::unordered_map<int, int> cpu_to_fd;

// Stashed attrs
std::vector<perf_event_attr> attrs;

public:
PEventTable(const PEventTable&) = delete;
PEventTable& operator=(const PEventTable&) = delete;

static PEventTable& get_instance() {
static PEventTable instance;
return instance;
}

PEvent *pevent_from_id(uint64_t id) {
auto it = id_to_pevent.find(id);
return (it != id_to_pevent.end()) ? it->second : nullptr;
}

bool open_custom_watcher(PerfWatcher &watcher, pid_t pid, PerfClockSource perf_clock_source) {
PEvent event = {
fd,
fd,
cpu,
attr_id,
buffer_size_order,
RingBufferType::kPerfRingBuffer,
false,
{}
};
int const order = pevent_compute_min_mmap_order(
k_mpsc_buffer_size_shift, watcher->options.stack_sample_size,
k_min_number_samples_per_ring_buffer);
DDRES_CHECK_FWD(ring_buffer_create(order, RingBufferType::kMPSCRingBuffer,
true, &event));
}

bool open_perf_watcher(PerfWatcher &watcher, pid_t pid, PerfClockSource perf_clock_source) {
std::vector<perf_event_attr> possible_attrs = all_perf_configs_from_watcher(&watcher, true, perf_clock_source);

// We have a number of configurations and we need to try them on all CPUs. We prefer the earlier configurations,
// but can failover to the later ones. If a configuration fails, it should not be used again. Generally, either
// all or none of a configuration will work. If we fail midway through, we take what we can get. We return
// false if no configs succeed
for (int cpu = 0; cpu < num_cpu; ++cpu) {
auto it = possible_attrs.begin();
while (it != possible_attrs.end()) {
int fd = perf_event_open(it, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
if (fd == -1) {
# warning TODO add error here
it = possible_attrs.erase(it); // Don't retry this config
}

// Get the ID
uint64_t sample_id = 0;
if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &sample_id)) {
// If I can't get the sample, then I can't use this event.
LG_WARN("Error getting perf sample ID\n");
close(fd);
continue;
}

// Store the attr
int attr_id = attrs.size();
attrs.push_back(it);

// Figure out which buffer size to use
static_bool log_once = true;
int const buffer_size_order = pevent_compute_min_mmap_order(
k_default_buffer_size_shift, stack_sample_size,
k_min_number_samples_per_ring_buffer);
if (buffer_size_order > k_default_buffer_size_shift && log_once) {
# warning TODO add more errors here
}

// Make a PEvent now for the next part; this will get moved into storage if the next operations are successful, but it can't
// be moved yet because mapping may still fail
PEvent event = {
fd,
fd,
cpu,
attr_id,
buffer_size_order,
RingBufferType::kPerfRingBuffer,
false,
{}
};

// We have enough information to configure the ringbuffer. If this CPU
// already has a perf event on it, then multiplex the ringbuffer
auto fd_it = cpu_to_fd.find(cpu);
if (fd_it != fd_it.end()) {
// This CPU already has a perf_event ringbuffer, so just use that
auto cpu_fd = fd_it->second;
if (ioctl(event->mapfd, PERF_EVENT_IOC_SET_OUTPUT, cpu_fd)) {
# warning TODO add more errors
}
event->mapfd = fd_it->second;
} else {
// This CPU does not have a perf_event ringbuffer, so make one
pevent_mmap_event(event);
cpu_to_fd[cpu] = event->mapfd;
}

// Successful, don't retry anymore!
id_to_pevent.emplace(sample_id, std::move(event));
} // try to open
} // cpu
}

bool open_watcher(PerfWatcher &watcher, pid_t pid, PerfClockSource perf_clock_source) {
if (watcher->type < kDDPROF_TYPE_CUSTOM) {
...
} else {
}

}

DDRes enable_all() {
// Just before we enter the main loop, force the enablement of the perf
// contexts
for (const auto& [_, event] : id_to_pevent) {
(void)_;
if (event.watcher->type < kDDPROF_TYPE_CUSTOM) {
# warning TODO better error
// DDRES_CHECK_INT(ioctl(event.fd, PERF_EVENT_IOC_ENABLE),
// DD_WHAT_IOCTL, "Error ioctl fd=%d (idx#%zu)",
// event.fd, i);
}
}
return {};
}

DDRes cleanup() {
DDRes ret = ddres_init();

// Cleanup both, storing the error if one was generated
for (const auto& [_, event] : id_to_pevent) {
(void)_;
if (DDRes const ret_tmp = pevent_munmap_event(event), !IsDDResOK((ret_tmp))) {
ret = ret_tmp;
}
if (DDRes const ret_tmp = pevent_close_event(event), !IsDDResOK((ret_tmp))) {
ret = ret_tmp;
}
}

// Now let's reset the storage
id_to_pevent.clear();
cpu_to_fd.clear();
attrs.clear();

return ret;
}

void pollfd_setup(struct pollfd *pfd, int *pfd_len) {
// Setup poll() to watch perf_event file descriptors
for (const auto& [_, event] : id_to_pevent) {
// NOTE: if fd is negative, it will be ignored
pfd[i].fd = event.fd;
pfd[i].events = POLLIN | POLLERR | POLLHUP;
}
}

};

} // namespace ddprof
23 changes: 10 additions & 13 deletions src/ddprof.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,14 @@ void display_system_info() {
} // namespace

DDRes ddprof_setup(DDProfContext &ctx) {
PEventHdr *pevent_hdr = &ctx.worker_ctx.pevent_hdr;
try {
pevent_init(pevent_hdr);

display_system_info();

// Open perf events and mmap events right now to start receiving events
// mmaps from perf fds will be lost after fork; that is why we mmap them again
// in worker (but kernel only accounts for the pinned memory once).
DDRES_CHECK_FWD(
pevent_setup(ctx, ctx.params.pid, ctx.params.num_cpu, pevent_hdr));
auto pe_table = PEventTable::get_instance();
for (auto *watcher : ctx.watchers) {
pe_table.open_watcher(watcher, pid, num_cpu, ctx.perf_clock_source);
}

// Setup signal handler if defined
if (ctx.params.fault_info) {
Expand All @@ -100,17 +97,17 @@ DDRes ddprof_setup(DDProfContext &ctx) {

DDRES_CHECK_FWD(ddprof_stats_init());

DDRES_CHECK_FWD(pevent_enable(pevent_hdr));
DDRES_CHECK_FWD(pe_table.enable_all());
}
CatchExcept2DDRes();
return {};
}

DDRes ddprof_teardown(DDProfContext &ctx) {
PEventHdr *pevent_hdr = &ctx.worker_ctx.pevent_hdr;

if (IsDDResNotOK(pevent_cleanup(pevent_hdr))) {
LG_WRN("Error when calling pevent_cleanup.");
DDRes ddprof_teardown() {

auto &pe_table = PEventTable::get_instance();
if (IsDDResNotOK(pe_table.cleanup())) {
LG_WRN("Error when calling pe_table.cleanup.");
}

if (IsDDResNotOK(ddprof_stats_free())) {
Expand Down
2 changes: 1 addition & 1 deletion src/ddprof_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ DDRes worker_library_init(DDProfContext &ctx,
// register the existing persistent storage for the state
ctx.worker_ctx.persistent_worker_state = persistent_worker_state;

PEventHdr *pevent_hdr = &ctx.worker_ctx.pevent_hdr;
auto &pe_table = PETable::get_instance();

// If we're here, then we are a child spawned during the startup operation.
// That means we need to iterate through the perf_event_open() handles and
Expand Down
12 changes: 0 additions & 12 deletions src/perf_mainloop.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,18 +109,6 @@ DDRes spawn_workers(PersistentWorkerState *persistent_worker_state,
return {};
}

void pollfd_setup(const PEventHdr *pevent_hdr, struct pollfd *pfd,
int *pfd_len) {
*pfd_len = pevent_hdr->size;
const PEvent *pes = pevent_hdr->pes;
// Setup poll() to watch perf_event file descriptors
for (int i = 0; i < *pfd_len; ++i) {
// NOTE: if fd is negative, it will be ignored
pfd[i].fd = pes[i].fd;
pfd[i].events = POLLIN | POLLERR | POLLHUP;
}
}

DDRes signalfd_setup(pollfd *pfd) {
sigset_t mask;

Expand Down
6 changes: 3 additions & 3 deletions src/perf_watcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@

namespace ddprof {

#define BASE_STYPES \
(PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER | PERF_SAMPLE_TID | \
PERF_SAMPLE_TIME | PERF_SAMPLE_PERIOD)
#define BASE_STYPES \
(PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_STACK_USER | PERF_SAMPLE_REGS_USER | \
PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_PERIOD)

uint64_t perf_event_default_sample_type() { return BASE_STYPES; }

Expand Down
Loading