diff --git a/examples/c/.gitignore b/examples/c/.gitignore
index 2a7351f4..fc33e7f0 100644
--- a/examples/c/.gitignore
+++ b/examples/c/.gitignore
@@ -15,3 +15,4 @@
 /lsm
 /cmake-build-debug/
 /cmake-build-release/
+/sim
diff --git a/examples/c/Makefile b/examples/c/Makefile
index faefebc9..db6a113a 100644
--- a/examples/c/Makefile
+++ b/examples/c/Makefile
@@ -25,7 +25,7 @@ CFLAGS := -g -Wall
 ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
 
 APPS = minimal minimal_legacy minimal_ns bootstrap uprobe kprobe fentry \
-       usdt sockfilter tc ksyscall task_iter lsm
+       usdt sockfilter tc ksyscall task_iter lsm sim
 
 CARGO ?= $(shell which cargo)
 ifeq ($(strip $(CARGO)),)
diff --git a/examples/c/hashmap.c b/examples/c/hashmap.c
new file mode 100644
index 00000000..140ee405
--- /dev/null
+++ b/examples/c/hashmap.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * Generic non-thread safe hash map implementation.
+ *
+ * Copyright (c) 2019 Facebook
+ */
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <linux/err.h>
+#include "hashmap.h"
+
+/* make sure libbpf doesn't use kernel-only integer typedefs */
+#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
+
+/* prevent accidental re-addition of reallocarray() */
+#pragma GCC poison reallocarray
+
+/* start with 4 buckets */
+#define HASHMAP_MIN_CAP_BITS 2
+
+static void hashmap_add_entry(struct hashmap_entry **pprev,
+			      struct hashmap_entry *entry)
+{
+	entry->next = *pprev;
+	*pprev = entry;
+}
+
+static void hashmap_del_entry(struct hashmap_entry **pprev,
+			      struct hashmap_entry *entry)
+{
+	*pprev = entry->next;
+	entry->next = NULL;
+}
+
+void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn,
+		   hashmap_equal_fn equal_fn, void *ctx)
+{
+	map->hash_fn = hash_fn;
+	map->equal_fn = equal_fn;
+	map->ctx = ctx;
+
+	map->buckets = NULL;
+	map->cap = 0;
+	map->cap_bits = 0;
+	map->sz = 0;
+}
+
+struct hashmap *hashmap__new(hashmap_hash_fn hash_fn,
+			     hashmap_equal_fn equal_fn,
+			     void *ctx)
+{
+	struct hashmap *map = malloc(sizeof(struct hashmap));
+
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+	hashmap__init(map, hash_fn, equal_fn, ctx);
+	return map;
+}
+
+void hashmap__clear(struct hashmap *map)
+{
+	struct hashmap_entry *cur, *tmp;
+	size_t bkt;
+
+	hashmap__for_each_entry_safe(map, cur, tmp, bkt) {
+		free(cur);
+	}
+	free(map->buckets);
+	map->buckets = NULL;
+	map->cap = map->cap_bits = map->sz = 0;
+}
+
+void hashmap__free(struct hashmap *map)
+{
+	if (IS_ERR_OR_NULL(map))
+		return;
+
+	hashmap__clear(map);
+	free(map);
+}
+
+size_t hashmap__size(const struct hashmap *map)
+{
+	return map->sz;
+}
+
+size_t hashmap__capacity(const struct hashmap *map)
+{
+	return map->cap;
+}
+
+static bool hashmap_needs_to_grow(struct hashmap *map)
+{
+	/* grow if empty or more than 75% filled */
+	return (map->cap == 0) || ((map->sz + 1) * 4 / 3 > map->cap);
+}
+
+static int hashmap_grow(struct hashmap *map)
+{
+	struct hashmap_entry **new_buckets;
+	struct hashmap_entry *cur, *tmp;
+	size_t new_cap_bits, new_cap;
+	size_t h, bkt;
+
+	new_cap_bits = map->cap_bits + 1;
+	if (new_cap_bits < HASHMAP_MIN_CAP_BITS)
+		new_cap_bits = HASHMAP_MIN_CAP_BITS;
+
+	new_cap = 1UL << new_cap_bits;
+	new_buckets = calloc(new_cap, sizeof(new_buckets[0]));
+	if (!new_buckets)
+		return -ENOMEM;
+
+	hashmap__for_each_entry_safe(map, cur, tmp, bkt) {
+		h = hash_bits(map->hash_fn(cur->key, map->ctx), new_cap_bits);
+		hashmap_add_entry(&new_buckets[h], cur);
+	}
+
+	map->cap = new_cap;
+	map->cap_bits = new_cap_bits;
+	free(map->buckets);
+	map->buckets = new_buckets;
+
+	return 0;
+}
+
+static bool hashmap_find_entry(const struct hashmap *map,
+			       const long key, size_t hash,
+			       struct hashmap_entry ***pprev,
+			       struct hashmap_entry **entry)
+{
+	struct hashmap_entry *cur, **prev_ptr;
+
+	if (!map->buckets)
+		return false;
+
+	for (prev_ptr = &map->buckets[hash], cur = *prev_ptr;
+	     cur;
+	     prev_ptr = &cur->next, cur = cur->next) {
+		if (map->equal_fn(cur->key, key, map->ctx)) {
+			if (pprev)
+				*pprev = prev_ptr;
+			*entry = cur;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+int hashmap_insert(struct hashmap *map, long key, long value,
+		   enum hashmap_insert_strategy strategy,
+		   long *old_key, long *old_value)
+{
+	struct hashmap_entry *entry;
+	size_t h;
+	int err;
+
+	if (old_key)
+		*old_key = 0;
+	if (old_value)
+		*old_value = 0;
+
+	h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits);
+	if (strategy != HASHMAP_APPEND &&
+	    hashmap_find_entry(map, key, h, NULL, &entry)) {
+		if (old_key)
+			*old_key = entry->key;
+		if (old_value)
+			*old_value = entry->value;
+
+		if (strategy == HASHMAP_SET || strategy == HASHMAP_UPDATE) {
+			entry->key = key;
+			entry->value = value;
+			return 0;
+		} else if (strategy == HASHMAP_ADD) {
+			return -EEXIST;
+		}
+	}
+
+	if (strategy == HASHMAP_UPDATE)
+		return -ENOENT;
+
+	if (hashmap_needs_to_grow(map)) {
+		err = hashmap_grow(map);
+		if (err)
+			return err;
+		h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits);
+	}
+
+	entry = malloc(sizeof(struct hashmap_entry));
+	if (!entry)
+		return -ENOMEM;
+
+	entry->key = key;
+	entry->value = value;
+	hashmap_add_entry(&map->buckets[h], entry);
+	map->sz++;
+
+	return 0;
+}
+
+bool hashmap_find(const struct hashmap *map, long key, long *value)
+{
+	struct hashmap_entry *entry;
+	size_t h;
+
+	h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits);
+	if (!hashmap_find_entry(map, key, h, NULL, &entry))
+		return false;
+
+	if (value)
+		*value = entry->value;
+	return true;
+}
+
+bool hashmap_delete(struct hashmap *map, long key,
+		    long *old_key, long *old_value)
+{
+	struct hashmap_entry **pprev, *entry;
+	size_t h;
+
+	h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits);
+	if (!hashmap_find_entry(map, key, h, &pprev, &entry))
+		return false;
+
+	if (old_key)
+		*old_key = entry->key;
+	if (old_value)
+		*old_value = entry->value;
+
+	hashmap_del_entry(pprev, entry);
+	free(entry);
+	map->sz--;
+
+	return true;
+}
diff --git a/examples/c/hashmap.h b/examples/c/hashmap.h
new file mode 100644
index 00000000..c12f8320
--- /dev/null
+++ b/examples/c/hashmap.h
@@ -0,0 +1,208 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+
+/*
+ * Generic non-thread safe hash map implementation.
+ *
+ * Copyright (c) 2019 Facebook
+ */
+#ifndef __LIBBPF_HASHMAP_H
+#define __LIBBPF_HASHMAP_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <limits.h>
+
+static inline size_t hash_bits(size_t h, int bits)
+{
+	/* shuffle bits and return requested number of upper bits */
+	if (bits == 0)
+		return 0;
+
+#if (__SIZEOF_SIZE_T__ == __SIZEOF_LONG_LONG__)
+	/* LP64 case */
+	return (h * 11400714819323198485llu) >> (__SIZEOF_LONG_LONG__ * 8 - bits);
+#elif (__SIZEOF_SIZE_T__ <= __SIZEOF_LONG__)
+	return (h * 2654435769lu) >> (__SIZEOF_LONG__ * 8 - bits);
+#else
+#	error "Unsupported size_t size"
+#endif
+}
+
+/* generic C-string hashing function */
+static inline size_t str_hash(const char *s)
+{
+	size_t h = 0;
+
+	while (*s) {
+		h = h * 31 + *s;
+		s++;
+	}
+	return h;
+}
+
+typedef size_t (*hashmap_hash_fn)(long key, void *ctx);
+typedef bool (*hashmap_equal_fn)(long key1, long key2, void *ctx);
+
+/*
+ * Hashmap interface is polymorphic, keys and values could be either
+ * long-sized integers or pointers, this is achieved as follows:
+ * - interface functions that operate on keys and values are hidden
+ *   behind auxiliary macros, e.g. hashmap_insert <-> hashmap__insert;
+ * - these auxiliary macros cast the key and value parameters as
+ *   long or long *, so the user does not have to specify the casts explicitly;
+ * - for pointer parameters (e.g. old_key) the size of the pointed
+ *   type is verified by hashmap_cast_ptr using _Static_assert;
+ * - when iterating using hashmap__for_each_* forms
+ *   hashmap_entry->key should be used for integer keys and
+ *   hashmap_entry->pkey should be used for pointer keys,
+ *   same goes for values.
+ */
+struct hashmap_entry {
+	union {
+		long key;
+		const void *pkey;
+	};
+	union {
+		long value;
+		void *pvalue;
+	};
+	struct hashmap_entry *next;
+};
+
+struct hashmap {
+	hashmap_hash_fn hash_fn;
+	hashmap_equal_fn equal_fn;
+	void *ctx;
+
+	struct hashmap_entry **buckets;
+	size_t cap;
+	size_t cap_bits;
+	size_t sz;
+};
+
+void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn,
+		   hashmap_equal_fn equal_fn, void *ctx);
+struct hashmap *hashmap__new(hashmap_hash_fn hash_fn,
+			     hashmap_equal_fn equal_fn,
+			     void *ctx);
+void hashmap__clear(struct hashmap *map);
+void hashmap__free(struct hashmap *map);
+
+size_t hashmap__size(const struct hashmap *map);
+size_t hashmap__capacity(const struct hashmap *map);
+
+/*
+ * Hashmap insertion strategy:
+ * - HASHMAP_ADD - only add key/value if key doesn't exist yet;
+ * - HASHMAP_SET - add key/value pair if key doesn't exist yet; otherwise,
+ *   update value;
+ * - HASHMAP_UPDATE - update value, if key already exists; otherwise, do
+ *   nothing and return -ENOENT;
+ * - HASHMAP_APPEND - always add key/value pair, even if key already exists.
+ *   This turns hashmap into a multimap by allowing multiple values to be
+ *   associated with the same key. Most useful read API for such hashmap is
+ *   hashmap__for_each_key_entry() iteration. If hashmap__find() is still
+ *   used, it will return last inserted key/value entry (first in a bucket
+ *   chain).
+ */
+enum hashmap_insert_strategy {
+	HASHMAP_ADD,
+	HASHMAP_SET,
+	HASHMAP_UPDATE,
+	HASHMAP_APPEND,
+};
+
+#define hashmap_cast_ptr(p) ({								\
+	_Static_assert((__builtin_constant_p((p)) ? (p) == NULL : 0) ||		\
+				sizeof(*(p)) == sizeof(long),				\
+		       #p " pointee should be a long-sized integer or a pointer");	\
+	(long *)(p);									\
+})
+
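A quick usage sketch may make the macro layer above more concrete. This is illustrative only and not part of the diff; `id_hash`, `id_equal` and the literal keys are made-up names, and the code relies only on the API declared in hashmap.h:

```c
#include <stdio.h>
#include "hashmap.h"

static size_t id_hash(long key, void *ctx) { return key; }
static bool id_equal(long a, long b, void *ctx) { return a == b; }

static void example(void)
{
	struct hashmap m;
	long cnt;

	hashmap__init(&m, id_hash, id_equal, NULL);

	hashmap__add(&m, 1234, 1);                /* HASHMAP_ADD: -EEXIST if 1234 is already present */
	hashmap__set(&m, 1234, 2, NULL, NULL);    /* HASHMAP_SET: insert-or-overwrite */
	hashmap__update(&m, 1234, 3, NULL, NULL); /* HASHMAP_UPDATE: -ENOENT if 1234 were missing */

	/* cnt must be long-sized; hashmap_cast_ptr() enforces that at compile time */
	if (hashmap__find(&m, 1234, &cnt))
		printf("1234 -> %ld\n", cnt);

	hashmap__clear(&m);
}
```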
+/*
+ * hashmap__insert() adds key/value entry w/ various semantics, depending on
+ * provided strategy value. If a given key/value pair replaced already
+ * existing key/value pair, both old key and old value will be returned
+ * through old_key and old_value to allow calling code do proper memory
+ * management.
+ */
+int hashmap_insert(struct hashmap *map, long key, long value,
+		   enum hashmap_insert_strategy strategy,
+		   long *old_key, long *old_value);
+
+#define hashmap__insert(map, key, value, strategy, old_key, old_value)	\
+	hashmap_insert((map), (long)(key), (long)(value), (strategy),	\
+		       hashmap_cast_ptr(old_key),			\
+		       hashmap_cast_ptr(old_value))
+
+#define hashmap__add(map, key, value) \
+	hashmap__insert((map), (key), (value), HASHMAP_ADD, NULL, NULL)
+
+#define hashmap__set(map, key, value, old_key, old_value) \
+	hashmap__insert((map), (key), (value), HASHMAP_SET, (old_key), (old_value))
+
+#define hashmap__update(map, key, value, old_key, old_value) \
+	hashmap__insert((map), (key), (value), HASHMAP_UPDATE, (old_key), (old_value))
+
+#define hashmap__append(map, key, value) \
+	hashmap__insert((map), (key), (value), HASHMAP_APPEND, NULL, NULL)
+
+bool hashmap_delete(struct hashmap *map, long key, long *old_key, long *old_value);
+
+#define hashmap__delete(map, key, old_key, old_value)	\
+	hashmap_delete((map), (long)(key),		\
+		       hashmap_cast_ptr(old_key),	\
+		       hashmap_cast_ptr(old_value))
+
+bool hashmap_find(const struct hashmap *map, long key, long *value);
+
+#define hashmap__find(map, key, value) \
+	hashmap_find((map), (long)(key), hashmap_cast_ptr(value))
+
+/*
+ * hashmap__for_each_entry - iterate over all entries in hashmap
+ * @map: hashmap to iterate
+ * @cur: struct hashmap_entry * used as a loop cursor
+ * @bkt: integer used as a bucket loop cursor
+ */
+#define hashmap__for_each_entry(map, cur, bkt)			    \
+	for (bkt = 0; bkt < map->cap; bkt++)			    \
+		for (cur = map->buckets[bkt]; cur; cur = cur->next)
+
+/*
+ * hashmap__for_each_entry_safe - iterate over all entries in hashmap, safe
+ * against removals
+ * @map: hashmap to iterate
+ * @cur: struct hashmap_entry * used as a loop cursor
+ * @tmp: struct hashmap_entry * used as a temporary next cursor storage
+ * @bkt: integer used as a bucket loop cursor
+ */
+#define hashmap__for_each_entry_safe(map, cur, tmp, bkt)	    \
+	for (bkt = 0; bkt < map->cap; bkt++)			    \
+		for (cur = map->buckets[bkt];			    \
+		     cur && ({tmp = cur->next; true; });	    \
+		     cur = tmp)
+
+/*
+ * hashmap__for_each_key_entry - iterate over entries associated with given key
+ * @map: hashmap to iterate
+ * @cur: struct hashmap_entry * used as a loop cursor
+ * @key: key to iterate entries for
+ */
+#define hashmap__for_each_key_entry(map, cur, _key)		    \
+	for (cur = map->buckets					    \
+		     ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \
+		     : NULL;					    \
+	     cur;						    \
+	     cur = cur->next)					    \
+		if (map->equal_fn(cur->key, (_key), map->ctx))
+
+#define hashmap__for_each_key_entry_safe(map, cur, tmp, _key)	    \
+	for (cur = map->buckets					    \
+		     ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \
+		     : NULL;					    \
+	     cur && ({ tmp = cur->next; true; });		    \
+	     cur = tmp)						    \
+		if (map->equal_fn(cur->key, (_key), map->ctx))
+
+#endif /* __LIBBPF_HASHMAP_H */
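The iteration macros are easiest to read next to a small example. Again illustrative only, not part of the diff; it assumes a map that was populated with hashmap__append() (the HASHMAP_APPEND multimap mode described above), and `dump_all` is a made-up helper name:

```c
#include <stdio.h>
#include "hashmap.h"

/* Walks all values appended under `key`, then every entry in the map. */
static void dump_all(struct hashmap *m, long key)
{
	struct hashmap_entry *cur;
	size_t bkt;

	/* with HASHMAP_APPEND there may be several entries for the same key */
	hashmap__for_each_key_entry(m, cur, key)
		printf("%ld -> %ld\n", cur->key, cur->value);

	/* every entry, walked bucket by bucket */
	hashmap__for_each_entry(m, cur, bkt)
		printf("bucket %zu: %ld -> %ld\n", bkt, cur->key, cur->value);
}
```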
diff --git a/examples/c/profile.bpf.c b/examples/c/profile.bpf.c
index bdcc029f..3fd72fa9 100644
--- a/examples/c/profile.bpf.c
+++ b/examples/c/profile.bpf.c
@@ -9,35 +9,207 @@ char LICENSE[] SEC("license") = "Dual BSD/GPL";
+struct task_state {
+	__u64 ts;
+	enum task_status status;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, int);	/* task_id, see task_id() */
+	__type(value, struct task_state);
+	__uint(max_entries, MAX_THREAD_CNT);
+} states SEC(".maps");
+
 struct {
 	__uint(type, BPF_MAP_TYPE_RINGBUF);
-	__uint(max_entries, 256 * 1024);
-} events SEC(".maps");
+	__uint(max_entries, RINGBUF_SZ);
+} rb SEC(".maps");
 
-SEC("perf_event")
-int profile(void *ctx)
+__u64 session_start_ts;
+
+const volatile int cpu_id = 0;
+
+static struct task_state empty_task_state;
+
+static __always_inline int task_id(int pid)
 {
-	int pid = bpf_get_current_pid_tgid() >> 32;
-	int cpu_id = bpf_get_smp_processor_id();
-	struct stacktrace_event *event;
-	int cp;
+	/* use CPU ID for identifying idle tasks */
+	return pid ?: -(bpf_get_smp_processor_id() + 1);
+}
 
-	event = bpf_ringbuf_reserve(&events, sizeof(*event), 0);
-	if (!event)
-		return 1;
+static struct task_state *task_state(int pid)
+{
+	struct task_state *s;
+	int id = task_id(pid);
 
-	event->pid = pid;
-	event->cpu_id = cpu_id;
+	s = bpf_map_lookup_elem(&states, &id);
+	if (!s) {
+		bpf_map_update_elem(&states, &id, &empty_task_state, BPF_NOEXIST);
+		s = bpf_map_lookup_elem(&states, &id);
+	}
 
-	if (bpf_get_current_comm(event->comm, sizeof(event->comm)))
-		event->comm[0] = 0;
+	return s;
+}
+
+/* don't create an entry if it's not there already */
+static struct task_state *task_state_peek(int pid)
+{
+	int id = task_id(pid);
+
+	return bpf_map_lookup_elem(&states, &id);
+}
+
+static void task_state_delete(int pid)
+{
+	int id = task_id(pid);
+
+	bpf_map_delete_elem(&states, &id);
+}
+
+static int emit_event(enum event_kind kind, u64 now_ts, struct task_struct *p, u64 duration_ns)
+{
+	struct wprof_event *e;
+
+	e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+	if (!e)
+		return -1;
+
+	e->kind = kind;
+	e->ts = now_ts;
+	e->cpu_id = bpf_get_smp_processor_id();
+	e->pid = p->pid;
+	e->tgid = p->tgid;
+	__builtin_memcpy(e->comm, p->comm, sizeof(e->comm));
+
+	e->duration_ns = duration_ns;
+
+	e->kstack_sz = 0;
+	e->ustack_sz = 0;
+
+	/*
 	event->kstack_sz = bpf_get_stack(ctx, event->kstack, sizeof(event->kstack), 0);
 	event->ustack_sz = bpf_get_stack(ctx, event->ustack, sizeof(event->ustack), BPF_F_USER_STACK);
+	*/
+
+	bpf_ringbuf_submit(e, 0);
+	return 0;
+}
+
+SEC("perf_event")
+int wprof_tick(void *ctx)
+{
+	struct task_state *scur;
+	struct task_struct *cur = bpf_get_current_task_btf();
+	u64 now_ts, dur_ns;
+
+	if (!session_start_ts)
+		return 0;
+
+	if (cpu_id && bpf_get_smp_processor_id() != cpu_id)
+		return 0;
+
+	scur = task_state(cur->pid);
+	if (!scur)
+		return 0; /* shouldn't happen, unless we ran out of space */
+
+	now_ts = bpf_ktime_get_ns();
+
+	/* cur task was on-cpu since last checkpoint */
+	dur_ns = now_ts - (scur->ts ?: session_start_ts);
+	emit_event(EV_TIMER, now_ts, cur, dur_ns);
+
+	scur->ts = now_ts;
+	scur->status = STATUS_ON_CPU;
+
+	return 0;
+}
+
+SEC("tp_btf/sched_switch")
+int BPF_PROG(wprof_switch,
+	     bool preempt,
+	     struct task_struct *prev,
+	     struct task_struct *next,
+	     unsigned prev_state)
+{
+	struct task_state *sprev, *snext;
+	u64 now_ts, dur_ns;
+
+	if (!session_start_ts)
+		return 0;
+
+	if (cpu_id && bpf_get_smp_processor_id() != cpu_id)
+		return 0;
+
+	sprev = task_state(prev->pid);
+	snext = task_state(next->pid);
+	if (!sprev || !snext)
+		return 0;
+
+	now_ts = bpf_ktime_get_ns();
+
+	/* prev task was on-cpu since last checkpoint */
+	dur_ns = now_ts - (sprev->ts ?: session_start_ts);
+	emit_event(EV_ON_CPU, now_ts, prev, dur_ns);
+
+	/* next task was off-cpu since last checkpoint */
+	dur_ns = now_ts - (snext->ts ?: session_start_ts);
+	emit_event(EV_OFF_CPU, now_ts, next, dur_ns);
+
+	sprev->ts = now_ts;
+	snext->ts = now_ts;
+
+	return 0;
+}
+
+SEC("tp_btf/sched_wakeup_new")
+int BPF_PROG(wprof_task_wakeup_new, struct task_struct *p)
+{
+	struct task_state *s;
+	u64 now_ts;
+
+	if (!session_start_ts)
+		return 0;
+
+	if (cpu_id && bpf_get_smp_processor_id() != cpu_id)
+		return 0;
+
+	s = task_state(p->pid);
+	if (!s)
+		return 0;
+
+	now_ts = bpf_ktime_get_ns();
+	s->ts = now_ts;
+	s->status = STATUS_OFF_CPU;
+
+	return 0;
+}
+
+SEC("tp_btf/sched_process_exit")
+int BPF_PROG(wprof_task_exit, struct task_struct *p)
+{
+	struct task_state *s;
+	enum event_kind kind;
+	u64 now_ts;
+	int id;
+
+	if (!session_start_ts)
+		return 0;
+
+	if (cpu_id && bpf_get_smp_processor_id() != cpu_id)
+		return 0;
+
+	s = task_state_peek(p->pid);
+	if (!s)
+		return 0;
+
+	now_ts = bpf_ktime_get_ns();
+	kind = s->status == STATUS_ON_CPU ? EV_ON_CPU : EV_OFF_CPU;
+	emit_event(kind, now_ts, p, now_ts - s->ts);
 
-	bpf_ringbuf_submit(event, 0);
+	task_state_delete(p->pid);
 
 	return 0;
 }
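Both profile.bpf.c above and profile.c below use the same task_id() trick: PID 0 is every CPU's idle task, so idle tasks are keyed by a negative, CPU-derived id instead of the PID. A standalone sketch of the encoding and decoding, illustrative only (`encode_task_id` is a made-up name):

```c
#include <stdio.h>

/* pid != 0: use the pid itself; pid == 0 (idle): use -(cpu + 1) so each CPU's
 * idle task gets a distinct, unmistakably negative key */
static long encode_task_id(int pid, int cpu)
{
	return pid ?: -(cpu + 1);
}

int main(void)
{
	long id = encode_task_id(0, 3);

	if (id < 0)
		printf("idle task on CPU %ld\n", -id - 1);	/* prints CPU 3 */
	else
		printf("PID %ld\n", id);
	return 0;
}
```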
diff --git a/examples/c/profile.c b/examples/c/profile.c
index 4be01a66..96c8b379 100644
--- a/examples/c/profile.c
+++ b/examples/c/profile.c
@@ -11,10 +11,14 @@
 #include <sys/syscall.h>
 #include <bpf/libbpf.h>
 #include <bpf/bpf.h>
+#include <signal.h>
+#include <sys/time.h>
+#include <time.h>
 #include "profile.skel.h"
 #include "profile.h"
 #include "blazesym.h"
+#include "hashmap.h"
 
 /*
  * This function is from libbpf, but it is not a public API and can only be
@@ -36,31 +40,31 @@ static struct blaze_symbolizer *symbolizer;
 static void print_frame(const char *name, uintptr_t input_addr, uintptr_t addr,
 			uint64_t offset, const blaze_symbolize_code_info* code_info)
 {
-    // If we have an input address we have a new symbol.
-    if (input_addr != 0) {
-      printf("%016lx: %s @ 0x%lx+0x%lx", input_addr, name, addr, offset);
-      if (code_info != NULL && code_info->dir != NULL && code_info->file != NULL) {
-        printf(" %s/%s:%u\n", code_info->dir, code_info->file, code_info->line);
-      } else if (code_info != NULL && code_info->file != NULL) {
-        printf(" %s:%u\n", code_info->file, code_info->line);
-      } else {
-        printf("\n");
-      }
-    } else {
-      printf("%16s %s", "", name);
-      if (code_info != NULL && code_info->dir != NULL && code_info->file != NULL) {
-        printf("@ %s/%s:%u [inlined]\n", code_info->dir, code_info->file, code_info->line);
-      } else if (code_info != NULL && code_info->file != NULL) {
-        printf("@ %s:%u [inlined]\n", code_info->file, code_info->line);
-      } else {
-        printf("[inlined]\n");
-      }
-    }
+	/* If we have an input address we have a new symbol. */
+	if (input_addr != 0) {
+		printf("%016lx: %s @ 0x%lx+0x%lx", input_addr, name, addr, offset);
+		if (code_info != NULL && code_info->dir != NULL && code_info->file != NULL) {
+			printf(" %s/%s:%u\n", code_info->dir, code_info->file, code_info->line);
+		} else if (code_info != NULL && code_info->file != NULL) {
+			printf(" %s:%u\n", code_info->file, code_info->line);
+		} else {
+			printf("\n");
+		}
+	} else {
+		printf("%16s %s", "", name);
+		if (code_info != NULL && code_info->dir != NULL && code_info->file != NULL) {
+			printf("@ %s/%s:%u [inlined]\n", code_info->dir, code_info->file, code_info->line);
+		} else if (code_info != NULL && code_info->file != NULL) {
+			printf("@ %s:%u [inlined]\n", code_info->file, code_info->line);
+		} else {
+			printf("[inlined]\n");
+		}
+	}
 }
 
 static void show_stack_trace(__u64 *stack, int stack_sz, pid_t pid)
 {
-  const struct blaze_symbolize_inlined_fn* inlined;
+	const struct blaze_symbolize_inlined_fn* inlined;
 	const struct blaze_result *result;
 	const struct blaze_sym *sym;
 	int i, j;
@@ -91,46 +95,165 @@ static void show_stack_trace(__u64 *stack, int stack_sz, pid_t pid)
 			continue;
 		}
 
-    sym = &result->syms[i];
-    print_frame(sym->name, stack[i], sym->addr, sym->offset, &sym->code_info);
+		sym = &result->syms[i];
+		print_frame(sym->name, stack[i], sym->addr, sym->offset, &sym->code_info);
 
-    for (j = 0; j < sym->inlined_cnt; j++) {
-      inlined = &sym->inlined[j];
-      print_frame(sym->name, 0, 0, 0, &inlined->code_info);
-    }
+		for (j = 0; j < sym->inlined_cnt; j++) {
+			inlined = &sym->inlined[j];
+			print_frame(sym->name, 0, 0, 0, &inlined->code_info);
+		}
 	}
 
 	blaze_result_free(result);
 }
 
+static long task_id(int pid, int cpu_id)
+{
+	return pid ?: -(cpu_id + 1);
+}
+
+#define STATS_PERIOD_MS 5000
+
+struct task_stats {
+	uint32_t on_cpu_us;
+	uint32_t off_cpu_us;
+};
+
+static struct hashmap *stats;
+static bool verbose;
+
+static size_t hash_identity_fn(long key, void *ctx)
+{
+	return key;
+}
+
+static bool hash_equal_fn(long k1, long k2, void *ctx)
+{
+	return k1 == k2;
+}
+
+static void sig_timer(int sig)
+{
+	struct hashmap_entry *cur, *tmp;
+	int bkt;
+	union {
+		struct task_stats st;
+		long opaque;
+	} v;
+
+	printf("===============================\n");
+	hashmap__for_each_entry_safe(stats, cur, tmp, bkt) {
+		v.opaque = cur->value;
+		if (cur->key < 0) {
+			printf("IDLE(%ld): ONCPU = %ums OFFCPU = %ums\n",
+			       -cur->key - 1, v.st.on_cpu_us / 1000, v.st.off_cpu_us / 1000);
+		} else {
+			printf("PID(%ld): ONCPU = %ums OFFCPU = %ums\n",
+			       cur->key, v.st.on_cpu_us / 1000, v.st.off_cpu_us / 1000);
+		}
+
+		hashmap__delete(stats, cur->key, NULL, NULL);
+	}
+	printf("-------------------------------\n");
+}
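sig_timer() above (and event_handler() below) rely on `struct task_stats` fitting into the hashmap's long-sized value slot, packed and unpacked through a union. A self-contained sketch of that pattern, illustrative only (`pack_stats`/`unpack_stats` are made-up helpers; it assumes a 64-bit target where the two 32-bit counters fit in a long):

```c
#include <assert.h>
#include <stdint.h>

struct task_stats {
	uint32_t on_cpu_us;
	uint32_t off_cpu_us;
};

/* the whole struct travels through the hashmap as a single long value */
static long pack_stats(struct task_stats st)
{
	union { struct task_stats st; long opaque; } v = { .st = st };

	static_assert(sizeof(struct task_stats) <= sizeof(long),
		      "task_stats must fit into the hashmap value slot");
	return v.opaque;
}

static struct task_stats unpack_stats(long opaque)
{
	union { struct task_stats st; long opaque; } v = { .opaque = opaque };

	return v.st;
}
```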
+
 /* Receive events from the ring buffer. */
 static int event_handler(void *_ctx, void *data, size_t size)
 {
-	struct stacktrace_event *event = data;
+	struct wprof_event *e = data;
+	const char *status;
+	unsigned long key = task_id(e->tgid, e->cpu_id);
+	union {
+		long opaque;
+		struct task_stats stats;
+	} v;
+
+	if (!hashmap__find(stats, key, &v.opaque))
+		v.opaque = 0;
+
+	switch (e->kind) {
+	case EV_ON_CPU:
+		status = "ONCPU";
+		v.stats.on_cpu_us += e->duration_ns / 1000;
+		break;
+	case EV_OFF_CPU:
+		status = "OFFCPU";
+		v.stats.off_cpu_us += e->duration_ns / 1000;
+		break;
+	case EV_TIMER:
+		status = "TIMER";
+		v.stats.on_cpu_us += e->duration_ns / 1000;
+		break;
+	default:
+		status = "UNKNOWN";
+		break;
+	}
 
-	if (event->kstack_sz <= 0 && event->ustack_sz <= 0)
-		return 1;
+	hashmap__set(stats, key, v.opaque, NULL, NULL);
 
-	printf("COMM: %s (pid=%d) @ CPU %d\n", event->comm, event->pid, event->cpu_id);
+	if (!verbose)
+		return 0;
 
-	if (event->kstack_sz > 0) {
+	printf("%s (%d/%d) @ CPU %d %s %lldus\n", e->comm, e->pid, e->tgid, e->cpu_id,
+	       status, e->duration_ns / 1000);
+
+	if (e->kstack_sz <= 0 && e->ustack_sz <= 0)
+		return 1;
+
+	if (e->kstack_sz > 0) {
 		printf("Kernel:\n");
-		show_stack_trace(event->kstack, event->kstack_sz / sizeof(__u64), 0);
+		show_stack_trace(e->kstack, e->kstack_sz / sizeof(__u64), 0);
 	} else {
 		printf("No Kernel Stack\n");
 	}
 
-	if (event->ustack_sz > 0) {
+	if (e->ustack_sz > 0) {
 		printf("Userspace:\n");
-		show_stack_trace(event->ustack, event->ustack_sz / sizeof(__u64), event->pid);
+		show_stack_trace(e->ustack, e->ustack_sz / sizeof(__u64), e->pid);
 	} else {
 		printf("No Userspace Stack\n");
 	}
 
 	printf("\n");
+
 	return 0;
 }
 
+static __u64 ktime_off;
+
+static inline uint64_t timespec_to_ns(struct timespec *ts)
+{
+	return ts->tv_sec * 1000000000ULL + ts->tv_nsec;
+}
+
+static void calibrate_ktime(void)
+{
+	int i;
+	struct timespec t1, t2, t3;
+	uint64_t best_delta = 0, delta, ts;
+
+	for (i = 0; i < 10; i++) {
+		clock_gettime(CLOCK_REALTIME, &t1);
+		clock_gettime(CLOCK_MONOTONIC, &t2);
+		clock_gettime(CLOCK_REALTIME, &t3);
+
+		delta = timespec_to_ns(&t3) - timespec_to_ns(&t1);
+		ts = (timespec_to_ns(&t3) + timespec_to_ns(&t1)) / 2;
+
+		if (i == 0 || delta < best_delta) {
+			best_delta = delta;
+			ktime_off = ts - timespec_to_ns(&t2);
+		}
+	}
+}
+
+static __u64 ktime_now_ns()
+{
+	struct timespec t;
+
+	clock_gettime(CLOCK_MONOTONIC, &t);
+
+	return timespec_to_ns(&t);
+}
+
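calibrate_ktime() estimates the offset between CLOCK_REALTIME and CLOCK_MONOTONIC by bracketing a monotonic reading between two realtime readings and keeping the tightest bracket. The diff stores the result in ktime_off but does not use it yet; the sketch below is only an illustrative guess at how it could be applied to turn bpf_ktime_get_ns() timestamps (CLOCK_MONOTONIC) into approximate wall-clock times (`mono_to_realtime_ns` and `print_event_walltime` are made-up names):

```c
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* BPF's bpf_ktime_get_ns() is CLOCK_MONOTONIC; adding the calibrated offset
 * yields an approximate CLOCK_REALTIME timestamp */
static uint64_t mono_to_realtime_ns(uint64_t mono_ns, uint64_t ktime_off)
{
	return mono_ns + ktime_off;
}

static void print_event_walltime(uint64_t event_ts, uint64_t ktime_off)
{
	time_t sec = mono_to_realtime_ns(event_ts, ktime_off) / 1000000000ULL;
	struct tm tm;

	localtime_r(&sec, &tm);
	printf("%02d:%02d:%02d\n", tm.tm_hour, tm.tm_min, tm.tm_sec);
}
```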
 
 static void show_help(const char *progname)
 {
 	printf("Usage: %s [-f <freq>] [-h]\n", progname);
@@ -139,7 +262,7 @@
 int main(int argc, char *const argv[])
 {
 	const char *online_cpus_file = "/sys/devices/system/cpu/online";
-	int freq = 1, pid = -1, cpu;
+	int freq = 1, pid = -1, cpu = -1;
 	struct profile_bpf *skel = NULL;
 	struct perf_event_attr attr;
 	struct bpf_link **links = NULL;
@@ -148,15 +271,32 @@ int main(int argc, char *const argv[])
 	int *pefds = NULL, pefd;
 	int argp, i, err = 0;
 	bool *online_mask = NULL;
+	struct itimerval timer_ival;
 
-	while ((argp = getopt(argc, argv, "hf:")) != -1) {
+	while ((argp = getopt(argc, argv, "hvf:p:c:")) != -1) {
 		switch (argp) {
+		case 'v':
+			verbose = true;
+			break;
 		case 'f':
			freq = atoi(optarg);
 			if (freq < 1)
 				freq = 1;
 			break;
-
+		case 'p':
+			pid = atoi(optarg);
+			if (pid < 0) {
+				fprintf(stderr, "couldn't parse PID\n");
+				return 1;
+			}
+			break;
+		case 'c':
+			cpu = atoi(optarg);
+			if (cpu < 0) {
+				fprintf(stderr, "couldn't parse CPU ID\n");
+				return 1;
+			}
+			break;
 		case 'h':
 		default:
 			show_help(argv[0]);
@@ -164,6 +304,8 @@ int main(int argc, char *const argv[])
 		}
 	}
 
+	stats = hashmap__new(hash_identity_fn, hash_equal_fn, NULL);
+
 	err = parse_cpu_mask_file(online_cpus_file, &online_mask, &num_online_cpus);
 	if (err) {
 		fprintf(stderr, "Fail to get online CPU numbers: %d\n", err);
@@ -177,13 +319,24 @@ int main(int argc, char *const argv[])
 		goto cleanup;
 	}
 
-	skel = profile_bpf__open_and_load();
+	calibrate_ktime();
+
+	skel = profile_bpf__open();
 	if (!skel) {
 		fprintf(stderr, "Fail to open and load BPF skeleton\n");
 		err = -1;
 		goto cleanup;
 	}
 
+	if (cpu >= 0)
+		skel->rodata->cpu_id = cpu;
+
+	err = profile_bpf__load(skel);
+	if (err) {
+		fprintf(stderr, "Fail to load BPF skeleton: %d\n", err);
+		goto cleanup;
+	}
+
 	symbolizer = blaze_symbolizer_new();
 	if (!symbolizer) {
 		fprintf(stderr, "Fail to create a symbolizer\n");
@@ -192,7 +345,7 @@ int main(int argc, char *const argv[])
 	}
 
 	/* Prepare ring buffer to receive events from the BPF program. */
-	ring_buf = ring_buffer__new(bpf_map__fd(skel->maps.events), event_handler, NULL, NULL);
+	ring_buf = ring_buffer__new(bpf_map__fd(skel->maps.rb), event_handler, NULL, NULL);
 	if (!ring_buf) {
 		err = -1;
 		goto cleanup;
 	}
@@ -206,9 +359,9 @@ int main(int argc, char *const argv[])
 	links = calloc(num_cpus, sizeof(struct bpf_link *));
 
 	memset(&attr, 0, sizeof(attr));
-	attr.type = PERF_TYPE_HARDWARE;
 	attr.size = sizeof(attr);
-	attr.config = PERF_COUNT_HW_CPU_CYCLES;
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_CPU_CLOCK;
 	attr.sample_freq = freq;
 	attr.freq = 1;
 
@@ -218,7 +371,7 @@ int main(int argc, char *const argv[])
 			continue;
 
 		/* Set up performance monitoring on a CPU/Core */
-		pefd = perf_event_open(&attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
+		pefd = perf_event_open(&attr, -1, cpu, -1, PERF_FLAG_FD_CLOEXEC);
 		if (pefd < 0) {
 			fprintf(stderr, "Fail to set up performance monitor on a CPU/Core\n");
 			err = -1;
@@ -227,15 +380,34 @@ int main(int argc, char *const argv[])
 		pefds[cpu] = pefd;
 
 		/* Attach a BPF program on a CPU */
-		links[cpu] = bpf_program__attach_perf_event(skel->progs.profile, pefd);
+		links[cpu] = bpf_program__attach_perf_event(skel->progs.wprof_tick, pefd);
 		if (!links[cpu]) {
 			err = -1;
 			goto cleanup;
 		}
 	}
 
+	err = profile_bpf__attach(skel);
+	if (err) {
+		fprintf(stderr, "Failed to attach skeleton: %d\n", err);
+		goto cleanup;
+	}
+
+	skel->bss->session_start_ts = ktime_now_ns();
+
+	signal(SIGALRM, sig_timer);
+
+	timer_ival.it_value.tv_sec = STATS_PERIOD_MS / 1000;
+	timer_ival.it_value.tv_usec = STATS_PERIOD_MS * 1000 % 1000000;
+	timer_ival.it_interval = timer_ival.it_value;
+	err = setitimer(ITIMER_REAL, &timer_ival, NULL);
+	if (err < 0) {
+		fprintf(stderr, "Failed to setup stats timer: %d\n", err);
+		goto cleanup;
+	}
+
 	/* Wait and receive stack traces */
-	while (ring_buffer__poll(ring_buf, -1) >= 0) {
+	while ((err = ring_buffer__poll(ring_buf, -1)) >= 0 || err == -EINTR) {
 	}
 
 cleanup:
diff --git a/examples/c/profile.h b/examples/c/profile.h
index cb72f38c..5ca662f0 100644
--- a/examples/c/profile.h
+++ b/examples/c/profile.h
@@ -11,12 +11,33 @@
 #define MAX_STACK_DEPTH 128
 #endif
 
+#define MAX_THREAD_CNT 4096
+
+#define RINGBUF_SZ (4 * 1024 * 1024)
+
+enum task_status {
+	STATUS_ON_CPU,
+	STATUS_OFF_CPU,
+};
+
+enum event_kind {
+	EV_ON_CPU,
+	EV_OFF_CPU,
+	EV_TIMER,
+};
+
 typedef __u64 stack_trace_t[MAX_STACK_DEPTH];
 
-struct stacktrace_event {
-	__u32 pid;
+struct wprof_event {
+	enum event_kind kind;
 	__u32 cpu_id;
+	__u64 ts;
+	__u32 pid;
+	__u32 tgid;
 	char comm[TASK_COMM_LEN];
+
+	__u64 duration_ns;
+
 	__s32 kstack_sz;
 	__s32 ustack_sz;
 	stack_trace_t kstack;
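The RINGBUF_SZ and wprof_event definitions above imply a buffering budget that is easy to sanity-check. The numbers below are illustrative, assume a 64-bit build, and ignore the ring buffer's small per-record header: with MAX_STACK_DEPTH = 128, the two stack_trace_t arrays are 2 * 128 * 8 = 2048 bytes, so each event is a bit over 2 KiB and a 4 MiB ring buffer holds roughly 2000 events between userspace drains:

```c
#include <stdio.h>
#include <linux/types.h>
#include "profile.h"

int main(void)
{
	/* dominated by the two embedded stack_trace_t arrays */
	printf("sizeof(struct wprof_event) = %zu bytes\n", sizeof(struct wprof_event));
	printf("~%zu events fit in RINGBUF_SZ\n",
	       (size_t)RINGBUF_SZ / sizeof(struct wprof_event));
	return 0;
}
```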
diff --git a/examples/c/sim.bpf.c b/examples/c/sim.bpf.c
new file mode 100644
index 00000000..2963bdab
--- /dev/null
+++ b/examples/c/sim.bpf.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bootstrap.h"
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 8192);
+	__type(key, pid_t);
+	__type(value, u64);
+} exec_start SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, 256 * 1024);
+} rb SEC(".maps");
+
+const volatile unsigned long long min_duration_ns = 0;
+
+SEC("tp/sched/sched_process_exec")
+int handle_exec(struct trace_event_raw_sched_process_exec *ctx)
+{
+	struct task_struct *task;
+	unsigned fname_off;
+	struct event *e;
+	pid_t pid;
+	u64 ts;
+
+	/* remember time exec() was executed for this PID */
+	pid = bpf_get_current_pid_tgid() >> 32;
+	ts = bpf_ktime_get_ns();
+	bpf_map_update_elem(&exec_start, &pid, &ts, BPF_ANY);
+
+	/* don't emit exec events when minimum duration is specified */
+	if (min_duration_ns)
+		return 0;
+
+	/* reserve sample from BPF ringbuf */
+	e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+	if (!e)
+		return 0;
+
+	/* fill out the sample with data */
+	task = (struct task_struct *)bpf_get_current_task();
+
+	e->exit_event = false;
+	e->pid = pid;
+	e->ppid = BPF_CORE_READ(task, real_parent, tgid);
+	bpf_get_current_comm(&e->comm, sizeof(e->comm));
+
+	fname_off = ctx->__data_loc_filename & 0xFFFF;
+	bpf_probe_read_str(&e->filename, sizeof(e->filename), (void *)ctx + fname_off);
+
+	/* successfully submit it to user-space for post-processing */
+	bpf_ringbuf_submit(e, 0);
+	return 0;
+}
+
+SEC("tp/sched/sched_process_exit")
+int handle_exit(struct trace_event_raw_sched_process_template *ctx)
+{
+	struct task_struct *task;
+	struct event *e;
+	pid_t pid, tid;
+	u64 id, ts, *start_ts, duration_ns = 0;
+
+	/* get PID and TID of exiting thread/process */
+	id = bpf_get_current_pid_tgid();
+	pid = id >> 32;
+	tid = (u32)id;
+
+	/* ignore thread exits */
+	if (pid != tid)
+		return 0;
+
+	/* if we recorded start of the process, calculate lifetime duration */
+	start_ts = bpf_map_lookup_elem(&exec_start, &pid);
+	if (start_ts)
+		duration_ns = bpf_ktime_get_ns() - *start_ts;
+	else if (min_duration_ns)
+		return 0;
+	bpf_map_delete_elem(&exec_start, &pid);
+
+	/* if process didn't live long enough, return early */
+	if (min_duration_ns && duration_ns < min_duration_ns)
+		return 0;
+
+	/* reserve sample from BPF ringbuf */
+	e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+	if (!e)
+		return 0;
+
+	/* fill out the sample with data */
+	task = (struct task_struct *)bpf_get_current_task();
+
+	e->exit_event = true;
+	e->duration_ns = duration_ns;
+	e->pid = pid;
+	e->ppid = BPF_CORE_READ(task, real_parent, tgid);
+	e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff;
+	bpf_get_current_comm(&e->comm, sizeof(e->comm));
+
+	/* send data to user-space for post-processing */
+	bpf_ringbuf_submit(e, 0);
+	return 0;
+}
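handle_exit() extracts the exit status with `(exit_code >> 8) & 0xff` because the kernel packs task->exit_code the same way a wait(2) status is packed: the low bits carry the fatal signal number and the next byte carries the value passed to exit(). A tiny illustration (not part of the diff):

```c
#include <stdio.h>

int main(void)
{
	int exit_code = (42 << 8) | 0;	/* process called exit(42), no fatal signal */

	printf("status = %d\n", (exit_code >> 8) & 0xff);	/* 42, like WEXITSTATUS() */
	printf("signal = %d\n", exit_code & 0x7f);		/* 0 */
	return 0;
}
```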
diff --git a/examples/c/sim.c b/examples/c/sim.c
new file mode 100644
index 00000000..c1b05876
--- /dev/null
+++ b/examples/c/sim.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (c) 2020 Facebook */
+#include <argp.h>
+#include <errno.h>
+#include <math.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+static struct env {
+	bool verbose;
+	long on_dur_ms;
+	long off_dur_ms;
+} env;
+
+const char *argp_program_version = "sim 0.0";
+const char *argp_program_bug_address = "";
+const char argp_program_doc[] = "On/off CPU workload simulator.\n";
+
+static const struct argp_option opts[] = {
+	{ "verbose", 'v', NULL, 0, "Verbose debug output" },
+	{ "on-duration", 'd', "DURATION-MS", 0, "Time spent burning CPU" },
+	{ "off-duration", 'D', "DURATION-MS", 0, "Time spent sleeping" },
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case 'v':
+		env.verbose = true;
+		break;
+	case 'd':
+		errno = 0;
+		env.on_dur_ms = strtol(arg, NULL, 10);
+		if (errno || env.on_dur_ms < 0) {
+			fprintf(stderr, "Invalid --on-duration: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 'D':
+		errno = 0;
+		env.off_dur_ms = strtol(arg, NULL, 10);
+		if (errno || env.off_dur_ms < 0) {
+			fprintf(stderr, "Invalid --off-duration: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case ARGP_KEY_ARG:
+		argp_usage(state);
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+static const struct argp argp = {
+	.options = opts,
+	.parser = parse_arg,
+	.doc = argp_program_doc,
+};
+
+static volatile bool exiting = false;
+
+static void sig_handler(int sig)
+{
+	exiting = true;
+}
+
+static inline uint64_t timespec_to_ns(struct timespec *ts)
+{
+	return ts->tv_sec * 1000000000ULL + ts->tv_nsec;
+}
+
+static uint64_t now_ns()
+{
+	struct timespec t;
+
+	clock_gettime(CLOCK_MONOTONIC, &t);
+
+	return timespec_to_ns(&t);
+}
+
+int main(int argc, char **argv)
+{
+	int err;
+
+	/* Parse command line arguments */
+	err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+	if (err)
+		return err;
+
+	/* Cleaner handling of Ctrl-C */
+	signal(SIGINT, sig_handler);
+	signal(SIGTERM, sig_handler);
+
+	while (!exiting) {
+		uint64_t start_ts;
+		volatile double sink = 1.0;
+		const int iter_num = 100000;
+		int i;
+
+		if (env.on_dur_ms) {
+			start_ts = now_ns();
+			do {
+				for (i = 0; i < iter_num; i++) {
+					sink = sqrt(sink * sink);
+				}
+			} while (now_ns() - start_ts < env.on_dur_ms * 1000000ULL);
+		}
+		if (env.off_dur_ms) {
+			start_ts = now_ns();
+			do {
+				usleep(100000);
+			} while (now_ns() - start_ts < env.off_dur_ms * 1000000ULL);
+		}
+	}
+
+	return err < 0 ? -err : 0;
+}