Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
9cd3145
Import Nick's code
r1viollet Sep 22, 2022
8e567d9
Add a small test for dwarf unwinding
r1viollet Sep 22, 2022
050df2d
Compare between remote vs local unwinding
r1viollet Sep 26, 2022
8b097fc
Minor update on symbols
r1viollet Sep 26, 2022
7c7ca0a
wip
r1viollet Nov 16, 2022
d165250
Merge branch 'main' of github.com:DataDog/ddprof into r1viollet/nick_…
r1viollet Nov 16, 2022
a295903
WIP
r1viollet Nov 16, 2022
e81e633
Refactor and add a test on async profiler unwinding
r1viollet Nov 17, 2022
ed168f4
Minor notes on steps to take to improve unwinding
r1viollet Nov 17, 2022
fcda4a5
Create a remote unwinding test
r1viollet Nov 18, 2022
e4cff89
Add an async prof library
r1viollet Nov 19, 2022
e61cb47
Hacky version using the async profiler's unwinding
r1viollet Nov 19, 2022
78e0957
Adding a benchmark for the async profiler
r1viollet Nov 21, 2022
c0047e0
Prevent tail call optimisation
r1viollet Nov 22, 2022
6af2003
Minor change in comment
r1viollet Nov 23, 2022
15f3093
Remove the save context from the benchmark operation
r1viollet Nov 25, 2022
81b9d2d
Minor env fixes
r1viollet Nov 29, 2022
f562ce9
Minor fix for zsh
r1viollet Nov 29, 2022
e8989ab
Version allowing to run async profiler with ddprof
r1viollet Nov 30, 2022
6765810
Add a small loader tool to compare async profiler's load vs remote load.
r1viollet Dec 1, 2022
cfd7bd5
Work in progress
r1viollet Jan 4, 2023
a5cbca1
Async profiler
r1viollet Jan 6, 2023
495d0d3
Adjust the offset to the eh_frame section
r1viollet Jan 6, 2023
e8ce231
Add the show frames to debug unwinding issues
r1viollet Jan 9, 2023
a589266
Adjust max unwinding depth
r1viollet Jan 9, 2023
5554831
Add in binary information
r1viollet Jan 10, 2023
9d38887
Implement a basic red zone optim fix
r1viollet Jan 10, 2023
82f40ad
Lost sample throttling
r1viollet Jan 11, 2023
3052627
Add pthread to the async profiler compilation
r1viollet Jan 18, 2023
4b394f7
Ensure in whole host we are able to run using the /proc/<pid>/root li…
r1viollet Jan 18, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 18 additions & 35 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,19 @@ add_subdirectory(src/event_parser)
# elfutils
include(Findelfutils)

# -- Async profiler --
# Static, position-independent library built from every source file found in
# src/async-profiler. Consumers should link it through the DDProf::AsyncProf
# alias; its public include paths (include/async-profiler and include) are
# propagated to them.
set(ASYNC_PROFILER_LIB_DIR ${CMAKE_SOURCE_DIR})
set(ASYNC_PROFILER_SRC_DIR ${ASYNC_PROFILER_LIB_DIR}/src/async-profiler)
set(ASYNC_PROFILER_LIB_INCLUDE ${ASYNC_PROFILER_LIB_DIR}/include/async-profiler)

# Collect the sources of the directory into ASYNC_PROFILER_SOURCES.
aux_source_directory(${ASYNC_PROFILER_SRC_DIR} ASYNC_PROFILER_SOURCES)

add_library(async_prof_lib STATIC ${ASYNC_PROFILER_SOURCES})
add_library(DDProf::AsyncProf ALIAS async_prof_lib)

set_target_properties(async_prof_lib PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(
  async_prof_lib
  PUBLIC ${ASYNC_PROFILER_LIB_INCLUDE}
         ${CMAKE_SOURCE_DIR}/include)
target_link_libraries(
  async_prof_lib
  PRIVATE dw
          elf
          Threads::Threads)
# -------------------

# ---- Static analysis ----
include(ClangTidy)
include(Format)
Expand Down Expand Up @@ -145,7 +158,8 @@ aux_source_directory(src/exe EXE_SRC)
# Define all sources
set(DDPROF_GLOBAL_SRC ${COMMON_SRC} ${PPROF_SRC} ${EXPORTER_SRC} ${EXE_SRC})

set(DDPROF_LIBRARY_LIST DDProf::Parser llvm-demangle ${ELFUTILS_LIBRARIES} Threads::Threads)
set(DDPROF_LIBRARY_LIST DDProf::Parser DDProf::AsyncProf llvm-demangle ${ELFUTILS_LIBRARIES}
Threads::Threads)

if(ON)
# Add the rust library - Refactoring ongoing. OFF for now
Expand Down Expand Up @@ -378,42 +392,11 @@ install(
ARCHIVE DESTINATION ddprof/lib
PUBLIC_HEADER DESTINATION ddprof/include)

# ---- Declaration of native library ----
# When BUILD_NATIVE_LIB is enabled, builds the `ddprof-native` library (alias
# DDProf::Native) from the common sources plus src/lib/ddprof_output.cc, compiled
# with the DDPROF_NATIVE_LIB definition. The optional accuracy test is only
# offered when the library itself is built.
option(BUILD_NATIVE_LIB "Build a library out of the native profiler" ON)
if(${BUILD_NATIVE_LIB})

  # Define all sources
  set(DDPROF_LIB_SRC ${COMMON_SRC} src/lib/ddprof_output.cc)

  # Libs to link. libcap comes last; it can be removed from the version
  # distributed to client.
  set(NATIVE_LIB_LIBRARY_LIST DDProf::Parser llvm-demangle ${ELFUTILS_LIBRARIES} Threads::Threads
                              libcap)

  # The allocator, when selected, goes first in the list.
  if("${DDPROF_ALLOCATOR}" STREQUAL "JEMALLOC")
    list(PREPEND NATIVE_LIB_LIBRARY_LIST jemalloc)
  endif()

  # Create the lib and its namespaced alias
  add_library(ddprof-native ${DDPROF_LIB_SRC})
  add_library(DDProf::Native ALIAS ddprof-native)

  set_target_properties(ddprof-native PROPERTIES VERSION ${PROJECT_VERSION}
                                                 COMPILE_DEFINITIONS DDPROF_NATIVE_LIB)

  target_include_directories(ddprof-native PRIVATE ${DDPROF_INCLUDE_LIST})
  target_link_libraries(ddprof-native PRIVATE ${NATIVE_LIB_LIBRARY_LIST})

  # Accuracy test (off by default)
  option(ACCURACY_TEST "Enable accuracy test" OFF)
  if(${ACCURACY_TEST})
    add_subdirectory(test/self_unwind)
  endif()
endif()

# ---- Unit tests ----

# Async-profiler sources and include directory made available to the unit tests.
# They point at the same directories as the async_prof_lib target, so reuse the
# values computed in the "Async profiler" section instead of scanning
# src/async-profiler a second time and spelling the include path twice.
set(ASYNC_PROFILER_SRC ${ASYNC_PROFILER_SOURCES})
set(ASYNC_PROFILER_INCLUDE ${ASYNC_PROFILER_LIB_INCLUDE})

# Unit tests Add infrastructure for enabling tests
option(BUILD_DDPROF_TESTING "Enable tests" ON)
if(${BUILD_DDPROF_TESTING})
Expand Down
87 changes: 87 additions & 0 deletions design_notes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
## things by pid
#################
DsoHdr
-- DSO

Lookups
-- pid / bin maps
-- mappings
-- runtime

DwflHdr
-- DwflWrapper
-- Visited pids

## Not by pid
#############
-- File info (dso_hdr)
-- Lookup
Dwarf symbol lookup
-- Symbol table

## Async profiler

parseLibraries parses everything in /proc/self.

### Step 1 -- ensure we can have one code array per PID

### Step 2 -- ensure symbols are shared across PIDs

### Step 3 -- ensure unwinding tables are shared across PIDs


### Junk notes
Write an API that can work with ddprof object model

LoadSymbolTable loads at a given base address
We want to store all symbols at an elf address

1) Rewrite parseLibraries <Hard>
we can use DSO information + file info
Ensure the cache is at elf address (not base)
Start is 0 or for non PIE,


2) Find library is by absolute address
- Keep that ?


### Issues

- We are moving from a lazy to an absolute load
- We don't have enough tests

- We are not considering ELF versions (though do we care?)
- read past sp --> check with



### Async profiler load in symbols_linux

--> Create code cache
library name
Index --> count of lib
image base --> start
image end --> end

--> Parse program headers
text_base --> set as base
---> parseDynamicSection
global offset table --> Example (careful: the addresses are absolute)
GOT start == 0x5555556226b0
GOT start == 0x7ffff7ffd018

relocation -> relent size of relocation entry
!We already have ways to parse GOT, we don't care

---> parseDwarfInfo
parseDwarfInfo()
looks like the values are relative (wouhou)
then we set the dwarf table (so nothing to change)

-> VDSO
--> parse memory


// TODO tree of life

149 changes: 149 additions & 0 deletions include/async-profiler/arch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
/*
* Copyright 2017 Andrei Pangin
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef _ARCH_H
#define _ARCH_H

// Short aliases for the unsigned integer types used by the async-profiler code.
// NOTE(review): the names suggest 8/16/32/64-bit widths; that holds on the
// usual data models but is not enforced here — confirm for new targets.
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long long u64;

// Atomically adds `increment` (1 by default) to `var` and returns the value
// `var` held before the addition.
static inline u64 atomicInc(volatile u64 &var, u64 increment = 1) {
  const u64 previous = __sync_fetch_and_add(&var, increment);
  return previous;
}

// Atomically adds `increment` (1 by default) to `var` and returns the value
// `var` held before the addition.
static inline int atomicInc(volatile int &var, int increment = 1) {
  const int previous = __sync_fetch_and_add(&var, increment);
  return previous;
}

// Atomically reads `var` with acquire memory ordering and returns the value.
static inline u64 loadAcquire(u64 &var) {
  const u64 observed = __atomic_load_n(&var, __ATOMIC_ACQUIRE);
  return observed;
}

// Atomically writes `value` into `var` with release memory ordering.
static inline void storeRelease(u64 &var, u64 value) {
  __atomic_store_n(&var, value, __ATOMIC_RELEASE);
}

#if defined(__x86_64__) || defined(__i386__)

// Settings selected when compiling for x86-64 or i386.
// Instructions are addressed byte by byte on this branch.
typedef unsigned char instruction_t;
// presumably the one-byte INT3 opcode — confirm
const instruction_t BREAKPOINT = 0xcc;
const int BREAKPOINT_OFFSET = 0;

// NOTE(review): the meaning of the constants below is defined by the code that
// uses them, not by this header; only PERF_REG_PC names its origin.
const int SYSCALL_SIZE = 2;
const int FRAME_PC_SLOT = 1;
const int ADJUST_RET = 1;
const int PLT_HEADER_SIZE = 16;
const int PLT_ENTRY_SIZE = 16;
const int PERF_REG_PC = 8; // PERF_REG_X86_IP

// spinPause(): emits the `pause` instruction.
# define spinPause() asm volatile("pause")
// rmb(): emits `lfence`; the "memory" clobber also stops the compiler from
// reordering memory accesses across it.
# define rmb() asm volatile("lfence" : : : "memory")
// flushCache(addr): `clflush` on `addr`, surrounded by `mfence` on both sides.
# define flushCache(addr) \
asm volatile("mfence; clflush (%0); mfence" : : "r"(addr) : "memory")

#elif defined(__arm__) || defined(__thumb__)

// Settings selected when compiling for 32-bit ARM (ARM or Thumb mode).
typedef unsigned int instruction_t;
// Separate breakpoint encodings are provided for ARM and Thumb code.
const instruction_t BREAKPOINT = 0xe7f001f0;
const instruction_t BREAKPOINT_THUMB = 0xde01de01;
const int BREAKPOINT_OFFSET = 0;

// NOTE(review): the meaning of the constants below is defined by the code that
// uses them, not by this header; only PERF_REG_PC names its origin.
const int SYSCALL_SIZE = sizeof(instruction_t);
const int FRAME_PC_SLOT = 1;
const int ADJUST_RET = 0;
const int PLT_HEADER_SIZE = 20;
const int PLT_ENTRY_SIZE = 12;
const int PERF_REG_PC = 15; // PERF_REG_ARM_PC

// spinPause(): emits the `yield` instruction.
# define spinPause() asm volatile("yield")
// rmb(): emits `dmb ish`; the "memory" clobber also stops the compiler from
// reordering memory accesses across it.
# define rmb() asm volatile("dmb ish" : : : "memory")
// flushCache(addr): clears the instruction cache for the
// sizeof(instruction_t) bytes starting at `addr`.
# define flushCache(addr) \
__builtin___clear_cache((char *)(addr), \
(char *)(addr) + sizeof(instruction_t))

#elif defined(__aarch64__)

// Settings selected when compiling for AArch64.
typedef unsigned int instruction_t;
const instruction_t BREAKPOINT = 0xd4200000;
const int BREAKPOINT_OFFSET = 0;

// NOTE(review): the meaning of the constants below is defined by the code that
// uses them, not by this header; only PERF_REG_PC names its origin.
const int SYSCALL_SIZE = sizeof(instruction_t);
const int FRAME_PC_SLOT = 1;
const int ADJUST_RET = 0;
const int PLT_HEADER_SIZE = 32;
const int PLT_ENTRY_SIZE = 16;
const int PERF_REG_PC = 32; // PERF_REG_ARM64_PC

// spinPause(): emits the `isb` instruction.
# define spinPause() asm volatile("isb")
// rmb(): emits `dmb ish`; the "memory" clobber also stops the compiler from
// reordering memory accesses across it.
# define rmb() asm volatile("dmb ish" : : : "memory")
// flushCache(addr): clears the instruction cache for the
// sizeof(instruction_t) bytes starting at `addr`.
# define flushCache(addr) \
__builtin___clear_cache((char *)(addr), \
(char *)(addr) + sizeof(instruction_t))

#elif defined(__PPC64__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)

// Settings selected when compiling for little-endian 64-bit PowerPC.
typedef unsigned int instruction_t;
const instruction_t BREAKPOINT = 0x7fe00008;
// We place the break point in the third instruction slot on PPCLE as the first
// two are skipped if the call comes from within the same compilation unit
// according to the LE ABI.
const int BREAKPOINT_OFFSET = 8;

// NOTE(review): the meaning of the constants below is defined by the code that
// uses them, not by this header; only PERF_REG_PC names its origin.
const int SYSCALL_SIZE = sizeof(instruction_t);
const int FRAME_PC_SLOT = 2;
const int ADJUST_RET = 0;
const int PLT_HEADER_SIZE = 24;
const int PLT_ENTRY_SIZE = 24;
const int PERF_REG_PC = 32; // PERF_REG_POWERPC_NIP

// spinPause(): emits the `yield` mnemonic (see the trailing note on why).
# define spinPause() \
asm volatile("yield") // does nothing, but using or 1,1,1 would lead to
// other problems
// rmb(): emits a full `sync`; the "memory" clobber also stops the compiler
// from reordering memory accesses across it.
# define rmb() \
asm volatile("sync" \
: \
: \
: "memory") // lwsync would do but better safe than sorry
// flushCache(addr): clears the instruction cache for the
// sizeof(instruction_t) bytes starting at `addr`.
# define flushCache(addr) \
__builtin___clear_cache((char *)(addr), \
(char *)(addr) + sizeof(instruction_t))

#else

# error "Compiling on unsupported arch"

#endif

// Return address signing support.
// Apple M1 has 47 bit virtual addresses.
// On Apple AArch64 builds ADDRESS_BITS is defined as 47 and WX_MEMORY is true;
// on every other target ADDRESS_BITS stays undefined and WX_MEMORY is false.
#if defined(__aarch64__) && defined(__APPLE__)
# define ADDRESS_BITS 47
# define WX_MEMORY true
#else
# define WX_MEMORY false
#endif

#ifdef ADDRESS_BITS
// Returns `p` with every bit at position ADDRESS_BITS and above cleared, i.e.
// only the low ADDRESS_BITS bits of the address are kept.
static inline const void *stripPointer(const void *p) {
  const unsigned long address_mask = (1UL << ADDRESS_BITS) - 1;
  const unsigned long raw_address = (unsigned long)p;
  return (const void *)(raw_address & address_mask);
}
#else
// ADDRESS_BITS is not defined: the pointer is used unchanged.
# define stripPointer(p) (p)
#endif

#endif // _ARCH_H
Loading