Skip to content

Commit e1d0097

Browse files
authored
Be more vocal about traces without the proper CPU configuration (#3928)
* Yell louder about improperly configured CPUs. * Annotate traces when the CPU is not configured correctly but users force recording anyways.
1 parent 5d72fce commit e1d0097

8 files changed

+64
-2
lines changed

src/PerfCounters.cc

+6
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ namespace rr {
4444
#define PERF_COUNT_RR 0x72727272L
4545

4646
static bool attributes_initialized;
47+
static bool cpu_improperly_configured;
4748
// At some point we might support multiple kinds of ticks for the same CPU arch.
4849
// At that point this will need to become more complicated.
4950
struct perf_event_attrs {
@@ -512,6 +513,7 @@ static void check_working_counters(perf_event_attrs &perf_attr) {
512513
// Returns the ticks minimum period.
513514
static uint32_t check_for_bugs(perf_event_attrs &perf_attr) {
514515
DEBUG_ASSERT(!running_under_rr());
516+
cpu_improperly_configured = false;
515517

516518
uint32_t min_period = check_for_ioc_period_bug(perf_attr);
517519
check_working_counters(perf_attr);
@@ -836,6 +838,10 @@ void PerfCounters::start_pt_copy_thread() {
836838
}
837839
}
838840

841+
bool PerfCounters::improperly_configured() {
842+
return cpu_improperly_configured;
843+
}
844+
839845
// See https://github.com/intel/libipt/blob/master/doc/howto_capture.md
840846
void PerfCounters::PTState::open(pid_t tid) {
841847
static uint32_t event_type = pt_event_type();

src/PerfCounters.h

+6
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,12 @@ class PerfCounters {
177177
*/
178178
static void start_pt_copy_thread();
179179

180+
/**
181+
* Returns true iff we detected issues with performance counter configuration
182+
* and wanted to die but the user forced continuation.
183+
*/
184+
static bool improperly_configured();
185+
180186
/**
181187
* Try to use BPF to accelerate async signal processing
182188
*/

src/PerfCounters_x86.h

+8-2
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,8 @@ static void check_for_xen_pmi_bug(const perf_event_attrs &perf_attr) {
322322
"virtualization bug.\n"
323323
"Aborting. Retry with -F to override, but it will probably\n"
324324
"fail.";
325+
} else {
326+
cpu_improperly_configured = true;
325327
}
326328
}
327329
}
@@ -348,11 +350,13 @@ static void check_for_zen_speclockmap() {
348350
LOG(debug) << "SpecLockMap is disabled";
349351
} else {
350352
LOG(debug) << "SpecLockMap is not disabled";
351-
if (!Flags::get().suppress_environment_warnings) {
353+
if (!Flags::get().force_things) {
352354
fprintf(stderr,
353355
"On Zen CPUs, rr will not work reliably unless you disable the "
354356
"hardware SpecLockMap optimization.\nFor instructions on how to "
355357
"do this, see https://github.com/rr-debugger/rr/wiki/Zen\n");
358+
} else {
359+
cpu_improperly_configured = true;
356360
}
357361
}
358362
}
@@ -376,7 +380,7 @@ static void check_for_freeze_on_smi() {
376380
LOG(debug) << "freeze_on_smi is set";
377381
} else if (freeze_on_smi == '0') {
378382
LOG(warn) << "freeze_on_smi is not set";
379-
if (!Flags::get().suppress_environment_warnings) {
383+
if (!Flags::get().force_things) {
380384
fprintf(stderr,
381385
"Freezing performance counters on SMIs should be enabled for maximum rr\n"
382386
"reliability on Comet Lake and later CPUs. To manually enable this setting, run\n"
@@ -386,6 +390,8 @@ static void check_for_freeze_on_smi() {
386390
"to automatically apply this setting on every reboot.\n"
387391
"See 'man 5 sysfs', 'man 5 tmpfiles.d'.\n"
388392
"If you are seeing this message, the setting has not been enabled.\n");
393+
} else {
394+
cpu_improperly_configured = true;
389395
}
390396
} else {
391397
LOG(warn) << "Unrecognized freeze_on_smi value " << freeze_on_smi;

src/ReplaySession.cc

+12
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,18 @@ ReplaySession::ReplaySession(const std::string& dir, const Flags& flags)
272272
set_intel_pt_enabled(flags.intel_pt_start_checking_event >= 0);
273273

274274
check_virtual_address_size();
275+
276+
bool cpu_improperly_configured_known, cpu_improperly_configured;
277+
cpu_improperly_configured = trace_in.cpu_improperly_configured(&cpu_improperly_configured_known);
278+
if (cpu_improperly_configured_known && cpu_improperly_configured) {
279+
if (rr::Flags::get().force_things) {
280+
LOG(warn) << "CPU was improperly configured at recording but forcing anyways.";
281+
} else {
282+
CLEAN_FATAL() << "This trace was recorded on a CPU determined to be improperly configured " <<
283+
"but rr's automated checks were overridden by the user. If you really want to " <<
284+
"replay this trace override those checks again with -F";
285+
}
286+
}
275287
}
276288

277289
ReplaySession::ReplaySession(const ReplaySession& other)

src/TraceInfoCommand.cc

+6
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ static int dump_trace_info(const string& trace_dir, FILE* out) {
102102
fprintf(out, " \"maxVirtualAddressSize\":%d,\n", max_virtual_address_size);
103103
}
104104

105+
bool cpu_improperly_configured_known;
106+
bool cpu_improperly_configured = trace.cpu_improperly_configured(&cpu_improperly_configured_known);
107+
if (cpu_improperly_configured_known) {
108+
fprintf(out, " \"cpuImproperlyConfigured\":%s,\n", cpu_improperly_configured ? "true" : "false");
109+
}
110+
105111
if (!trace.uname().sysname.empty()) {
106112
const auto& uname = trace.uname();
107113
fputs(" \"uname\":{", out);

src/TraceStream.cc

+16
Original file line numberDiff line numberDiff line change
@@ -1492,6 +1492,7 @@ void TraceWriter::close(CloseStatus status, const TraceUuid* uuid) {
14921492
header.setRuntimePageSize(page_size());
14931493
header.setPreloadLibraryPageSize(PRELOAD_LIBRARY_PAGE_SIZE);
14941494
header.setMaxVirtualAddressSize(max_virtual_address_size);
1495+
header.setCpuImproperlyConfigured(to_tristate(PerfCounters::improperly_configured()));
14951496

14961497
{
14971498
struct utsname uname_buf;
@@ -1716,6 +1717,21 @@ TraceReader::TraceReader(const string& dir)
17161717
exclusion_range_ = MemoryRange(remote_ptr<void>(header.getExclusionRangeStart()),
17171718
remote_ptr<void>(header.getExclusionRangeEnd()));
17181719

1720+
switch (header.getCpuImproperlyConfigured()) {
1721+
case trace::CpuTriState::UNKNOWN:
1722+
cpu_improperly_configured_known_ = false;
1723+
cpu_improperly_configured_ = false;
1724+
break;
1725+
case trace::CpuTriState::KNOWN_TRUE:
1726+
cpu_improperly_configured_known_ = true;
1727+
cpu_improperly_configured_ = true;
1728+
break;
1729+
case trace::CpuTriState::KNOWN_FALSE:
1730+
cpu_improperly_configured_known_ = true;
1731+
cpu_improperly_configured_ = false;
1732+
break;
1733+
}
1734+
17191735
const auto& uname = header.getUname();
17201736
uname_.sysname = data_to_str(uname.getSysname());
17211737
uname_.nodename = data_to_str(uname.getNodename());

src/TraceStream.h

+6
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,10 @@ class TraceReader : public TraceStream {
506506
uint8_t max_virtual_address_size() const {
507507
return max_virtual_address_size_;
508508
}
509+
bool cpu_improperly_configured(bool* known) const {
510+
*known = cpu_improperly_configured_known_;
511+
return cpu_improperly_configured_;
512+
}
509513

510514
enum TraceQuirks {
511515
// Whether the /proc/<pid>/mem calls were explicitly recorded in this trace
@@ -547,6 +551,8 @@ class TraceReader : public TraceStream {
547551
bool chaos_mode_;
548552
int rrcall_base_;
549553
uint8_t max_virtual_address_size_;
554+
bool cpu_improperly_configured_known_;
555+
bool cpu_improperly_configured_;
550556
uint32_t syscallbuf_fds_disabled_size_;
551557
uint32_t syscallbuf_hdr_size_;
552558
int required_forward_compatibility_version_;

src/rr_trace.capnp

+4
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,10 @@ struct Header {
152152
# size. A value of 0, only present for traces recorded before this was added,
153153
# means the default value for the relevant arch.
154154
maxVirtualAddressSize @28 :UInt8 = 0;
155+
# One of our pre-flight checks found the CPU has issues (e.g. the Zen SpecLockMap
156+
# optimization is not disabled) but the user chose to force recording to
157+
# continue regardless.
158+
cpuImproperlyConfigured @29 :CpuTriState = unknown;
155159
}
156160

157161
# A file descriptor belonging to a task

0 commit comments

Comments
 (0)