Skip to content

Commit 75149e5

Browse files
committed
Add check for STALL tensors
1 parent c945a41 commit 75149e5

File tree

3 files changed

+61
-2
lines changed

3 files changed

+61
-2
lines changed

bluefog/common/global_state.h

+3
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ struct BluefogGlobalState {
6666
// Time point when last cycle started.
6767
std::chrono::steady_clock::time_point last_cycle_start;
6868

69+
// Time point when coordinator last checked for stalled tensors.
70+
std::chrono::steady_clock::time_point last_stall_check;
71+
6972
std::shared_ptr<MPIController> controller;
7073

7174
#if HAVE_NCCL

bluefog/common/operations.cc

+58-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
#define COORDINATE_RANK 0
4242
#define BLUEFOG_TIMELINE "BLUEFOG_TIMELINE"
4343
#define BLUEFOG_CYCLE_TIME "BLUEFOG_CYCLE_TIME"
44+
// How long a tensor may wait before it is reported as stalled; also used as
// the minimum interval between two stall checks.
45+
#define STALL_WARNING_TIME std::chrono::seconds(15)
4446

4547
namespace bluefog {
4648
namespace common {
@@ -313,6 +315,55 @@ Response ConstructResponse(MessageTable* message_table, std::string name) {
313315
return response;
314316
}
315317

318+
// Report Tensors that were submitted to be reduced, gathered or broadcasted by
319+
// some ranks but not others and are waiting for long time to get processed.
320+
void CheckForStalledTensors(BluefogGlobalState& state) {
321+
bool preamble = false;
322+
auto now = std::chrono::steady_clock::now();
323+
for (auto& m : *state.message_table) {
324+
auto tensor_name = m.first;
325+
std::vector<Request>& messages = std::get<0>(m.second);
326+
std::chrono::steady_clock::time_point start_at = std::get<1>(m.second);
327+
328+
if (now - start_at > STALL_WARNING_TIME) {
329+
if (!preamble) {
330+
std::cerr << "WARNING: One or more tensors were submitted to be "
331+
"reduced, gathered or broadcasted by subset of ranks and "
332+
"are waiting for remainder of ranks for more than "
333+
<< std::chrono::duration_cast<std::chrono::seconds>(
334+
STALL_WARNING_TIME)
335+
.count()
336+
<< " seconds. ";
337+
std::cerr << "This may indicate that different ranks are trying to "
338+
"submit different tensors or that only subset of ranks is "
339+
"submitting tensors, which will cause deadlock. " << std::endl;
340+
std::cerr << "Stalled ops:" << std::endl;
341+
preamble = true;
342+
}
343+
std::cerr << tensor_name;
344+
std::cerr << " [missing ranks:";
345+
std::unordered_set<int32_t> ready_ranks;
346+
bool missing_preamble = false;
347+
for (auto msg_iter = messages.begin(); msg_iter != messages.end();
348+
msg_iter++) {
349+
ready_ranks.insert(msg_iter->request_rank());
350+
}
351+
for (int32_t rank = 0; rank < mpi_context.size_; rank++) {
352+
if (ready_ranks.find(rank) == ready_ranks.end()) {
353+
if (!missing_preamble) {
354+
std::cerr << " ";
355+
missing_preamble = true;
356+
} else {
357+
std::cerr << ", ";
358+
}
359+
std::cerr << rank;
360+
}
361+
}
362+
std::cerr << "]" << std::endl;
363+
}
364+
}
365+
}
366+
316367
bool RunLoopOnce(BluefogGlobalState& state);
317368

318369
void BackgroundThreadLoop(BluefogGlobalState& state) {
@@ -777,7 +828,13 @@ bool RunLoopOnce(BluefogGlobalState& state) {
777828
state.tensor_queue.GetTensorEntriesFromResponse(response, entries);
778829
// TODO: tensor fusion logics?
779830
}
780-
// TODO: Check for stalled tensors.
831+
832+
// Check for stalled tensors.
833+
if (std::chrono::steady_clock::now() - state.last_stall_check >
834+
STALL_WARNING_TIME) {
835+
CheckForStalledTensors(state);
836+
state.last_stall_check = std::chrono::steady_clock::now();
837+
}
781838
} else {
782839
std::string encoded_message;
783840
RequestList message_list;

test/torch_win_ops_test.py

-1
Original file line numberDiff line numberDiff line change
@@ -602,7 +602,6 @@ def test_win_mutex_full(self):
602602
assert (t_end - t_start) < 2, \
603603
"The mutex acquire time should be shorter than 2 second"
604604

605-
@unittest.skip
606605
def test_win_mutex_given_ranks(self):
607606
size = bf.size()
608607
rank = bf.rank()

0 commit comments

Comments
 (0)