diff --git a/gloo/CMakeLists.txt b/gloo/CMakeLists.txt
index 186fe1288..0a14cc629 100644
--- a/gloo/CMakeLists.txt
+++ b/gloo/CMakeLists.txt
@@ -51,6 +51,11 @@ list(APPEND GLOO_HDRS
   "${CMAKE_CURRENT_SOURCE_DIR}/types.h"
   )
 
+if(NOT MSVC)
+  list(APPEND GLOO_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_shm.cc")
+  list(APPEND GLOO_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/allreduce_shm.h")
+endif()
+
 if(USE_CUDA)
   file(GLOB GLOO_CUDA_SRCS
     "${CMAKE_CURRENT_SOURCE_DIR}/cuda*.cc"
diff --git a/gloo/allreduce.cc b/gloo/allreduce.cc
index 080f7f302..e5871ad9a 100644
--- a/gloo/allreduce.cc
+++ b/gloo/allreduce.cc
@@ -12,8 +12,10 @@
 #include <array>
 #include <cstring>
 
+#include "gloo/allreduce_shm.h"
 #include "gloo/common/logging.h"
 #include "gloo/math.h"
+#include "gloo/transport/device.h"
 #include "gloo/types.h"
 
 namespace gloo {
@@ -131,7 +133,15 @@ void allreduce(const detail::AllreduceOptionsImpl& opts) {
     return;
   }
 
-  switch (opts.algorithm) {
+  auto algorithm = opts.algorithm;
+
+#ifndef _WIN32
+  if (context->isIntraNode() && !context->getDevice()->hasGPUDirect()) {
+    algorithm = detail::AllreduceOptionsImpl::SHM;
+  }
+#endif
+
+  switch (algorithm) {
     case detail::AllreduceOptionsImpl::UNSPECIFIED:
     case detail::AllreduceOptionsImpl::RING:
       ring(opts, reduceInputs, broadcastOutputs);
@@ -139,6 +149,11 @@ void allreduce(const detail::AllreduceOptionsImpl& opts) {
     case detail::AllreduceOptionsImpl::BCUBE:
       bcube(opts, reduceInputs, broadcastOutputs);
       break;
+#ifndef _WIN32
+    case detail::AllreduceOptionsImpl::SHM:
+      shm(opts);
+      break;
+#endif
     default:
       GLOO_ENFORCE(false, "Algorithm not handled.");
   }
diff --git a/gloo/allreduce.h b/gloo/allreduce.h
index 904eb8b32..6f6037ede 100644
--- a/gloo/allreduce.h
+++ b/gloo/allreduce.h
@@ -39,6 +39,7 @@ struct AllreduceOptionsImpl {
     UNSPECIFIED = 0,
     RING = 1,
     BCUBE = 2,
+    SHM = 3,
   };
 
   explicit AllreduceOptionsImpl(const std::shared_ptr<Context>& context)
diff --git a/gloo/allreduce_shm.cc b/gloo/allreduce_shm.cc
new file mode 100644
index 000000000..6de573da4
--- /dev/null
+++ b/gloo/allreduce_shm.cc
@@ -0,0 +1,509 @@
+#include "gloo/allreduce_shm.h"
+
+#include <fcntl.h>
+#include <immintrin.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cassert>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <thread>
+
+#include "gloo/common/error.h"
+
+namespace gloo {
+
+namespace {
+
+using ReductionFunction = AllreduceOptions::Func;
+using CollState = AllreduceSharedMemoryData::CollState;
+using AllreduceWorkspace = AllreduceSharedMemoryData::AllreduceWorkspace;
+
+constexpr int VECTOR_LENGTH_IN_BYTES = 32;
+
+// Offsets into AllreduceWorkspace::buffer: the first two regions
+// (NAIVE_ALLREDUCE_THRESHOLD bytes each) form the double buffer for the
+// symmetric algorithm, the next two (MAX_BUF_SIZE bytes each) the double
+// buffer for the distributed algorithm.
+#define BUFFER0_OFFSET(current_buffer) \
+  ((current_buffer) * AllreduceWorkspace::NAIVE_ALLREDUCE_THRESHOLD)
+#define BUFFER1_OFFSET(current_buffer)                  \
+  (2 * AllreduceWorkspace::NAIVE_ALLREDUCE_THRESHOLD + \
+   (current_buffer) * AllreduceWorkspace::MAX_BUF_SIZE)
+
+// SHM building blocks
+struct SharedData {
+  const char* name;
+  int descriptor;
+  void* bytes;
+  size_t nbytes;
+};
+
+void shared_open(SharedData* data, const char* name, size_t nbytes) {
+  int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR);
+  if (d != -1) {
+    void* bytes = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, d, 0);
+    data->name = name;
+    data->descriptor = d;
+    data->bytes = bytes;
+    data->nbytes = nbytes;
+  } else {
+    if (errno != ENOENT) {
+      // Don't print when the segment simply doesn't exist yet; the caller
+      // loops until the other ranks have created their segments.
+      printf("shared_open %s failed, errno=%d\n", name, errno);
+    }
+    data->descriptor = -1;
+  }
+}
+
+void shared_create(
+    SharedData* data,
+    const char* name,
+    void* bytes,
+    size_t nbytes) {
+  int d = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+  if (d != -1) {
+    // Size the segment by writing the initial contents, then map it.
+    if (write(d, bytes, nbytes) == (ssize_t)nbytes) {
+      shared_open(data, name, nbytes);
+    }
+  } else {
+    printf("shared_create %s failed\n", name);
+  }
+}
+
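+// How the two helpers above are combined (a sketch of the rendezvous
+// performed in initialize() below): every rank publishes its workspace
+// under a name derived from a common prefix plus its rank, then polls
+// shared_open() on each peer's name until that segment appears:
+//
+//   SharedData sd;
+//   do {
+//     shared_open(&sd, peer_name, sizeof(AllreduceWorkspace));
+//   } while (sd.descriptor == -1 && errno == ENOENT);
+//
+// peer_name is a stand-in for the "<prefix>_<rank>" names built in
+// initialize().
+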
+// Thread body for the waiter spawned by wait_buffer_state_until_2(): spin
+// until every peer's state in this state group reaches state0 or state1,
+// then signal the condition variable. The timeout itself is enforced by the
+// spawning thread; this loop only exits early when shutdown is set.
+void wait_buffer_state(
+    CollState state0,
+    CollState state1,
+    int state_group,
+    std::chrono::milliseconds timeout,
+    std::shared_ptr<AllreduceSharedMemoryData> shm_data) {
+  auto workspace = shm_data->workspace;
+  const int rank = shm_data->rank;
+  const int world_size = shm_data->world_size;
+
+  for (int i = 0; i < world_size; i++) {
+    if (i == rank) {
+      continue;
+    }
+    volatile CollState* state_ptr = &(workspace[i]->states[state_group]);
+
+    while (true) {
+      volatile CollState cur_state = *state_ptr;
+      if (cur_state == state0 || cur_state == state1) {
+        break;
+      }
+      if (shm_data->shutdown) {
+        return;
+      }
+    }
+  }
+
+  std::unique_lock<std::mutex> lock(shm_data->m);
+  shm_data->wait_done = true;
+  lock.unlock();
+  shm_data->cv.notify_one();
+}
+
+void wait_buffer_state_until_2(
+    CollState state0,
+    CollState state1,
+    int state_group,
+    std::chrono::milliseconds timeout,
+    std::shared_ptr<AllreduceSharedMemoryData> shm_data) {
+  shm_data->wait_done = false;
+  shm_data->shutdown = false;
+
+  // Spin on the peers' states from a helper thread so this thread can
+  // abandon the wait when the timeout expires.
+  std::thread t(
+      wait_buffer_state, state0, state1, state_group, timeout, shm_data);
+
+  std::unique_lock<std::mutex> lock(shm_data->m);
+  auto done =
+      shm_data->cv.wait_for(lock, timeout, [&] { return shm_data->wait_done; });
+  if (!done) {
+    shm_data->shutdown = true;
+    t.join();
+    throw ::gloo::IoException(GLOO_ERROR_MSG(
+        "Timed out waiting ",
+        timeout.count(),
+        "ms for wait buffer state operation to complete"));
+  } else {
+    t.join();
+  }
+}
+
+// Reduce the given span of every rank's buffer into to_buffer.
+// Precondition: to_buffer already holds the contribution that
+// buffers[to_buffer_idx] was copied from, so that index is skipped rather
+// than read again (to_buffer may alias buffers[to_buffer_idx]).
+void reduce_all_buffers(
+    int start_elements,
+    int num_elements,
+    int element_size,
+    int to_buffer_idx,
+    int world_size,
+    char* to_buffer,
+    char** buffers,
+    ReductionFunction fn) {
+  size_t offset = start_elements * element_size;
+  for (int i = 0; i < world_size; i++) {
+    if (i == to_buffer_idx) {
+      continue;
+    }
+    fn(to_buffer + offset,
+       to_buffer + offset,
+       buffers[i] + offset,
+       num_elements);
+  }
+}
+
+// Copy with 256-bit vector loads/stores, parallelized across OpenMP threads.
+static void parallel_memcpy(void* to, void* from, size_t n_bytes)
+    __attribute__((target("avx512bw")));
+static void parallel_memcpy(void* to, void* from, size_t n_bytes) {
+  auto aligned_bytes = n_bytes - (n_bytes % VECTOR_LENGTH_IN_BYTES);
+  // process aligned part
+#pragma omp parallel for
+  for (size_t i = 0; i < aligned_bytes; i += VECTOR_LENGTH_IN_BYTES) {
+    auto val = _mm256_loadu_si256((__m256i*)((char*)from + i));
+    _mm256_storeu_si256((__m256i*)((char*)to + i), val);
+  }
+
+  // process remaining part
+  for (size_t i = aligned_bytes; i < n_bytes; i++) {
+    *((char*)to + i) = *((char*)from + i);
+  }
+}
+
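+// The distributed path splits a chunk of chunk_el elements into world_size
+// contiguous slices; the last rank absorbs the remainder. For example, with
+// chunk_el = 10 and world_size = 4, the slices start at elements 0, 2, 4, 6
+// and have sizes 2, 2, 2, 4.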
+size_t slice_size(size_t chunk_el, int slice_idx, int world_size) {
+  size_t slice_size = chunk_el / world_size;
+  return slice_idx == world_size - 1 ? slice_size + (chunk_el % world_size)
+                                     : slice_size;
+}
+
+char* slice_data(
+    char* data_ptr,
+    size_t chunk_el,
+    int el_size,
+    int slice_idx,
+    int world_size) {
+  size_t slice_size = chunk_el / world_size;
+  size_t el_offset = slice_size * slice_idx;
+  return data_ptr + el_offset * el_size;
+}
+
+size_t slice_el_start(size_t chunk_el, int slice_idx, int world_size) {
+  size_t slice_size = chunk_el / world_size;
+  return slice_size * slice_idx;
+}
+
+void symmetric_naive_all_reduce(
+    char* data_ptr,
+    int element_size,
+    size_t chunk_size,
+    size_t chunk_el,
+    const detail::AllreduceOptionsImpl& opts) {
+  const auto& context = opts.context;
+  auto& shm_data = context->shmData;
+  const int rank = shm_data->rank;
+  const int world_size = shm_data->world_size;
+  auto symmetric_buffer = shm_data->symmetric_buffer;
+  auto workspace = shm_data->workspace;
+
+  const int state_group = 0;
+  auto& state_idx = shm_data->state_idx[state_group];
+  auto& current_buffer = shm_data->current_buffer[state_group];
+
+  CollState copy_current, copy_next;
+
+  switch (state_idx) {
+    case 0:
+      copy_current = CollState::coll_allreduce_naive__copy_in_done;
+      copy_next = CollState::coll_alt1_allreduce_naive__copy_in_done;
+      break;
+    case 1:
+      copy_current = CollState::coll_alt1_allreduce_naive__copy_in_done;
+      copy_next = CollState::coll_alt2_allreduce_naive__copy_in_done;
+      break;
+    case 2:
+      copy_current = CollState::coll_alt2_allreduce_naive__copy_in_done;
+      copy_next = CollState::coll_allreduce_naive__copy_in_done;
+      break;
+    default:
+      assert(!"Should not get here.");
+  }
+  state_idx = (state_idx + 1) % 3;
+
+  parallel_memcpy(symmetric_buffer[current_buffer][rank], data_ptr, chunk_size);
+
+  std::atomic_thread_fence(std::memory_order_release);
+  workspace[rank]->states[state_group] = copy_current;
+
+  wait_buffer_state_until_2(
+      copy_current, copy_next, state_group, opts.timeout, shm_data);
+
+  // each rank reduces the buffers independently, so there is no need for
+  // synchronization afterward
+  reduce_all_buffers(
+      0,
+      chunk_el,
+      element_size,
+      rank,
+      world_size,
+      data_ptr,
+      symmetric_buffer[current_buffer],
+      opts.reduce);
+
+  // switch buffer
+  current_buffer = 1 - current_buffer;
+}
+
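+// Why three rotating copy-in states (a reading of the code above, not an
+// authoritative spec): the symmetric path has a single barrier per round, so
+// a fast rank may publish round k+1's state while a slow rank is still
+// polling in round k. The wait accepts the current round's state or the next
+// round's, so with only two states a value left over from round k would be
+// indistinguishable from "already in round k+1" and could satisfy the wrong
+// barrier; rotating through three states keeps adjacent rounds distinct.
+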
+// naive allreduce, distributed: each rank does a naive reduce on its slice
+void distributed_naive_reduce(
+    char* data_ptr,
+    int element_size,
+    size_t chunk_size,
+    size_t chunk_el,
+    const detail::AllreduceOptionsImpl& opts) {
+  const auto& context = opts.context;
+  auto& shm_data = context->shmData;
+  const int rank = shm_data->rank;
+  const int world_size = shm_data->world_size;
+  auto distributed_buffer = shm_data->distributed_buffer;
+  auto workspace = shm_data->workspace;
+
+  const int state_group = 1;
+  auto& state_idx = shm_data->state_idx[state_group];
+  auto& current_buffer = shm_data->current_buffer[state_group];
+
+  CollState copy_current, copy_next, reduce_current;
+
+  // similar to symmetric_naive_all_reduce, but only two sets of states are
+  // needed here because distributed naive reduce has two barriers per round
+  switch (state_idx) {
+    case 0:
+      copy_current = CollState::coll_allreduce_naive__copy_in_done;
+      reduce_current = CollState::coll_allreduce_naive__reduce_done;
+      copy_next = CollState::coll_alt1_allreduce_naive__copy_in_done;
+      break;
+    case 1:
+      copy_current = CollState::coll_alt1_allreduce_naive__copy_in_done;
+      reduce_current = CollState::coll_alt1_allreduce_naive__reduce_done;
+      copy_next = CollState::coll_allreduce_naive__copy_in_done;
+      break;
+    default:
+      assert(!"Should not get here.");
+  }
+  state_idx = (state_idx + 1) % 2;
+
+  parallel_memcpy(
+      distributed_buffer[current_buffer][rank], data_ptr, chunk_size);
+  std::atomic_thread_fence(std::memory_order_release);
+  workspace[rank]->states[state_group] = copy_current;
+
+  // wait until all ranks have copied their chunk in
+  wait_buffer_state_until_2(
+      copy_current, reduce_current, state_group, opts.timeout, shm_data);
+
+  // reduce scatter: each rank reduces its own slice in place
+  reduce_all_buffers(
+      slice_el_start(chunk_el, rank, world_size),
+      slice_size(chunk_el, rank, world_size),
+      element_size,
+      rank,
+      world_size,
+      distributed_buffer[current_buffer][rank],
+      distributed_buffer[current_buffer],
+      opts.reduce);
+  std::atomic_thread_fence(std::memory_order_release);
+  workspace[rank]->states[state_group] = reduce_current;
+
+  // wait until all ranks have reduced their slice (a rank that has already
+  // moved on to the next round shows copy_next)
+  wait_buffer_state_until_2(
+      reduce_current, copy_next, state_group, opts.timeout, shm_data);
+
+  // all gather: copy every rank's reduced slice out, starting with our own
+  // slice so the ranks spread their reads across the buffers
+  for (int i = 0; i < world_size; i++) {
+    int r = (i + rank) % world_size;
+    parallel_memcpy(
+        slice_data(data_ptr, chunk_el, element_size, r, world_size),
+        slice_data(
+            distributed_buffer[current_buffer][r],
+            chunk_el,
+            element_size,
+            r,
+            world_size),
+        slice_size(chunk_el, r, world_size) * element_size);
+  }
+
+  current_buffer = 1 - current_buffer;
+}
+
+} // namespace
+
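+// One-time setup per process, performed by initialize() below: build a name
+// prefix that is unique per user and job (uid plus MASTER_ADDR/MASTER_PORT,
+// when set), create and map this rank's workspace segment, poll until every
+// peer's segment can be opened, and record the per-rank pointers into the
+// symmetric and distributed double buffers.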
+void AllreduceSharedMemoryData::initialize() {
+  std::string addr_string(""), port_string("");
+  const char* addr_string_env = std::getenv("MASTER_ADDR");
+  if (addr_string_env != nullptr) {
+    addr_string = addr_string_env;
+  }
+  const char* port_string_env = std::getenv("MASTER_PORT");
+  if (port_string_env != nullptr) {
+    port_string = port_string_env;
+  }
+
+  char shm_name_prefix[AllreduceWorkspace::NAME_BUF_SIZE];
+  char shm_name[AllreduceWorkspace::NAME_BUF_SIZE];
+  snprintf(
+      shm_name_prefix,
+      AllreduceWorkspace::NAME_BUF_SIZE,
+      "%s_%d_%s_%s",
+      "shm_allreduce_buffer",
+      getuid(),
+      addr_string.c_str(),
+      port_string.c_str());
+
+  // create shared workspace for SHM based allreduce:
+  // allocate the staging workspace for the current rank
+  AllreduceWorkspace* workspace_buf;
+  AllreduceWorkspace* workspace_buf_other;
+  SharedData allreduce_buffer;
+  cur_workspace = (AllreduceWorkspace*)malloc(sizeof(AllreduceWorkspace));
+  // zero the staging copy so peers never observe garbage states
+  memset(cur_workspace, 0, sizeof(AllreduceWorkspace));
+  workspace_buf = cur_workspace;
+
+  int written = snprintf(
+      shm_name,
+      AllreduceWorkspace::NAME_BUF_SIZE,
+      "%s_%d",
+      shm_name_prefix,
+      rank);
+  if (written >= AllreduceWorkspace::NAME_BUF_SIZE) {
+    std::cout << "[warning]: written >= NAME_BUF_SIZE" << std::endl;
+  }
+
+  shared_create(
+      &allreduce_buffer, shm_name, workspace_buf, sizeof(AllreduceWorkspace));
+
+  workspace_buf = (AllreduceWorkspace*)allreduce_buffer.bytes;
+  workspace_buf->states[0] = coll_alt2_allreduce_naive__copy_in_done;
+  workspace_buf->states[1] = coll_begin;
+  workspace_buf->fd = allreduce_buffer.descriptor;
+  strcpy(workspace_buf->name, shm_name);
+
+  // create the workspace pointer list
+  workspace =
+      (AllreduceWorkspace**)malloc(world_size * sizeof(AllreduceWorkspace*));
+  symmetric_buffer[0] = (char**)malloc(world_size * sizeof(char*));
+  symmetric_buffer[1] = (char**)malloc(world_size * sizeof(char*));
+  distributed_buffer[0] = (char**)malloc(world_size * sizeof(char*));
+  distributed_buffer[1] = (char**)malloc(world_size * sizeof(char*));
+
+  // map the shm segments of all ranks
+  for (int i = 0; i < world_size; i++) {
+    if (i != rank) {
+      int written = snprintf(
+          shm_name,
+          AllreduceWorkspace::NAME_BUF_SIZE,
+          "%s_%d",
+          shm_name_prefix,
+          i);
+      if (written >= AllreduceWorkspace::NAME_BUF_SIZE) {
+        std::cout << "[warning]: written >= NAME_BUF_SIZE" << std::endl;
+      }
+
+      do {
+        shared_open(&allreduce_buffer, shm_name, sizeof(AllreduceWorkspace));
+      } while (allreduce_buffer.descriptor == -1 && errno == ENOENT);
+      workspace_buf_other = (AllreduceWorkspace*)allreduce_buffer.bytes;
+      workspace[i] = workspace_buf_other;
+      // the mapping stays valid after the descriptor is closed
+      close(allreduce_buffer.descriptor);
+    } else {
+      workspace[i] = workspace_buf;
+    }
+    symmetric_buffer[0][i] = workspace[i]->buffer + BUFFER0_OFFSET(0);
+    symmetric_buffer[1][i] = workspace[i]->buffer + BUFFER0_OFFSET(1);
+    distributed_buffer[0][i] = workspace[i]->buffer + BUFFER1_OFFSET(0);
+    distributed_buffer[1][i] = workspace[i]->buffer + BUFFER1_OFFSET(1);
+  }
+  is_initialized = true;
+}
+
+AllreduceSharedMemoryData::~AllreduceSharedMemoryData() {
+  if (is_initialized) {
+    // unmap and unlink the shared memory of all ranks
+    for (int i = 0; i < world_size; i++) {
+      std::string shm_name = std::string(workspace[i]->name);
+      if (i == rank) {
+        // fd values are per-process; only our own descriptor is valid here
+        close(workspace[i]->fd);
+      }
+      munmap(workspace[i], sizeof(AllreduceWorkspace));
+      shm_unlink(shm_name.c_str());
+    }
+
+    free(cur_workspace);
+    free(workspace);
+    free(symmetric_buffer[0]);
+    free(symmetric_buffer[1]);
+    free(distributed_buffer[0]);
+    free(distributed_buffer[1]);
+  }
+}
+
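+// Entry point for the SHM algorithm. Rough flow: lazily set up the shared
+// workspace on first use, reduce multiple inputs/outputs locally, then walk
+// the data in MAX_BUF_SIZE chunks, taking the single-barrier symmetric path
+// for chunks below NAIVE_ALLREDUCE_THRESHOLD and the reduce-scatter plus
+// all-gather distributed path otherwise, and finally fan the result out to
+// any additional output buffers.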
+void shm(const detail::AllreduceOptionsImpl& opts) {
+  const auto& context = opts.context;
+  if (context->shmData == nullptr) {
+    context->shmData = std::make_shared<AllreduceSharedMemoryData>(
+        context->rank, context->size);
+    context->shmData->initialize();
+  }
+  const size_t data_size = opts.elements * opts.elementSize;
+  auto& in = opts.in;
+  auto& out = opts.out;
+
+  // Do local reduction
+  if (in.size() > 0) {
+    if (in.size() == 1) {
+      memcpy(
+          static_cast<char*>(out[0]->ptr),
+          static_cast<char*>(in[0]->ptr),
+          data_size);
+    } else {
+      opts.reduce(
+          static_cast<char*>(out[0]->ptr),
+          static_cast<char*>(in[0]->ptr),
+          static_cast<char*>(in[1]->ptr),
+          opts.elements);
+      for (size_t i = 2; i < in.size(); i++) {
+        opts.reduce(
+            static_cast<char*>(out[0]->ptr),
+            static_cast<char*>(out[0]->ptr),
+            static_cast<char*>(in[i]->ptr),
+            opts.elements);
+      }
+    }
+  } else {
+    for (size_t i = 1; i < out.size(); i++) {
+      opts.reduce(
+          static_cast<char*>(out[0]->ptr),
+          static_cast<char*>(out[0]->ptr),
+          static_cast<char*>(out[i]->ptr),
+          opts.elements);
+    }
+  }
+
+  void* data = out[0]->ptr;
+
+  for (size_t offset = 0; offset < data_size;
+       offset += AllreduceWorkspace::MAX_BUF_SIZE) {
+    auto data_ptr = ((char*)(data) + offset);
+    size_t chunk_size = data_size - offset > AllreduceWorkspace::MAX_BUF_SIZE
+        ? AllreduceWorkspace::MAX_BUF_SIZE
+        : data_size - offset;
+    size_t chunk_el = chunk_size / opts.elementSize;
+    if (chunk_size < AllreduceWorkspace::NAIVE_ALLREDUCE_THRESHOLD) {
+      symmetric_naive_all_reduce(
+          data_ptr, opts.elementSize, chunk_size, chunk_el, opts);
+    } else {
+      distributed_naive_reduce(
+          data_ptr, opts.elementSize, chunk_size, chunk_el, opts);
+    }
+  }
+
+  if (out.size() > 1) {
+    for (size_t i = 1; i < out.size(); i++) {
+      memcpy(
+          static_cast<char*>(out[i]->ptr),
+          static_cast<char*>(out[0]->ptr),
+          data_size);
+    }
+  }
+}
+
+} // namespace gloo
diff --git a/gloo/allreduce_shm.h b/gloo/allreduce_shm.h
new file mode 100644
index 000000000..a7b92b1c3
--- /dev/null
+++ b/gloo/allreduce_shm.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+
+#include "gloo/allreduce.h"
+
+namespace gloo {
+
+struct AllreduceSharedMemoryData {
+  enum CollState {
+    coll_begin = 0,
+    coll_allreduce_naive__copy_in_done,
+    coll_allreduce_naive__reduce_done,
+    // alternative states used while allreduce works on the other buffers
+    // of the double-buffer rotation
+    coll_alt1_allreduce_naive__copy_in_done,
+    coll_alt2_allreduce_naive__copy_in_done,
+    coll_alt1_allreduce_naive__reduce_done,
+  };
+
+  struct AllreduceWorkspace {
+    static constexpr size_t MAX_BUF_SIZE = 1048576 * 32;
+    static constexpr size_t NAIVE_ALLREDUCE_THRESHOLD = 1048576;
+    static constexpr int NAME_BUF_SIZE = 1000;
+
+    int fd;
+    // idx=0 -- state for symmetric_naive_all_reduce
+    // idx=1 -- state for distributed_naive_reduce
+    enum CollState states[2];
+    char name[NAME_BUF_SIZE];
+    // double buffers to avoid syncing between rounds:
+    // offset 0 .. 2*NAIVE_ALLREDUCE_THRESHOLD: buffers for
+    // symmetric_naive_all_reduce; after that: buffers for
+    // distributed_naive_reduce
+    char buffer[2 * NAIVE_ALLREDUCE_THRESHOLD + 2 * MAX_BUF_SIZE];
+  };
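+
+  // Layout of `buffer`, as carved up by the BUFFER0_OFFSET/BUFFER1_OFFSET
+  // macros in allreduce_shm.cc (shown with the default sizes above):
+  //
+  //   [ 0,  1 MiB)  symmetric buffer 0
+  //   [ 1,  2 MiB)  symmetric buffer 1
+  //   [ 2, 34 MiB)  distributed buffer 0
+  //   [34, 66 MiB)  distributed buffer 1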
+
+  AllreduceSharedMemoryData(int rank, int world_size)
+      : rank(rank),
+        world_size(world_size),
+        current_buffer{0, 0},
+        state_idx{0, 0},
+        is_initialized(false) {}
+  ~AllreduceSharedMemoryData();
+  void initialize();
+
+  int rank;
+  int world_size;
+  // per-algorithm double-buffer index and state rotation index
+  // (index 0: symmetric path, index 1: distributed path)
+  int current_buffer[2];
+  int state_idx[2];
+  bool is_initialized;
+
+  AllreduceWorkspace* cur_workspace;
+  AllreduceWorkspace** workspace;
+  // buffers for small messages, double buffered
+  char** symmetric_buffer[2];
+  // buffers for large messages, double buffered
+  char** distributed_buffer[2];
+
+  std::mutex m;
+  std::condition_variable cv;
+  bool wait_done;
+  bool shutdown;
+};
+
+void shm(const detail::AllreduceOptionsImpl& opts);
+
+} // namespace gloo
diff --git a/gloo/context.cc b/gloo/context.cc
index fd9b83c7b..367ad88b4 100644
--- a/gloo/context.cc
+++ b/gloo/context.cc
@@ -67,4 +67,8 @@ std::chrono::milliseconds Context::getTimeout() const {
   return timeout_;
 }
 
+bool Context::isIntraNode() const {
+  return transportContext_->isIntraNode();
+}
+
 } // namespace gloo
diff --git a/gloo/context.h b/gloo/context.h
index ddd6221b7..c114b8866 100644
--- a/gloo/context.h
+++ b/gloo/context.h
@@ -24,6 +24,8 @@ class Device;
 class UnboundBuffer;
 } // namespace transport
 
+class AllreduceSharedMemoryData;
+
 class Context {
  public:
   Context(int rank, int size, int base = 2);
@@ -33,6 +35,8 @@ class Context {
   const int size;
   int base;
 
+  std::shared_ptr<AllreduceSharedMemoryData> shmData;
+
   std::shared_ptr<transport::Device>& getDevice();
 
   std::unique_ptr<transport::Pair>& getPair(int i);
@@ -52,6 +56,8 @@ class Context {
 
   std::chrono::milliseconds getTimeout() const;
 
+  bool isIntraNode() const;
+
   std::unique_ptr<transport::RemoteKey> deserializeRemoteKey(
       const std::string& serialized) {
     return transportContext_->deserializeRemoteKey(serialized);
diff --git a/gloo/transport/context.cc b/gloo/transport/context.cc
index 4b4cfadbf..4b517eba5 100644
--- a/gloo/transport/context.cc
+++ b/gloo/transport/context.cc
@@ -42,18 +42,21 @@ void Context::createAndConnectAllPairs(std::shared_ptr<IStore> store) {
   const std::vector<char> value(localHostName.begin(), localHostName.end());
   store->set(localKey, value);
 
+  intraNode_ = true;
   for (int i = 0; i < size; i++) {
     if (i == rank) {
-      break;
+      continue;
     }
 
     std::string key("rank_" + std::to_string(i));
-    auto val = store->get(key);
+    auto val = store->wait_get(key, getTimeout());
    auto hostName = std::string((const char*)val.data(), val.size());
 
-    if (hostName == localHostName) {
+    if (i < rank && hostName == localHostName) {
       localRank++;
     }
+
+    intraNode_ = intraNode_ && hostName == localHostName;
   }
 
   // Create pairs
diff --git a/gloo/transport/context.h b/gloo/transport/context.h
index 1733b7979..56d8ad8ec 100644
--- a/gloo/transport/context.h
+++ b/gloo/transport/context.h
@@ -68,6 +68,10 @@ class Context {
     return timeout_;
   }
 
+  bool isIntraNode() const {
+    return intraNode_;
+  }
+
   virtual std::unique_ptr<RemoteKey> deserializeRemoteKey(
       const std::string& serialized) {
     throw std::runtime_error("Not implemented");
@@ -98,6 +102,9 @@ class Context {
   // any kind of send/recv operation.
   std::chrono::milliseconds timeout_;
 
+  // Whether all the ranks in this context run on the same host.
+  bool intraNode_ = false;
+
   std::vector<char> extractAddress(const std::vector<char>& allAddrs, int i)
       const;
diff --git a/gloo/transport/tcp/context.cc b/gloo/transport/tcp/context.cc
index 422896b64..e380ecc05 100644
--- a/gloo/transport/tcp/context.cc
+++ b/gloo/transport/tcp/context.cc
@@ -123,6 +123,7 @@ void Context::createAndConnectAllPairs(std::shared_ptr<IStore> store) {
     });
   }
 
+  intraNode_ = true;
   // Connect every pair
   for (int i = 0; i < size; i++) {
     if (i == rank) {
@@ -140,6 +141,8 @@ void Context::createAndConnectAllPairs(std::shared_ptr<IStore> store) {
       ++localRank;
     }
 
+    intraNode_ = intraNode_ && (remoteRankInfo.hostname == localHostName);
+
     const auto& pair = pairs_[i];
     auto remoteDeviceAddr = Address(remoteRankInfo.addressBytes).getSockaddr();