Skip to content

Commit

Permalink
WIP (hanging) - enable larger channel buffer slot
Browse files Browse the repository at this point in the history
  • Loading branch information
SeanNijjar committed Feb 19, 2025
1 parent b37e838 commit 5a14c57
Show file tree
Hide file tree
Showing 7 changed files with 172 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ auto forward_to_fabric_from_cb(
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size));
}

uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t)));
uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * sender.buffer_size_bytes);
sender.send_payload_blocking_from_address(packet_addr, packet_size);
noc_async_writes_flushed();
// }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,8 @@ bool RunLoopbackTest(
// EDM Builder Setup
////////////////////////////////////////////////////////////////////////////

static constexpr std::size_t edm_buffer_size = 4096 + PACKET_HEADER_SIZE_BYTES;
static constexpr std::size_t edm_buffer_size =
ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES;

auto chip0_worker_fabric_connection = chip_0_edm_builder.build_connection_to_worker_channel();
////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -910,7 +911,8 @@ bool RunLineFabricTest(
std::size_t page_plus_header_size = page_size + sizeof(tt::fabric::PacketHeader);
std::size_t tensor_size_bytes = num_pages_total * page_size;

static constexpr std::size_t edm_buffer_size = 4096 + PACKET_HEADER_SIZE_BYTES;
static constexpr std::size_t edm_buffer_size =
ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES;
const size_t local_chip_id = 0;
const size_t remote_chip_id = 1;
auto program_ptrs = std::vector<Program*>(devices.size());
Expand Down Expand Up @@ -1237,7 +1239,8 @@ int TestLoopbackEntrypoint(
IDevice* sender_device = device_0;
IDevice* receiver_device = device_1;

static constexpr std::size_t edm_buffer_size = 4096 + PACKET_HEADER_SIZE_BYTES;
static constexpr std::size_t edm_buffer_size =
ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES;
const chip_id_t local_chip_id = 0;
const chip_id_t remote_chip_id = 1;
auto const& edm_config = ttnn::ccl::FabricEriscDatamoverConfig(edm_buffer_size, 1, 2);
Expand Down Expand Up @@ -2988,7 +2991,8 @@ void RunWriteThroughputStabilityTestWithPersistentFabric(
static constexpr uint32_t source_payload_cb_index = tt::CB::c_in1;
static constexpr size_t packet_header_cb_size_in_headers = 4;
static constexpr bool enable_persistent_fabric_mode = true;
static constexpr size_t packet_payload_size_bytes = 4096;
static constexpr size_t packet_payload_size_bytes =
ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes;
static constexpr size_t dest_buffer_size = packet_payload_size_bytes * 4;
static constexpr tt::DataFormat cb_df = tt::DataFormat::Bfp8;

Expand Down Expand Up @@ -3114,7 +3118,8 @@ void RunWriteThroughputStabilityTestWithPersistentFabric(

TT_FATAL(
local_device_fabric_handle.get_num_links() == num_links,
"Error in test setup. Expected two links between devices but got {} links for device {}",
"Error in test setup. Expected {} links between devices but got {} links for device {}",
num_links,
local_device_fabric_handle.get_num_links(),
device->id());

Expand Down
154 changes: 120 additions & 34 deletions ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace ttnn::ccl {
//

FabricEriscDatamoverConfig::FabricEriscDatamoverConfig(
std::size_t channel_buffer_size_bytes, std::size_t sender_ratio_size, std::size_t receiver_ratio_size) {
std::size_t preferred_channel_buffer_size_bytes, std::size_t sender_ratio_size, std::size_t receiver_ratio_size) {
TT_FATAL(
(receiver_completed_packet_header_cb_address % eth_word_l1_alignment == 0),
"receiver_completed_packet_header_cb_address must be aligned to 16 bytes");
Expand Down Expand Up @@ -73,44 +73,103 @@ FabricEriscDatamoverConfig::FabricEriscDatamoverConfig(
"receiver_completed_packet_header_cb_address must be aligned to 16 bytes");

TT_FATAL(sender_channel_1_buffer_index_address != sender_channel_0_buffer_index_address, "FabricEriscDatamoverConfig was constructed with illegal buffer index address");
const size_t min_buffer_size = sizeof(tt::fabric::PacketHeader) + 2 * FabricEriscDatamoverConfig::eth_channel_sync_size;
TT_FATAL(channel_buffer_size_bytes >= min_buffer_size, "FabricEriscDatamoverConfig was constructed with `channel_buffer_size_bytes` argument set smaller than minimum size of {}", min_buffer_size);

constexpr size_t default_pow2_num_sender_buffer_slots = 8;
constexpr size_t default_pow2_num_receiver_buffer_slots = 16;

const std::size_t channel_buffer_size_with_channel_sync =
channel_buffer_size_bytes + sizeof(tt::fabric::PacketHeader); // + 16 // sizeof(tt::fabric::PacketHeader);
const size_t min_buffer_size = sizeof(tt::fabric::PacketHeader) + FabricEriscDatamoverConfig::eth_channel_sync_size;
TT_FATAL(
preferred_channel_buffer_size_bytes >= min_buffer_size,
"FabricEriscDatamoverConfig was constructed with `preferred_channel_buffer_size_bytes` argument set smaller "
"than minimum size of {}",
min_buffer_size);

const size_t next_lowest_power_of_2_buffer_slot_count =
// constexpr size_t default_pow2_num_sender_buffer_slots = 8;
// constexpr size_t default_pow2_num_receiver_buffer_slots = 16;
// See if we can thread in the constant from HAL

this->channel_buffer_size_bytes = channel_buffer_size_bytes;
this->channel_buffer_size_bytes_with_channel_sync = channel_buffer_size_with_channel_sync;
const std::size_t total_ratio_count = 2 * sender_ratio_size + receiver_ratio_size;

this->sender_0_channel_size_bytes = tt::round_down(
(available_channel_buffering_space / total_ratio_count) * sender_ratio_size,
channel_buffer_size_with_channel_sync);
if constexpr (FabricEriscDatamoverConfig::constrain_to_power_of_2_buffer_slot_counts) {
this->sender_0_num_buffers = default_pow2_num_sender_buffer_slots;
} else {
this->sender_0_num_buffers = this->sender_0_channel_size_bytes / channel_buffer_size_with_channel_sync;
}
this->sender_1_channel_size_bytes = tt::round_down(
(available_channel_buffering_space / total_ratio_count) * sender_ratio_size,
channel_buffer_size_with_channel_sync);
if constexpr (FabricEriscDatamoverConfig::constrain_to_power_of_2_buffer_slot_counts) {
this->sender_1_num_buffers = default_pow2_num_sender_buffer_slots;
} else {
this->sender_1_num_buffers = this->sender_1_channel_size_bytes / channel_buffer_size_with_channel_sync;
}
this->receiver_channel_size_bytes = tt::round_down(
(available_channel_buffering_space / total_ratio_count) * receiver_ratio_size,
channel_buffer_size_with_channel_sync);
if constexpr (FabricEriscDatamoverConfig::constrain_to_power_of_2_buffer_slot_counts) {
this->receiver_num_buffers = default_pow2_num_receiver_buffer_slots;
constexpr size_t min_desired_packet_payload_size_bytes = 1088 * 4;
constexpr size_t max_packet_payload_size_bytes = 8192;
constexpr size_t min_desired_packet_payload_size_bytes_with_header =
min_desired_packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader);
constexpr size_t max_packet_payload_size_bytes_with_header =
max_packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader);

// Returns the largest power of two that is less than or equal to `x`
// (e.g. 1 -> 1, 5 -> 4, 8 -> 8). `x` must be non-zero; a zero argument is
// reported through TT_FATAL because no power of two is <= 0.
auto round_down_to_power_of_2 = [](size_t x) {
    TT_FATAL(
        x > 0,
        "Cannot compute next lowest power of 2 for 0. Internal error when setting up "
        "FabricEriscDatamoverConfig");
    size_t largest_pow2_not_above_x = 1;
    // Compare against `x >> 1` rather than doubling past `x` and halving
    // afterwards: the accumulator then never shifts beyond the top bit, so
    // the loop also terminates when `x` has its most significant bit set
    // (doubling past such a value would wrap to 0 and never exit).
    while (largest_pow2_not_above_x <= (x >> 1)) {
        largest_pow2_not_above_x <<= 1;
    }
    return largest_pow2_not_above_x;
};

const size_t sender_channel_max_size_bytes =
(this->available_channel_buffering_space / total_ratio_count) * sender_ratio_size;
const size_t receiver_channel_max_size_bytes =
(this->available_channel_buffering_space / total_ratio_count) * receiver_ratio_size;

const size_t sender_channel_num_buffer_slots_non_pow2 =
sender_channel_max_size_bytes / min_desired_packet_payload_size_bytes_with_header;
const size_t receiver_channel_num_buffer_slots_non_pow2 =
receiver_channel_max_size_bytes / min_desired_packet_payload_size_bytes_with_header;
const size_t sender_channel_num_buffer_slots_pow2 =
round_down_to_power_of_2(sender_channel_num_buffer_slots_non_pow2);
const size_t receiver_channel_num_buffer_slots_pow2 =
round_down_to_power_of_2(receiver_channel_num_buffer_slots_non_pow2);

this->sender_0_num_buffers = sender_channel_num_buffer_slots_pow2;
this->sender_1_num_buffers = sender_channel_num_buffer_slots_pow2;
this->receiver_num_buffers = receiver_channel_num_buffer_slots_pow2;

this->sender_0_channel_size_bytes = sender_channel_max_size_bytes;
this->sender_1_channel_size_bytes = sender_channel_max_size_bytes;
this->receiver_channel_size_bytes = receiver_channel_max_size_bytes;

const size_t sender_0_buffer_slot_size_bytes = tt::round_down(
sender_channel_max_size_bytes / this->sender_0_num_buffers, sizeof(tt::fabric::PacketHeader));
const size_t sender_1_buffer_slot_size_bytes = tt::round_down(
sender_channel_max_size_bytes / this->sender_1_num_buffers, sizeof(tt::fabric::PacketHeader));
const size_t receiver_buffer_slot_size_bytes = tt::round_down(
receiver_channel_max_size_bytes / this->receiver_num_buffers, sizeof(tt::fabric::PacketHeader));

this->channel_buffer_size_bytes = std::min(
{sender_0_buffer_slot_size_bytes, sender_1_buffer_slot_size_bytes, receiver_buffer_slot_size_bytes});
this->sender_0_channel_size_bytes = this->channel_buffer_size_bytes * this->sender_0_num_buffers;
this->sender_1_channel_size_bytes = this->channel_buffer_size_bytes * this->sender_1_num_buffers;
this->receiver_channel_size_bytes = this->channel_buffer_size_bytes * this->receiver_num_buffers;

TT_FATAL(
this->sender_0_num_buffers == this->sender_1_num_buffers,
"Implementation expects sender_0_num_buffers and sender_1_num_buffers to be the same for now");
TT_FATAL(
this->sender_0_channel_size_bytes + this->sender_1_channel_size_bytes + this->receiver_channel_size_bytes <=
this->available_channel_buffering_space,
"Internal error - channel sizes exceed available space");
TT_FATAL(
this->channel_buffer_size_bytes >= min_desired_packet_payload_size_bytes_with_header,
"Error - couldn't produce a channel buffer slot of minimal size {} when setting up "
"FabricEriscDatamoverConfig. This indicates a bug in internal logic",
min_desired_packet_payload_size_bytes_with_header);
} else {
this->receiver_num_buffers = this->receiver_channel_size_bytes / channel_buffer_size_with_channel_sync;
this->channel_buffer_size_bytes = preferred_channel_buffer_size_bytes;
this->sender_0_channel_size_bytes = tt::round_down(
(available_channel_buffering_space / total_ratio_count) * sender_ratio_size,
this->channel_buffer_size_bytes);
this->sender_0_num_buffers = this->sender_0_channel_size_bytes / this->channel_buffer_size_bytes;

this->sender_1_channel_size_bytes = tt::round_down(
(available_channel_buffering_space / total_ratio_count) * sender_ratio_size,
this->channel_buffer_size_bytes);

this->sender_1_num_buffers = this->sender_1_channel_size_bytes / this->channel_buffer_size_bytes;

this->receiver_channel_size_bytes = tt::round_down(
(available_channel_buffering_space / total_ratio_count) * receiver_ratio_size,
this->channel_buffer_size_bytes);
this->receiver_num_buffers = this->receiver_channel_size_bytes / this->channel_buffer_size_bytes;
}

this->sender_0_channel_base_address = buffer_region_start;
Expand All @@ -123,10 +182,37 @@ FabricEriscDatamoverConfig::FabricEriscDatamoverConfig(

static constexpr size_t total_num_channels = 3; // sender0, sender1, receiver
const size_t max_channel_buffer_size = (available_channel_buffering_space / total_num_channels) - FabricEriscDatamoverConfig::eth_channel_sync_size - sizeof(tt::fabric::PacketHeader);
TT_FATAL(channel_buffer_size_bytes <= max_channel_buffer_size, "Specified size of `channel_buffer_size_bytes` was too large. Maximum allowable size is {} B", max_channel_buffer_size);
TT_FATAL(
this->channel_buffer_size_bytes <= max_channel_buffer_size,
"Specified size of `channel_buffer_size_bytes` was too large. Maximum allowable size is {} B",
max_channel_buffer_size);
TT_FATAL(this->sender_0_channel_size_bytes > 0, "Internal error when computing `sender_0_channel_size_bytes` which was computed to be size 0");
TT_FATAL(this->sender_1_channel_size_bytes > 0, "Internal error when computing `sender_1_channel_size_bytes` which was computed to be size 0");
TT_FATAL(this->receiver_channel_size_bytes > 0, "Internal error when computing `receiver_channel_size_bytes` which was computed to be size 0");
TT_FATAL(
this->receiver_channel_size_bytes % sizeof(tt::fabric::PacketHeader) == 0,
"Internal error - receiver_channel_size_bytes was computed to be not a multiple of "
"sizeof(tt::fabric::PacketHeader)");
TT_FATAL(
this->sender_0_channel_size_bytes % sizeof(tt::fabric::PacketHeader) == 0,
"Internal error - sender_0_channel_size_bytes was computed to be not a multiple of "
"sizeof(tt::fabric::PacketHeader)");
TT_FATAL(
this->sender_1_channel_size_bytes % sizeof(tt::fabric::PacketHeader) == 0,
"Internal error - sender_1_channel_size_bytes was computed to be not a multiple of "
"sizeof(tt::fabric::PacketHeader)");
TT_FATAL(
this->sender_0_channel_base_address % sizeof(tt::fabric::PacketHeader) == 0,
"Internal error - sender_0_channel_base_address was computed to be not a multiple of "
"sizeof(tt::fabric::PacketHeader)");
TT_FATAL(
this->sender_1_channel_base_address % sizeof(tt::fabric::PacketHeader) == 0,
"Internal error - sender_1_channel_base_address was computed to be not a multiple of "
"sizeof(tt::fabric::PacketHeader)");
TT_FATAL(
this->receiver_channel_base_address % sizeof(tt::fabric::PacketHeader) == 0,
"Internal error - receiver_channel_base_address was computed to be not a multiple of "
"sizeof(tt::fabric::PacketHeader)");
TT_FATAL(
this->sender_0_channel_size_bytes + this->sender_1_channel_size_bytes + this->receiver_channel_size_bytes <=
this->available_channel_buffering_space, "Internal error when computing channel sizes. Total channel size exceeds available space");
Expand Down
4 changes: 2 additions & 2 deletions ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ struct FabricEriscDatamoverConfig {
std::size_t channel_buffer_size_bytes, std::size_t sender_ratio_size, std::size_t receiver_ratio_size);

std::size_t channel_buffer_size_bytes = 0;
std::size_t channel_buffer_size_bytes_with_channel_sync = 0;
std::size_t sender_0_channel_size_bytes = 0;
std::size_t sender_0_num_buffers = 0;
std::size_t sender_1_channel_size_bytes = 0;
Expand Down Expand Up @@ -183,7 +182,8 @@ class FabricEriscDatamoverBuilder {
public:
static constexpr size_t default_firmware_context_switch_interval = 200000;
// payload only, no header
static constexpr size_t default_packet_payload_size_bytes = 4096;
static constexpr size_t default_packet_payload_size_bytes =
1088 * 4; // 4352 bytes to fit up to 4 bfp8 tiles per packet

FabricEriscDatamoverBuilder(
const CoreCoord& my_eth_core_logical,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ struct WorkerToFabricEdmSenderImpl {
noc_inline_dw_write(edm_connection_handshake_noc_addr, open_connection_value);
noc_async_read_barrier();

this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * (this->buffer_size_bytes + sizeof(eth_channel_sync_t)));
this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * (this->buffer_size_bytes));
ASSERT(*this->buffer_slot_wrptr_ptr < 20);
}

Expand Down Expand Up @@ -301,7 +301,7 @@ struct WorkerToFabricEdmSenderImpl {
*this->buffer_slot_wrptr_ptr =
!(wrptr == ((this->num_buffers_per_channel * 2) - 1)) ? wrptr + 1 : 0;
}
this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * (this->buffer_size_bytes + sizeof(eth_channel_sync_t)));
this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * this->buffer_size_bytes);
}

FORCE_INLINE uint64_t compute_dest_buffer_slot_noc_addr() const {
Expand Down
Loading

0 comments on commit 5a14c57

Please sign in to comment.