Skip to content

Commit

Permalink
Saving work
Browse files Browse the repository at this point in the history
  • Loading branch information
sagarwalTT committed Feb 19, 2025
1 parent d174b31 commit 1110f4b
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 206 deletions.
2 changes: 1 addition & 1 deletion tt_metal/api/tt-metalium/dispatch_settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ class DispatchSettings {
// broken down into equal sized partial pages. UNPADDED_PARTIAL_PAGE_SIZE denotes the unpadded partial page size to
// use. The size of the padded partial page is the smallest value >= UNPADDED_PARTIAL_PAGE_SIZE that is
// PCIE-aligned.
static constexpr uint32_t UNPADDED_PARTIAL_PAGE_SIZE = 3072;
static constexpr uint32_t UNPADDED_PARTIAL_PAGE_SIZE = 3044;

static_assert(
DISPATCH_MESSAGE_ENTRIES <=
Expand Down
55 changes: 34 additions & 21 deletions tt_metal/impl/buffers/dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ struct InterleavedBufferWriteLargePageDispatchParams : InterleavedBufferWriteDis
tt::stl::Span<const uint32_t> expected_num_workers_completed) :
InterleavedBufferWriteDispatchParams(
buffer, dst_page_index, total_pages_to_write, cq_id, expected_num_workers_completed) {
this->page_size_to_write = partial_page_spec.padded_partial_page_size;
this->page_size_to_write = partial_page_spec.unpadded_partial_page_size;
this->data_size_per_page_size_to_write = partial_page_spec.unpadded_partial_page_size;
this->full_pages_to_write = num_full_pages;
this->full_page_size = full_page_size;
Expand Down Expand Up @@ -267,12 +267,12 @@ PartialPageSpec calculate_partial_page_spec(const Buffer& buffer) {
// while (buffer.aligned_page_size() % partial_page.unpadded_partial_page_size != 0) {
// partial_page.unpadded_partial_page_size += 1;
// }
partial_page_spec.padded_partial_page_size = partial_page_spec.unpadded_partial_page_size;
// partial_page_spec.padded_partial_page_size = partial_page_spec.unpadded_partial_page_size;
const uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST);
const uint32_t l1_alignment = hal.get_alignment(HalMemType::L1);
while (partial_page_spec.padded_partial_page_size % pcie_alignment != 0 ||
partial_page_spec.padded_partial_page_size % l1_alignment != 0) {
partial_page_spec.padded_partial_page_size += 1;
while (partial_page_spec.unpadded_partial_page_size % pcie_alignment != 0 ||
partial_page_spec.unpadded_partial_page_size % l1_alignment != 0) {
partial_page_spec.unpadded_partial_page_size += 1;
}
partial_page_spec.num_partial_pages_per_full_page =
tt::div_up(buffer.aligned_page_size(), partial_page_spec.unpadded_partial_page_size);
Expand Down Expand Up @@ -312,10 +312,10 @@ std::unique_ptr<InterleavedBufferWriteDispatchParams> initialize_interleaved_buf
// const uint32_t num_partial_pages_per_full_page = buffer.aligned_page_size() /
// partial_page_spec.unpadded_partial_page_size;
const uint32_t full_page_size =
partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.padded_partial_page_size;
partial_page_spec.num_partial_pages_per_full_page * partial_page_spec.unpadded_partial_page_size;
const uint32_t num_full_pages = total_pages_to_write;
const uint32_t padded_buffer_size = total_pages_to_write * full_page_size;
total_pages_to_write = tt::div_up(padded_buffer_size, partial_page_spec.padded_partial_page_size);
const uint32_t padded_buffer_size = total_pages_to_write * buffer.aligned_page_size();
total_pages_to_write = num_full_pages * partial_page_spec.num_partial_pages_per_full_page;
dispatch_params = std::make_unique<InterleavedBufferWriteLargePageDispatchParams>(
buffer,
dst_page_index,
Expand Down Expand Up @@ -783,23 +783,31 @@ std::unique_ptr<BufferReadDispatchParams> initialize_interleaved_buf_read_dispat
const bool read_large_pages = are_pages_large(buffer);
if (read_large_pages) {
dispatch_params = std::make_unique<BufferReadLargePageDispatchParams>();
BufferReadLargePageDispatchParams* large_page_dispatch_params =
dynamic_cast<BufferReadLargePageDispatchParams*>(dispatch_params.get());
const PartialPageSpec partial_page = calculate_partial_page_spec(buffer);
large_page_dispatch_params->partial_page_spec = partial_page;
large_page_dispatch_params->padded_page_size =
partial_page.num_partial_pages_per_full_page * partial_page.padded_partial_page_size;
} else {
dispatch_params = std::make_unique<BufferReadDispatchParams>();
dispatch_params->padded_page_size = buffer.aligned_page_size();
}
dispatch_params->pages_per_txn = region.size / buffer.page_size();

dispatch_params->total_pages_to_read = region.size / buffer.page_size();
dispatch_params->src_page_index = region.offset / buffer.page_size();
dispatch_params->cq_id = cq_id;
dispatch_params->device = buffer.device();
dispatch_params->address = buffer.address();
dispatch_params->unpadded_dst_offset = 0;
dispatch_params->expected_num_workers_completed = expected_num_workers_completed;
dispatch_params->num_banks = buffer.device()->allocator()->get_num_banks(buffer.buffer_type());

if (read_large_pages) {
BufferReadLargePageDispatchParams* large_page_dispatch_params =
dynamic_cast<BufferReadLargePageDispatchParams*>(dispatch_params.get());
const PartialPageSpec partial_page_spec = calculate_partial_page_spec(buffer);
large_page_dispatch_params->partial_page_spec = partial_page_spec;
dispatch_params->padded_page_size =
partial_page_spec.unpadded_partial_page_size * partial_page_spec.num_partial_pages_per_full_page;
// large_page_dispatch_params->update_params_to_be_within_bounds(buffer);
} else {
dispatch_params->padded_page_size = buffer.aligned_page_size();
}

return dispatch_params;
}

Expand Down Expand Up @@ -939,14 +947,16 @@ void copy_interleaved_buffer_to_completion_queue(
Buffer& buffer,
tt::stl::Span<const SubDeviceId> sub_device_ids,
CoreType dispatch_core_type) {
if (dispatch_params.pages_per_txn > 0) {
if (dispatch_params.total_pages_to_read > 0) {
// Only 8 bits are assigned for the page offset in CQPrefetchRelayPagedCmd
// To handle larger page offsets move bank base address up and update page offset to be relative to the new
// bank address
if (dispatch_params.src_page_index > CQ_PREFETCH_RELAY_PAGED_START_PAGE_MASK) {
dispatch_params.update_params_to_be_within_bounds(buffer);
}
dispatch_params.calculate_num_pages_for_read_transaction();
issue_read_buffer_dispatch_command_sequence(buffer, dispatch_params, sub_device_ids, dispatch_core_type);
dispatch_params.update_params_after_read_transaction();
}
}

Expand All @@ -964,7 +974,7 @@ std::shared_ptr<tt::tt_metal::CompletionReaderVariant> generate_sharded_buffer_r
dispatch_params.padded_page_size,
dst,
dispatch_params.unpadded_dst_offset,
dispatch_params.pages_per_txn,
dispatch_params.total_pages_read,
initial_src_page_index,
dispatch_params.starting_src_host_page_index,
dispatch_params.buffer_page_mapping);
Expand All @@ -983,11 +993,11 @@ std::shared_ptr<tt::tt_metal::CompletionReaderVariant> generate_interleaved_buff
dispatch_params->padded_page_size,
dst,
dispatch_params->unpadded_dst_offset,
dispatch_params->pages_per_txn,
dispatch_params->total_pages_read,
dispatch_params->src_page_index,
0,
nullptr,
partial_page_spec);
nullptr);
}

void copy_completion_queue_data_into_user_space(
Expand Down Expand Up @@ -1046,6 +1056,9 @@ void copy_completion_queue_data_into_user_space(
if (buffer_page_mapping == nullptr) {
void* contiguous_dst = (void*)(uint64_t(dst) + contig_dst_offset);
if (page_size == padded_page_size) {
// if (!partial_page_spec) {

// }
uint32_t data_bytes_xfered = bytes_xfered - offset_in_completion_q_data;
tt::Cluster::instance().read_sysmem(
contiguous_dst,
Expand All @@ -1072,7 +1085,7 @@ void copy_completion_queue_data_into_user_space(
}

const uint32_t padded_page_size_to_read =
partial_page_spec ? partial_page_spec->padded_partial_page_size : padded_page_size;
partial_page_spec ? partial_page_spec->unpadded_partial_page_size : padded_page_size;

pad_size_bytes = partial_page_spec ? padded_page_size_to_read - page_size_to_read : pad_size_bytes;
// if (partial_page_spec &&
Expand Down
43 changes: 37 additions & 6 deletions tt_metal/impl/buffers/dispatch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,34 +28,65 @@ struct BufferReadDispatchParams {
uint32_t unpadded_dst_offset = 0;
uint32_t pages_per_txn = 0;
uint32_t address = 0;
uint32_t total_pages_to_read = 0;
uint32_t total_pages_read = 0;
uint32_t num_banks = 0;

virtual ~BufferReadDispatchParams() = default;

void update_params_to_be_within_bounds(const Buffer& buffer) {
const uint32_t num_banks = this->device->allocator()->get_num_banks(buffer.buffer_type());
const uint32_t num_pages_per_bank = this->src_page_index / num_banks;
virtual void update_params_to_be_within_bounds(const Buffer& buffer) {
const uint32_t num_pages_per_bank = this->src_page_index / this->num_banks;
this->address += num_pages_per_bank * this->padded_page_size;
this->src_page_index = this->src_page_index % num_banks;
this->src_page_index = this->src_page_index % this->num_banks;
}

virtual void calculate_num_pages_for_read_transaction() { this->pages_per_txn = this->total_pages_to_read; }

virtual void update_params_after_read_transaction() {
this->total_pages_to_read -= this->pages_per_txn;
this->total_pages_read += this->pages_per_txn;
this->src_page_index += this->pages_per_txn;
}
};

struct PartialPageSpec {
uint32_t unpadded_partial_page_size = 0;
uint32_t padded_partial_page_size = 0;
// uint32_t padded_partial_page_size = 0;
uint32_t last_partial_page_additional_padding = 0;
uint32_t num_partial_pages_per_full_page = 0;
};

struct BufferReadLargePageDispatchParams : BufferReadDispatchParams {
PartialPageSpec partial_page_spec;

void update_params_to_be_within_bounds(const Buffer& buffer) override {
const uint32_t num_pages_per_bank = this->src_page_index / this->num_banks;
this->address += num_pages_per_bank * (this->partial_page_spec.num_partial_pages_per_full_page *
this->partial_page_spec.unpadded_partial_page_size);
this->src_page_index = this->src_page_index % this->num_banks;
}

// void calculate_num_pages_for_read_transaction() override {
// this->pages_per_txn =
// std::min(this->total_pages_to_read, this->num_banks - (this->src_page_index % this->num_banks));
// }

void update_params_after_read_transaction() override {
this->total_pages_to_read -= this->pages_per_txn;
this->total_pages_read += this->pages_per_txn;
this->address += ((this->src_page_index + this->pages_per_txn) / this->num_banks) *
(this->partial_page_spec.num_partial_pages_per_full_page *
this->partial_page_spec.unpadded_partial_page_size);
this->src_page_index = (this->src_page_index + this->pages_per_txn) % this->num_banks;
}
};

struct ShardedBufferReadDispatchParams : BufferReadDispatchParams {
bool width_split = false;
uint32_t initial_pages_skipped = 0;
uint32_t starting_src_host_page_index = 0;
std::shared_ptr<const BufferPageMapping> buffer_page_mapping = nullptr;
uint32_t total_pages_to_read = 0;
// uint32_t total_pages_to_read = 0;
uint32_t total_pages_read = 0;
uint32_t max_pages_per_shard = 0;
CoreCoord core;
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/third_party/umd
Submodule umd updated 47 files
+3 −3 .github/workflows/build-tests.yml
+6 −9 cmake/CPM.cmake
+13 −16 cmake/dependencies.cmake
+1 −0 device/CMakeLists.txt
+56 −0 device/api/umd/device/blackhole_arc_telemetry_reader.h
+3 −0 device/api/umd/device/blackhole_implementation.h
+12 −13 device/api/umd/device/cluster.h
+4 −0 device/api/umd/device/tt_device/blackhole_tt_device.h
+2 −0 device/api/umd/device/tt_device/grayskull_tt_device.h
+4 −7 device/api/umd/device/tt_device/tt_device.h
+2 −0 device/api/umd/device/tt_device/wormhole_tt_device.h
+2 −0 device/api/umd/device/tt_simulation_device.h
+4 −3 device/api/umd/device/tt_soc_descriptor.h
+56 −0 device/api/umd/device/types/blackhole_telemetry.h
+31 −0 device/api/umd/device/types/cluster_descriptor_types.h
+93 −0 device/blackhole/blackhole_arc_telemetry_reader.cpp
+3 −3 device/blackhole/blackhole_coordinate_manager.cpp
+3 −1 device/blackhole/blackhole_implementation.cpp
+13 −12 device/cluster.cpp
+35 −25 device/coordinate_manager.cpp
+5 −5 device/grayskull/grayskull_coordinate_manager.cpp
+5 −1 device/mockup/tt_mockup_device.hpp
+5 −1 device/simulation/tt_simulation_device.cpp
+32 −1 device/tt_device/blackhole_tt_device.cpp
+5 −0 device/tt_device/grayskull_tt_device.cpp
+1 −18 device/tt_device/tt_device.cpp
+5 −0 device/tt_device/wormhole_tt_device.cpp
+3 −1 device/tt_soc_descriptor.cpp
+249 −0 docs/coordinate_systems.md
+ docs/images/tensix_grid.png
+ docs/images/tensix_harvested_rows.png
+ docs/images/tensix_logical_coordinates.png
+ docs/images/tensix_logical_coordinates_harvested.png
+ docs/images/tensix_phyiscal_coordinates_harvested.png
+ docs/images/tensix_physical_coordinates.png
+ docs/images/tensix_translated_coordinates.png
+ docs/images/tensix_translated_coordinates_harvested.png
+ docs/images/tensix_virtual_coordinates_harvested.png
+0 −1 tests/api/test_cluster.cpp
+2 −0 tests/blackhole/CMakeLists.txt
+23 −0 tests/blackhole/test_arc_telemetry_bh.cpp
+0 −82 tests/blackhole/test_bh_common.h
+21 −0 tests/blackhole/test_chip_info_bh.cpp
+5 −3 tests/blackhole/test_cluster_bh.cpp
+1 −15 tests/microbenchmark/device_fixture.hpp
+25 −46 tests/microbenchmark/test_rw_tensix.cpp
+5 −3 tests/wormhole/test_cluster_wh.cpp
Loading

0 comments on commit 1110f4b

Please sign in to comment.