Skip to content

Commit

Permalink
Add TT-Mesh tests to N300 post commit
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-asaigal committed Feb 16, 2025
1 parent 52c53d5 commit 7ee3c89
Show file tree
Hide file tree
Showing 8 changed files with 223 additions and 147 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/cpp-post-commit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,9 @@ jobs:
{name: eth, cmd: "./build/test/tt_metal/unit_tests_eth_${{ inputs.arch }}"},
{name: llk, cmd: "./build/test/tt_metal/unit_tests_llk"},
{name: stl, cmd: "./build/test/tt_metal/unit_tests_stl"},
{name: distributed, cmd: "./build/test/tt_metal/distributed/distributed_unit_tests_${{ inputs.arch }} --gtest_filter=MeshDeviceSuite.*"},

{name: distributed, cmd: "./build/test/tt_metal/distributed/distributed_unit_tests_${{ inputs.arch }}"},
{name: lightmetal, cmd: "./build/test/tt_metal/unit_tests_lightmetal"},
{name: dispatch multicmd queue, cmd: "TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch_${{ inputs.arch }} --gtest_filter=MultiCommandQueue*Fixture.*"},

{name: ttnn cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn},
{name: ttnn ccl cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_ccl},
{name: ttnn tensor cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_tensor},
Expand Down
2 changes: 1 addition & 1 deletion tests/tt_metal/distributed/test_distributed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
namespace tt::tt_metal::distributed::test {
namespace {

TEST_F(T3000MultiDeviceFixture, SimpleMeshDeviceTest) {
TEST_F(T3000MeshDeviceFixture, SimpleMeshDeviceTest) {
EXPECT_EQ(mesh_device_->num_devices(), 8);
EXPECT_EQ(mesh_device_->num_rows(), 2);
EXPECT_EQ(mesh_device_->num_cols(), 4);
Expand Down
2 changes: 1 addition & 1 deletion tests/tt_metal/distributed/test_mesh_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

namespace tt::tt_metal::distributed::test {

using MeshAllocatorTest = T3000MultiDeviceFixture;
using MeshAllocatorTest = T3000MeshDeviceFixture;

TEST_F(MeshAllocatorTest, BasicAllocationSanityCheck) {
const size_t allocation_size = 1024 * 8; // 1KB
Expand Down
163 changes: 83 additions & 80 deletions tests/tt_metal/distributed/test_mesh_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
namespace tt::tt_metal::distributed::test {
namespace {

using MeshBufferTest = T3000MultiDeviceFixture;
using MeshBufferTestT3000 = T3000MeshDeviceFixture;
using MeshBufferTestSuite = GenericMeshDeviceFixture;

struct DeviceLocalShardedBufferTestConfig {
Shape2D num_pages_per_core;
Expand Down Expand Up @@ -47,36 +48,8 @@ struct DeviceLocalShardedBufferTestConfig {
}
};

// Validates MeshBuffer::create() input checking for sharded buffer configs:
// two malformed shard shapes must be rejected, and a well-formed one-shard-
// per-device config must succeed.
TEST_F(MeshBufferTest, ConfigValidation) {
    // Plain interleaved DRAM layout; the per-device settings are not what is
    // under test here.
    const DeviceLocalBufferConfig device_local_config{
        .page_size = 1024,
        .buffer_type = BufferType::DRAM,
        .buffer_layout = TensorMemoryLayout::INTERLEAVED,
        .bottom_up = false};

    // The shard-count arithmetic below assumes exactly a 2x4 device mesh.
    ASSERT_EQ(mesh_device_->num_rows(), 2);
    ASSERT_EQ(mesh_device_->num_cols(), 4);

    // Unaligned shard shape (32x120 does not evenly tile the 64x128 global
    // shape): expected to throw.
    EXPECT_ANY_THROW(MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {32, 120}},
        device_local_config,
        mesh_device_.get()));

    // Number of shards exceeds the number of devices: 16x16 shards over a
    // 64x128 global shape give 4*8 = 32 shards, more than the 8 mesh devices.
    EXPECT_ANY_THROW(MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {16, 16}},
        device_local_config,
        mesh_device_.get()));

    // Valid config: 32x32 shards distributed across the 2x4 mesh, covering the
    // 64x128 global shape exactly (one shard per device).
    auto buffer = MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {32, 32}},
        device_local_config,
        mesh_device_.get());
}

TEST_F(MeshBufferTest, ShardedBufferInitialization) {
// MeshBuffer tests on T3000
TEST_F(MeshBufferTestT3000, ShardedBufferInitialization) {
const DeviceLocalBufferConfig device_local_config{
.page_size = 1024,
.buffer_type = BufferType::DRAM,
Expand All @@ -93,7 +66,7 @@ TEST_F(MeshBufferTest, ShardedBufferInitialization) {
EXPECT_EQ(sharded_buffer->device_local_size(), 2 << 10);
}

TEST_F(MeshBufferTest, ReplicatedBufferInitialization) {
TEST_F(MeshBufferTestT3000, ReplicatedBufferInitialization) {
const DeviceLocalBufferConfig device_local_config{
.page_size = 1024,
.buffer_type = BufferType::DRAM,
Expand All @@ -108,7 +81,7 @@ TEST_F(MeshBufferTest, ReplicatedBufferInitialization) {
EXPECT_EQ(replicated_buffer->device_local_size(), 16 << 10);
}

TEST_F(MeshBufferTest, Deallocation) {
TEST_F(MeshBufferTestT3000, Deallocation) {
// Verify that a buffer is deallocated on the MeshDevice when it goes
// out of scope on host. Create a buffer with a certain config in limited
// scope. Record its address. Create another buffer with the same config
Expand Down Expand Up @@ -149,7 +122,7 @@ TEST_F(MeshBufferTest, Deallocation) {
EXPECT_FALSE(buffer_view->is_allocated());
}

TEST_F(MeshBufferTest, GetDeviceBuffer) {
TEST_F(MeshBufferTestT3000, GetDeviceBuffer) {
const DeviceLocalBufferConfig device_local_config{
.page_size = 1024,
.buffer_type = BufferType::DRAM,
Expand All @@ -165,50 +138,8 @@ TEST_F(MeshBufferTest, GetDeviceBuffer) {
EXPECT_NO_THROW(replicated_buffer->get_device_buffer(Coordinate{1, 3}));
}

TEST_F(MeshBufferTest, InterleavedShardsReadWrite) {
constexpr uint32_t NUM_ITERS = 100;
uint32_t seed = tt::parse_env("TT_METAL_SEED", 0);
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

for (auto buffer_type : {BufferType::L1, BufferType::DRAM}) {
DeviceLocalBufferConfig per_device_buffer_config{
.page_size = single_tile_size,
.buffer_type = BufferType::L1,
.buffer_layout = TensorMemoryLayout::INTERLEAVED,
.bottom_up = false};

std::uniform_int_distribution<int> gen_num_tiles(1, 1024);
std::mt19937 rng(seed);
for (int i = 0; i < NUM_ITERS; i++) {
uint32_t num_random_tiles = gen_num_tiles(rng);
ReplicatedBufferConfig global_buffer_config = {
.size = num_random_tiles * single_tile_size,
};

std::shared_ptr<MeshBuffer> buf =
MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get());

std::vector<uint32_t> src_vec(num_random_tiles * single_tile_size / sizeof(uint32_t), 0);
std::iota(src_vec.begin(), src_vec.end(), i);
for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) {
for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) {
WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, Coordinate(logical_y, logical_x));
}
}

for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) {
for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) {
std::vector<uint32_t> dst_vec = {};
ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, Coordinate(logical_y, logical_x));
EXPECT_EQ(dst_vec, src_vec);
}
}
}
}
}

class DeviceLocalMeshBufferShardingTest
: public MeshBufferTest,
: public MeshBufferTestT3000,
public testing::WithParamInterface<
std::tuple<std::array<uint32_t, 2>, std::array<uint32_t, 2>, TensorMemoryLayout>> {};

Expand Down Expand Up @@ -274,7 +205,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(
TensorMemoryLayout::HEIGHT_SHARDED, TensorMemoryLayout::WIDTH_SHARDED, TensorMemoryLayout::BLOCK_SHARDED)));

TEST_F(MeshBufferTest, SweepShardAndConcat) {
TEST_F(MeshBufferTestT3000, SweepShardAndConcat) {
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

DeviceLocalBufferConfig per_device_buffer_config{
Expand Down Expand Up @@ -312,7 +243,79 @@ TEST_F(MeshBufferTest, SweepShardAndConcat) {
}
}

TEST_F(MeshBufferTest, RowMajorShardingAndReplication) {
// MeshBuffer tests on N300 and T3000
// Exercises MeshBuffer::create() config validation on any supported mesh:
// two malformed sharded configs must throw, and a shard shape derived from
// the mesh dimensions must be accepted.
TEST_F(MeshBufferTestSuite, ConfigValidation) {
    // Interleaved DRAM layout shared by every creation attempt below.
    const DeviceLocalBufferConfig dram_interleaved_config{
        .page_size = 1024,
        .buffer_type = BufferType::DRAM,
        .buffer_layout = TensorMemoryLayout::INTERLEAVED,
        .bottom_up = false};

    // More shards than devices in the mesh: must be rejected.
    EXPECT_ANY_THROW(MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {16, 16}},
        dram_interleaved_config,
        mesh_device_.get()));

    // Shard shape that does not align with the global buffer shape: must be
    // rejected as well.
    EXPECT_ANY_THROW(MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {32, 120}},
        dram_interleaved_config,
        mesh_device_.get()));

    // Well-formed config: the 64x128 global buffer is split evenly across the
    // mesh, one shard per device (e.g. a 2x4 T3000 or a 2x1 N300 mesh).
    const auto valid_buffer = MeshBuffer::create(
        ShardedBufferConfig{
            .global_size = 16 << 10,
            .global_buffer_shape = {64, 128},
            .shard_shape = {64 / mesh_device_->num_rows(), 128 / mesh_device_->num_cols()}},
        dram_interleaved_config,
        mesh_device_.get());
}

// Replicated-buffer read/write smoke test, run for both L1 and DRAM:
// randomly sized replicated buffers are written to every device in the mesh
// and read back shard-by-shard, expecting an exact round trip.
TEST_F(MeshBufferTestSuite, InterleavedShardsReadWrite) {
    constexpr uint32_t NUM_ITERS = 100;
    // Seed is overridable via the environment for reproducible failures.
    uint32_t seed = tt::parse_env("TT_METAL_SEED", 0);
    uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

    for (auto buffer_type : {BufferType::L1, BufferType::DRAM}) {
        DeviceLocalBufferConfig per_device_buffer_config{
            .page_size = single_tile_size,
            // Bug fix: the loop variable was previously ignored and L1 was
            // hard-coded here, so the DRAM iteration silently re-ran the L1
            // case and DRAM buffers were never exercised.
            .buffer_type = buffer_type,
            .buffer_layout = TensorMemoryLayout::INTERLEAVED,
            .bottom_up = false};

        // Re-seed per buffer type so both L1 and DRAM see the same sequence
        // of random buffer sizes.
        std::uniform_int_distribution<int> gen_num_tiles(1, 1024);
        std::mt19937 rng(seed);
        for (int i = 0; i < NUM_ITERS; i++) {
            uint32_t num_random_tiles = gen_num_tiles(rng);
            ReplicatedBufferConfig global_buffer_config = {
                .size = num_random_tiles * single_tile_size,
            };

            std::shared_ptr<MeshBuffer> buf =
                MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get());

            // Per-iteration payload: ramp starting at i, so successive
            // iterations (and stale reads) are distinguishable.
            std::vector<uint32_t> src_vec(num_random_tiles * single_tile_size / sizeof(uint32_t), 0);
            std::iota(src_vec.begin(), src_vec.end(), i);
            for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) {
                for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) {
                    WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, Coordinate(logical_y, logical_x));
                }
            }

            // Read every shard back and require an exact match with the source.
            for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) {
                for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) {
                    std::vector<uint32_t> dst_vec;
                    ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, Coordinate(logical_y, logical_x));
                    EXPECT_EQ(dst_vec, src_vec);
                }
            }
        }
    }
}

TEST_F(MeshBufferTestSuite, RowMajorShardingAndReplication) {
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

DeviceLocalBufferConfig per_device_buffer_config{
Expand Down Expand Up @@ -366,7 +369,7 @@ TEST_F(MeshBufferTest, RowMajorShardingAndReplication) {
}
}

TEST_F(MeshBufferTest, ColMajorShardingAndReplication) {
TEST_F(MeshBufferTestSuite, ColMajorShardingAndReplication) {
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

DeviceLocalBufferConfig per_device_buffer_config{
Expand Down
19 changes: 10 additions & 9 deletions tests/tt_metal/distributed/test_mesh_events.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
namespace tt::tt_metal::distributed::test {
namespace {

using MeshEventsTest = T3000MultiCQMultiDeviceFixture;
using MeshEventsTestT3000 = T3000MultiCQMeshDeviceFixture;
using MeshEventsTestSuite = GenericMultiCQMeshDeviceFixture;

TEST_F(MeshEventsTest, ReplicatedAsyncIO) {
TEST_F(MeshEventsTestSuite, ReplicatedAsyncIO) {
uint32_t NUM_TILES = 1000;
uint32_t num_iterations = 20;
int32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);
Expand Down Expand Up @@ -61,7 +62,7 @@ TEST_F(MeshEventsTest, ReplicatedAsyncIO) {
}
}

TEST_F(MeshEventsTest, ShardedAsyncIO) {
TEST_F(MeshEventsTestT3000, ShardedAsyncIO) {
uint32_t num_iterations = 20;
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

Expand Down Expand Up @@ -108,7 +109,7 @@ TEST_F(MeshEventsTest, ShardedAsyncIO) {
}
}

TEST_F(MeshEventsTest, AsyncWorkloadAndIO) {
TEST_F(MeshEventsTestSuite, AsyncWorkloadAndIO) {
uint32_t num_iters = 5;
std::vector<std::shared_ptr<MeshBuffer>> src0_bufs = {};
std::vector<std::shared_ptr<MeshBuffer>> src1_bufs = {};
Expand All @@ -119,8 +120,8 @@ TEST_F(MeshEventsTest, AsyncWorkloadAndIO) {
auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs(
mesh_device_, src0_bufs, src1_bufs, output_bufs);
auto mesh_workload = CreateMeshWorkload();
LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0});
LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1});
LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0});
LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1});

AddProgramToMeshWorkload(mesh_workload, *programs[0], devices_0);
AddProgramToMeshWorkload(mesh_workload, *programs[1], devices_1);
Expand Down Expand Up @@ -189,7 +190,7 @@ TEST_F(MeshEventsTest, AsyncWorkloadAndIO) {
}
}

TEST_F(MeshEventsTest, CustomDeviceRanges) {
TEST_F(MeshEventsTestSuite, CustomDeviceRanges) {
uint32_t NUM_TILES = 1000;
uint32_t num_iterations = 20;
int32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);
Expand All @@ -209,8 +210,8 @@ TEST_F(MeshEventsTest, CustomDeviceRanges) {
for (std::size_t i = 0; i < num_iterations; i++) {
std::vector<uint32_t> src_vec(NUM_TILES * single_tile_size / sizeof(uint32_t), i);
std::iota(src_vec.begin(), src_vec.end(), i);
LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0});
LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1});
LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0});
LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1});

std::vector<std::vector<uint32_t>> readback_vecs = {};
std::shared_ptr<MeshEvent> event_0 = std::make_shared<MeshEvent>();
Expand Down
8 changes: 4 additions & 4 deletions tests/tt_metal/distributed/test_mesh_sub_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
namespace tt::tt_metal::distributed::test {
namespace {

using MeshSubDeviceTest = T3000MultiDeviceFixture;
using MeshSubDeviceTestSuite = GenericMeshDeviceFixture;

TEST_F(MeshSubDeviceTest, SyncWorkloadsOnSubDevice) {
TEST_F(MeshSubDeviceTestSuite, SyncWorkloadsOnSubDevice) {
SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))});
SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})});

Expand Down Expand Up @@ -43,7 +43,7 @@ TEST_F(MeshSubDeviceTest, SyncWorkloadsOnSubDevice) {
Finish(mesh_device_->mesh_command_queue());
}

TEST_F(MeshSubDeviceTest, DataCopyOnSubDevices) {
TEST_F(MeshSubDeviceTestSuite, DataCopyOnSubDevices) {
SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {0, 0}))});
SubDevice sub_device_2(std::array{CoreRangeSet(CoreRange({1, 1}, {1, 1}))});
SubDevice sub_device_3(std::array{CoreRangeSet(CoreRange({2, 2}, {2, 2}))});
Expand Down Expand Up @@ -136,7 +136,7 @@ TEST_F(MeshSubDeviceTest, DataCopyOnSubDevices) {
}
}

TEST_F(MeshSubDeviceTest, SubDeviceSwitching) {
TEST_F(MeshSubDeviceTestSuite, SubDeviceSwitching) {
// Sub Devices for config 0
SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))});
SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})});
Expand Down
Loading

0 comments on commit 7ee3c89

Please sign in to comment.