Skip to content

Commit

Permalink
Add TT-Mesh tests to N300 post commit
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-asaigal committed Feb 16, 2025
1 parent 52c53d5 commit 7ee3c89
Show file tree
Hide file tree
Showing 8 changed files with 223 additions and 147 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/cpp-post-commit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,9 @@ jobs:
{name: eth, cmd: "./build/test/tt_metal/unit_tests_eth_${{ inputs.arch }}"},
{name: llk, cmd: "./build/test/tt_metal/unit_tests_llk"},
{name: stl, cmd: "./build/test/tt_metal/unit_tests_stl"},
{name: distributed, cmd: "./build/test/tt_metal/distributed/distributed_unit_tests_${{ inputs.arch }} --gtest_filter=MeshDeviceSuite.*"},

{name: distributed, cmd: "./build/test/tt_metal/distributed/distributed_unit_tests_${{ inputs.arch }}"},
{name: lightmetal, cmd: "./build/test/tt_metal/unit_tests_lightmetal"},
{name: dispatch multicmd queue, cmd: "TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_dispatch_${{ inputs.arch }} --gtest_filter=MultiCommandQueue*Fixture.*"},

{name: ttnn cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn},
{name: ttnn ccl cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_ccl},
{name: ttnn tensor cpp unit tests, cmd: ./build/test/ttnn/unit_tests_ttnn_tensor},
Expand Down
2 changes: 1 addition & 1 deletion tests/tt_metal/distributed/test_distributed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
namespace tt::tt_metal::distributed::test {
namespace {

TEST_F(T3000MultiDeviceFixture, SimpleMeshDeviceTest) {
TEST_F(T3000MeshDeviceFixture, SimpleMeshDeviceTest) {
EXPECT_EQ(mesh_device_->num_devices(), 8);
EXPECT_EQ(mesh_device_->num_rows(), 2);
EXPECT_EQ(mesh_device_->num_cols(), 4);
Expand Down
2 changes: 1 addition & 1 deletion tests/tt_metal/distributed/test_mesh_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

namespace tt::tt_metal::distributed::test {

using MeshAllocatorTest = T3000MultiDeviceFixture;
using MeshAllocatorTest = T3000MeshDeviceFixture;

TEST_F(MeshAllocatorTest, BasicAllocationSanityCheck) {
const size_t allocation_size = 1024 * 8; // 1KB
Expand Down
163 changes: 83 additions & 80 deletions tests/tt_metal/distributed/test_mesh_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
namespace tt::tt_metal::distributed::test {
namespace {

using MeshBufferTest = T3000MultiDeviceFixture;
using MeshBufferTestT3000 = T3000MeshDeviceFixture;
using MeshBufferTestSuite = GenericMeshDeviceFixture;

struct DeviceLocalShardedBufferTestConfig {
Shape2D num_pages_per_core;
Expand Down Expand Up @@ -47,36 +48,8 @@ struct DeviceLocalShardedBufferTestConfig {
}
};

// Validates MeshBuffer::create() input checking for sharded buffer configs:
// two malformed shard shapes must be rejected, and a well-formed one-shard-
// per-device config must succeed.
TEST_F(MeshBufferTest, ConfigValidation) {
    // Plain interleaved DRAM layout; the per-device settings are not what is
    // under test here.
    const DeviceLocalBufferConfig device_local_config{
        .page_size = 1024,
        .buffer_type = BufferType::DRAM,
        .buffer_layout = TensorMemoryLayout::INTERLEAVED,
        .bottom_up = false};

    // The shard-count arithmetic below assumes exactly a 2x4 device mesh.
    ASSERT_EQ(mesh_device_->num_rows(), 2);
    ASSERT_EQ(mesh_device_->num_cols(), 4);

    // Unaligned shard shape (32x120 does not evenly tile the 64x128 global
    // shape): expected to throw.
    EXPECT_ANY_THROW(MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {32, 120}},
        device_local_config,
        mesh_device_.get()));

    // Number of shards exceeds the number of devices: 16x16 shards over a
    // 64x128 global shape give 4*8 = 32 shards, more than the 8 mesh devices.
    EXPECT_ANY_THROW(MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {16, 16}},
        device_local_config,
        mesh_device_.get()));

    // Valid config: 32x32 shards distributed across the 2x4 mesh, covering the
    // 64x128 global shape exactly (one shard per device).
    auto buffer = MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {32, 32}},
        device_local_config,
        mesh_device_.get());
}

TEST_F(MeshBufferTest, ShardedBufferInitialization) {
// MeshBuffer tests on T3000
TEST_F(MeshBufferTestT3000, ShardedBufferInitialization) {
const DeviceLocalBufferConfig device_local_config{
.page_size = 1024,
.buffer_type = BufferType::DRAM,
Expand All @@ -93,7 +66,7 @@ TEST_F(MeshBufferTest, ShardedBufferInitialization) {
EXPECT_EQ(sharded_buffer->device_local_size(), 2 << 10);
}

TEST_F(MeshBufferTest, ReplicatedBufferInitialization) {
TEST_F(MeshBufferTestT3000, ReplicatedBufferInitialization) {
const DeviceLocalBufferConfig device_local_config{
.page_size = 1024,
.buffer_type = BufferType::DRAM,
Expand All @@ -108,7 +81,7 @@ TEST_F(MeshBufferTest, ReplicatedBufferInitialization) {
EXPECT_EQ(replicated_buffer->device_local_size(), 16 << 10);
}

TEST_F(MeshBufferTest, Deallocation) {
TEST_F(MeshBufferTestT3000, Deallocation) {
// Verify that a buffer is deallocated on the MeshDevice when it goes
// out of scope on host. Create a buffer with a certain config in limited
// scope. Record its address. Create another buffer with the same config
Expand Down Expand Up @@ -149,7 +122,7 @@ TEST_F(MeshBufferTest, Deallocation) {
EXPECT_FALSE(buffer_view->is_allocated());
}

TEST_F(MeshBufferTest, GetDeviceBuffer) {
TEST_F(MeshBufferTestT3000, GetDeviceBuffer) {
const DeviceLocalBufferConfig device_local_config{
.page_size = 1024,
.buffer_type = BufferType::DRAM,
Expand All @@ -165,50 +138,8 @@ TEST_F(MeshBufferTest, GetDeviceBuffer) {
EXPECT_NO_THROW(replicated_buffer->get_device_buffer(Coordinate{1, 3}));
}

TEST_F(MeshBufferTest, InterleavedShardsReadWrite) {
constexpr uint32_t NUM_ITERS = 100;
uint32_t seed = tt::parse_env("TT_METAL_SEED", 0);
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

for (auto buffer_type : {BufferType::L1, BufferType::DRAM}) {
DeviceLocalBufferConfig per_device_buffer_config{
.page_size = single_tile_size,
.buffer_type = BufferType::L1,
.buffer_layout = TensorMemoryLayout::INTERLEAVED,
.bottom_up = false};

std::uniform_int_distribution<int> gen_num_tiles(1, 1024);
std::mt19937 rng(seed);
for (int i = 0; i < NUM_ITERS; i++) {
uint32_t num_random_tiles = gen_num_tiles(rng);
ReplicatedBufferConfig global_buffer_config = {
.size = num_random_tiles * single_tile_size,
};

std::shared_ptr<MeshBuffer> buf =
MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get());

std::vector<uint32_t> src_vec(num_random_tiles * single_tile_size / sizeof(uint32_t), 0);
std::iota(src_vec.begin(), src_vec.end(), i);
for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) {
for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) {
WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, Coordinate(logical_y, logical_x));
}
}

for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) {
for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) {
std::vector<uint32_t> dst_vec = {};
ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, Coordinate(logical_y, logical_x));
EXPECT_EQ(dst_vec, src_vec);
}
}
}
}
}

class DeviceLocalMeshBufferShardingTest
: public MeshBufferTest,
: public MeshBufferTestT3000,
public testing::WithParamInterface<
std::tuple<std::array<uint32_t, 2>, std::array<uint32_t, 2>, TensorMemoryLayout>> {};

Expand Down Expand Up @@ -274,7 +205,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(
TensorMemoryLayout::HEIGHT_SHARDED, TensorMemoryLayout::WIDTH_SHARDED, TensorMemoryLayout::BLOCK_SHARDED)));

TEST_F(MeshBufferTest, SweepShardAndConcat) {
TEST_F(MeshBufferTestT3000, SweepShardAndConcat) {
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

DeviceLocalBufferConfig per_device_buffer_config{
Expand Down Expand Up @@ -312,7 +243,79 @@ TEST_F(MeshBufferTest, SweepShardAndConcat) {
}
}

TEST_F(MeshBufferTest, RowMajorShardingAndReplication) {
// MeshBuffer tests on N300 and T3000
// Exercises MeshBuffer::create() config validation on any supported mesh:
// two malformed sharded configs must throw, and a shard shape derived from
// the mesh dimensions must be accepted.
TEST_F(MeshBufferTestSuite, ConfigValidation) {
    // Interleaved DRAM layout shared by every creation attempt below.
    const DeviceLocalBufferConfig dram_interleaved_config{
        .page_size = 1024,
        .buffer_type = BufferType::DRAM,
        .buffer_layout = TensorMemoryLayout::INTERLEAVED,
        .bottom_up = false};

    // More shards than devices in the mesh: must be rejected.
    EXPECT_ANY_THROW(MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {16, 16}},
        dram_interleaved_config,
        mesh_device_.get()));

    // Shard shape that does not align with the global buffer shape: must be
    // rejected as well.
    EXPECT_ANY_THROW(MeshBuffer::create(
        ShardedBufferConfig{.global_size = 16 << 10, .global_buffer_shape = {64, 128}, .shard_shape = {32, 120}},
        dram_interleaved_config,
        mesh_device_.get()));

    // Well-formed config: the 64x128 global buffer is split evenly across the
    // mesh, one shard per device (e.g. a 2x4 T3000 or a 2x1 N300 mesh).
    const auto valid_buffer = MeshBuffer::create(
        ShardedBufferConfig{
            .global_size = 16 << 10,
            .global_buffer_shape = {64, 128},
            .shard_shape = {64 / mesh_device_->num_rows(), 128 / mesh_device_->num_cols()}},
        dram_interleaved_config,
        mesh_device_.get());
}

// Replicated-buffer read/write smoke test, run for both L1 and DRAM:
// randomly sized replicated buffers are written to every device in the mesh
// and read back shard-by-shard, expecting an exact round trip.
TEST_F(MeshBufferTestSuite, InterleavedShardsReadWrite) {
    constexpr uint32_t NUM_ITERS = 100;
    // Seed is overridable via the environment for reproducible failures.
    uint32_t seed = tt::parse_env("TT_METAL_SEED", 0);
    uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

    for (auto buffer_type : {BufferType::L1, BufferType::DRAM}) {
        DeviceLocalBufferConfig per_device_buffer_config{
            .page_size = single_tile_size,
            // Bug fix: the loop variable was previously ignored and L1 was
            // hard-coded here, so the DRAM iteration silently re-ran the L1
            // case and DRAM buffers were never exercised.
            .buffer_type = buffer_type,
            .buffer_layout = TensorMemoryLayout::INTERLEAVED,
            .bottom_up = false};

        // Re-seed per buffer type so both L1 and DRAM see the same sequence
        // of random buffer sizes.
        std::uniform_int_distribution<int> gen_num_tiles(1, 1024);
        std::mt19937 rng(seed);
        for (int i = 0; i < NUM_ITERS; i++) {
            uint32_t num_random_tiles = gen_num_tiles(rng);
            ReplicatedBufferConfig global_buffer_config = {
                .size = num_random_tiles * single_tile_size,
            };

            std::shared_ptr<MeshBuffer> buf =
                MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device_.get());

            // Per-iteration payload: ramp starting at i, so successive
            // iterations (and stale reads) are distinguishable.
            std::vector<uint32_t> src_vec(num_random_tiles * single_tile_size / sizeof(uint32_t), 0);
            std::iota(src_vec.begin(), src_vec.end(), i);
            for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) {
                for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) {
                    WriteShard(mesh_device_->mesh_command_queue(), buf, src_vec, Coordinate(logical_y, logical_x));
                }
            }

            // Read every shard back and require an exact match with the source.
            for (std::size_t logical_x = 0; logical_x < buf->device()->num_cols(); logical_x++) {
                for (std::size_t logical_y = 0; logical_y < buf->device()->num_rows(); logical_y++) {
                    std::vector<uint32_t> dst_vec;
                    ReadShard(mesh_device_->mesh_command_queue(), dst_vec, buf, Coordinate(logical_y, logical_x));
                    EXPECT_EQ(dst_vec, src_vec);
                }
            }
        }
    }
}

TEST_F(MeshBufferTestSuite, RowMajorShardingAndReplication) {
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

DeviceLocalBufferConfig per_device_buffer_config{
Expand Down Expand Up @@ -366,7 +369,7 @@ TEST_F(MeshBufferTest, RowMajorShardingAndReplication) {
}
}

TEST_F(MeshBufferTest, ColMajorShardingAndReplication) {
TEST_F(MeshBufferTestSuite, ColMajorShardingAndReplication) {
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

DeviceLocalBufferConfig per_device_buffer_config{
Expand Down
19 changes: 10 additions & 9 deletions tests/tt_metal/distributed/test_mesh_events.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
namespace tt::tt_metal::distributed::test {
namespace {

using MeshEventsTest = T3000MultiCQMultiDeviceFixture;
using MeshEventsTestT3000 = T3000MultiCQMeshDeviceFixture;
using MeshEventsTestSuite = GenericMultiCQMeshDeviceFixture;

TEST_F(MeshEventsTest, ReplicatedAsyncIO) {
TEST_F(MeshEventsTestSuite, ReplicatedAsyncIO) {
uint32_t NUM_TILES = 1000;
uint32_t num_iterations = 20;
int32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);
Expand Down Expand Up @@ -61,7 +62,7 @@ TEST_F(MeshEventsTest, ReplicatedAsyncIO) {
}
}

TEST_F(MeshEventsTest, ShardedAsyncIO) {
TEST_F(MeshEventsTestT3000, ShardedAsyncIO) {
uint32_t num_iterations = 20;
uint32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);

Expand Down Expand Up @@ -108,7 +109,7 @@ TEST_F(MeshEventsTest, ShardedAsyncIO) {
}
}

TEST_F(MeshEventsTest, AsyncWorkloadAndIO) {
TEST_F(MeshEventsTestSuite, AsyncWorkloadAndIO) {
uint32_t num_iters = 5;
std::vector<std::shared_ptr<MeshBuffer>> src0_bufs = {};
std::vector<std::shared_ptr<MeshBuffer>> src1_bufs = {};
Expand All @@ -119,8 +120,8 @@ TEST_F(MeshEventsTest, AsyncWorkloadAndIO) {
auto programs = tt::tt_metal::distributed::test::utils::create_eltwise_bin_programs(
mesh_device_, src0_bufs, src1_bufs, output_bufs);
auto mesh_workload = CreateMeshWorkload();
LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0});
LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1});
LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0});
LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1});

AddProgramToMeshWorkload(mesh_workload, *programs[0], devices_0);
AddProgramToMeshWorkload(mesh_workload, *programs[1], devices_1);
Expand Down Expand Up @@ -189,7 +190,7 @@ TEST_F(MeshEventsTest, AsyncWorkloadAndIO) {
}
}

TEST_F(MeshEventsTest, CustomDeviceRanges) {
TEST_F(MeshEventsTestSuite, CustomDeviceRanges) {
uint32_t NUM_TILES = 1000;
uint32_t num_iterations = 20;
int32_t single_tile_size = ::tt::tt_metal::detail::TileSize(DataFormat::UInt32);
Expand All @@ -209,8 +210,8 @@ TEST_F(MeshEventsTest, CustomDeviceRanges) {
for (std::size_t i = 0; i < num_iterations; i++) {
std::vector<uint32_t> src_vec(NUM_TILES * single_tile_size / sizeof(uint32_t), i);
std::iota(src_vec.begin(), src_vec.end(), i);
LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {3, 0});
LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {3, 1});
LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {mesh_device_->num_cols() - 1, 0});
LogicalDeviceRange devices_1 = LogicalDeviceRange({0, 1}, {mesh_device_->num_cols() - 1, 1});

std::vector<std::vector<uint32_t>> readback_vecs = {};
std::shared_ptr<MeshEvent> event_0 = std::make_shared<MeshEvent>();
Expand Down
8 changes: 4 additions & 4 deletions tests/tt_metal/distributed/test_mesh_sub_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
namespace tt::tt_metal::distributed::test {
namespace {

using MeshSubDeviceTest = T3000MultiDeviceFixture;
using MeshSubDeviceTestSuite = GenericMeshDeviceFixture;

TEST_F(MeshSubDeviceTest, SyncWorkloadsOnSubDevice) {
TEST_F(MeshSubDeviceTestSuite, SyncWorkloadsOnSubDevice) {
SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))});
SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})});

Expand Down Expand Up @@ -43,7 +43,7 @@ TEST_F(MeshSubDeviceTest, SyncWorkloadsOnSubDevice) {
Finish(mesh_device_->mesh_command_queue());
}

TEST_F(MeshSubDeviceTest, DataCopyOnSubDevices) {
TEST_F(MeshSubDeviceTestSuite, DataCopyOnSubDevices) {
SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {0, 0}))});
SubDevice sub_device_2(std::array{CoreRangeSet(CoreRange({1, 1}, {1, 1}))});
SubDevice sub_device_3(std::array{CoreRangeSet(CoreRange({2, 2}, {2, 2}))});
Expand Down Expand Up @@ -136,7 +136,7 @@ TEST_F(MeshSubDeviceTest, DataCopyOnSubDevices) {
}
}

TEST_F(MeshSubDeviceTest, SubDeviceSwitching) {
TEST_F(MeshSubDeviceTestSuite, SubDeviceSwitching) {
// Sub Devices for config 0
SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {2, 2}))});
SubDevice sub_device_2(std::array{CoreRangeSet(std::vector{CoreRange({3, 3}, {3, 3}), CoreRange({4, 4}, {4, 4})})});
Expand Down
Loading

0 comments on commit 7ee3c89

Please sign in to comment.