diff --git a/include/ttlang/Dialect/Utils/LayoutUtils.h b/include/ttlang/Dialect/Utils/LayoutUtils.h
index 88d96de5..ff183f3c 100644
--- a/include/ttlang/Dialect/Utils/LayoutUtils.h
+++ b/include/ttlang/Dialect/Utils/LayoutUtils.h
@@ -18,6 +18,44 @@ struct ContiguousLayoutInfo {
   int64_t pageSizeBytes;
 };
 
+/// Contiguity classification for tensor layouts.
+/// Determines the granularity of block transfers that can be used.
+enum class ContiguityLevel {
+  FullyContiguous, // Entire tensor is contiguous - single block transfer
+  RowContiguous,   // Each row is contiguous - per-row block transfers
+  TileContiguous,  // Only 32x32 tiles are contiguous - tile-level transfers
+  NonContiguous    // Scattered/complex layout - per-tile fallback
+};
+
+/// Extended layout analysis result with contiguity information.
+/// Used to determine the optimal data transfer strategy for ttl.copy lowering.
+struct LayoutContiguityInfo {
+  ContiguityLevel level;
+  int64_t totalElements;  // Total elements in tensor
+  int64_t totalSizeBytes; // Total size in bytes
+  int64_t rowSizeBytes;   // Bytes per contiguous row
+  int64_t rowStrideBytes; // Bytes between row starts (may include padding)
+  int64_t numRows;        // Number of rows
+  int64_t elemByteWidth;  // Bytes per element
+  bool isRowMajor;
+  bool hasPadding;
+};
+
+/// Analyze tensor layout contiguity from TTNNLayoutAttr.
+///
+/// Determines the optimal data transfer strategy by inspecting the tensor's
+/// layout encoding. The analysis checks:
+/// 1. Layout type (RowMajor vs Tile)
+/// 2. Memory layout (Interleaved vs Sharded)
+/// 3. Affine map (identity vs permuted)
+///
+/// Returns:
+/// - FullyContiguous: Entire tensor can be transferred as one block
+/// - RowContiguous: Each row can be transferred as a block (with padding)
+/// - TileContiguous: Fall back to per-tile transfers (current behavior)
+/// - NonContiguous: Complex/unsupported layout (per-tile fallback)
+LayoutContiguityInfo analyzeLayoutContiguity(RankedTensorType tensorTy);
+
 inline ContiguousLayoutInfo computeContiguousLayout(RankedTensorType tensorTy) {
   ArrayRef<int64_t> shape = tensorTy.getShape();
   // TODO(ttl): Replace this contiguous fallback with stride/page derivation
@@ -29,8 +67,7 @@ inline ContiguousLayoutInfo computeContiguousLayout(RankedTensorType tensorTy) {
   int64_t elemBits = tensorTy.getElementType().getIntOrFloatBitWidth();
   int64_t elemByteWidth = elemBits / 8;
 
-  // TODO(ttl): Derive page size from actual tiling/sharding when available.
-  // Issue: #83.
+  // TODO(#83): Derive page size from actual tiling/sharding when available.
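+  // With this fallback, one page is assumed to span a full row of
+  // rowStrideElems elements.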
int64_t pageSizeBytes = elemByteWidth * rowStrideElems; return {rowStrideElems, colStrideElems, elemByteWidth, pageSizeBytes}; diff --git a/lib/Dialect/CMakeLists.txt b/lib/Dialect/CMakeLists.txt index 388251aa..86f8665e 100644 --- a/lib/Dialect/CMakeLists.txt +++ b/lib/Dialect/CMakeLists.txt @@ -4,3 +4,4 @@ add_subdirectory(D2M) add_subdirectory(TTKernel) add_subdirectory(TTL) add_subdirectory(TTMetal) +add_subdirectory(Utils) diff --git a/lib/Dialect/TTL/Transforms/CMakeLists.txt b/lib/Dialect/TTL/Transforms/CMakeLists.txt index 809caf0b..36637e72 100644 --- a/lib/Dialect/TTL/Transforms/CMakeLists.txt +++ b/lib/Dialect/TTL/Transforms/CMakeLists.txt @@ -23,4 +23,5 @@ add_mlir_dialect_library(TTLangTTLTransforms MLIRTTKernelDialect MLIRTTLDialect TTLangTTKernelTransforms + TTLangUtils ) diff --git a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp index ee58ca62..8b40e20f 100644 --- a/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp +++ b/lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp @@ -515,6 +515,126 @@ emitTileLoop(OpBuilder &builder, Location loc, int64_t tilesY, int64_t tilesX, } } +//===----------------------------------------------------------------------===// +// Block Transfer Optimization +//===----------------------------------------------------------------------===// + +/// Emit a single block transfer for fully contiguous tensors. +/// Replaces entire tile loop with one noc_async_read/write operation. +static void +emitSingleBlockTransfer(OpBuilder &builder, Location loc, Value tensorAccessor, + Value cbPtr, bool isRead, + const utils::LayoutContiguityInfo &layoutInfo) { + // Get NOC address for tile 0 (start of tensor). + auto zero = builder.create(loc, 0, 32); + auto nocAddr = builder.create( + loc, tensorAccessor, zero, zero, /*offset=*/nullptr); + + // Total size in bytes. + auto size = + builder.create(loc, layoutInfo.totalSizeBytes, 32); + + if (isRead) { + // Tensor -> CB: read(srcNocAddr, dstL1Addr, size) + builder.create(loc, nocAddr, cbPtr, size); + } else { + // CB -> Tensor: write(srcL1Addr, dstNocAddr, size) + builder.create(loc, cbPtr, nocAddr, size); + } +} + +/// Emit row-by-row block transfers for row-contiguous tensors with padding. +/// Generates loop over rows, one noc_async_read/write per row. +static void emitRowBlockTransfers(OpBuilder &builder, Location loc, + Value tensorAccessor, Value cbPtr, + bool isRead, + const utils::LayoutContiguityInfo &layoutInfo, + int64_t tilesPerRow) { + auto zero = builder.create(loc, 0); + auto one = builder.create(loc, 1); + auto numRows = + builder.create(loc, layoutInfo.numRows); + auto rowSizeBytes = + builder.create(loc, layoutInfo.rowSizeBytes, 32); + auto tilesPerRowVal = + builder.create(loc, tilesPerRow); + auto tileSize = builder.create(loc, 32); + + scf::buildLoopNest( + builder, loc, ValueRange{zero}, ValueRange{numRows}, ValueRange{one}, + [&](OpBuilder &b, Location bodyLoc, ValueRange ivs) { + Value rowIdx = ivs[0]; + + // Compute tile ID for start of row: tileId = (rowIdx / 32) * + // tilesPerRow + Value tileRow = b.create(bodyLoc, rowIdx, tileSize); + Value tileId = + b.create(bodyLoc, tileRow, tilesPerRowVal); + Value tileId32 = + b.create(bodyLoc, b.getI32Type(), tileId); + auto offset = b.create(bodyLoc, 0, 32); + + // Get NOC address for this row. 
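+        // tileId addresses the first tile of the 32-row band containing
+        // rowIdx; offset zero targets the start of that tile's page.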
+ auto nocAddr = b.create( + bodyLoc, tensorAccessor, tileId32, offset, /*offset=*/nullptr); + + // Compute CB address for this row: cbPtr + (rowIdx * rowSizeBytes) + Value rowIdxI32 = + b.create(bodyLoc, b.getI32Type(), rowIdx); + Value cbRowOffset = + b.create(bodyLoc, rowIdxI32, rowSizeBytes); + Value cbAddrForRow = + b.create(bodyLoc, cbPtr, cbRowOffset); + + if (isRead) { + b.create(bodyLoc, nocAddr, cbAddrForRow, + rowSizeBytes); + } else { + b.create(bodyLoc, cbAddrForRow, nocAddr, + rowSizeBytes); + } + }); +} + +/// Emit optimized data transfer based on layout contiguity analysis. +/// Implements three-level optimization hierarchy: +/// 1. FullyContiguous -> single block transfer +/// 2. RowContiguous -> per-row block transfers +/// 3. TileContiguous/NonContiguous -> per-tile transfers +static void emitOptimizedTransfer(OpBuilder &builder, Location loc, + Value tensorAccessor, Value cbPtr, + bool isRead, RankedTensorType tensorTy, + int64_t tilesY, int64_t tilesX) { + auto layoutInfo = utils::analyzeLayoutContiguity(tensorTy); + + switch (layoutInfo.level) { + case utils::ContiguityLevel::FullyContiguous: + emitSingleBlockTransfer(builder, loc, tensorAccessor, cbPtr, isRead, + layoutInfo); + break; + + case utils::ContiguityLevel::RowContiguous: + emitRowBlockTransfers(builder, loc, tensorAccessor, cbPtr, isRead, + layoutInfo, tilesX); + break; + + case utils::ContiguityLevel::TileContiguous: + case utils::ContiguityLevel::NonContiguous: + // Existing tile loop fallback. + emitTileLoop(builder, loc, tilesY, tilesX, + [&](OpBuilder &b, Location bodyLoc, Value tileOffset) { + if (isRead) { + b.create(bodyLoc, tileOffset, + tensorAccessor, cbPtr); + } else { + b.create(bodyLoc, tileOffset, + tensorAccessor, cbPtr); + } + }); + break; + } +} + //===----------------------------------------------------------------------===// // DMA Copy Grouping for Fused Loop Emission //===----------------------------------------------------------------------===// @@ -524,7 +644,8 @@ struct CopyInfo { CopyOp op; int64_t tilesY; int64_t tilesX; - bool isRead; // tensor->CB (read) vs CB->tensor (write) + bool isRead; // tensor->CB (read) vs CB->tensor (write) + RankedTensorType tensorTy; // Tensor type for layout analysis }; /// Check if all operands of a copy operation properly dominate the given @@ -659,20 +780,42 @@ static void emitGroupedCopies(ArrayRef copies, cbPtrs.size() != subgroup.size()) { continue; } + // Phase 2: Emit transfers based on layout contiguity. + // Check if all copies in subgroup have TileContiguous layout (benefit + // from loop fusion). For FullyContiguous/RowContiguous, emit separate + // optimized transfers since they don't benefit from fusion. + bool allTileContiguous = true; + for (const CopyInfo &info : subgroup) { + auto layoutInfo = utils::analyzeLayoutContiguity(info.tensorTy); + if (layoutInfo.level != utils::ContiguityLevel::TileContiguous && + layoutInfo.level != utils::ContiguityLevel::NonContiguous) { + allTileContiguous = false; + break; + } + } - // Phase 2: Emit single fused tile loop. - emitTileLoop(builder, loc, tilesY, tilesX, - [&](OpBuilder &b, Location bodyLoc, Value tileOffset) { - for (size_t i = 0; i < subgroup.size(); ++i) { - if (isRead) { - b.create( - bodyLoc, tileOffset, accessors[i], cbPtrs[i]); - } else { - b.create( - bodyLoc, tileOffset, accessors[i], cbPtrs[i]); + if (allTileContiguous) { + // All copies are tile-contiguous: emit single fused tile loop. 
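+      // Fusion keeps a single loop nest that issues all per-tile NOC
+      // transactions for the subgroup back to back.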
+ emitTileLoop(builder, loc, tilesY, tilesX, + [&](OpBuilder &b, Location bodyLoc, Value tileOffset) { + for (size_t i = 0; i < subgroup.size(); ++i) { + if (isRead) { + b.create( + bodyLoc, tileOffset, accessors[i], cbPtrs[i]); + } else { + b.create( + bodyLoc, tileOffset, accessors[i], cbPtrs[i]); + } } - } - }); + }); + } else { + // Some copies are fully/row contiguous: emit separate optimized + // transfers (block transfers don't benefit from loop fusion). + for (size_t i = 0; i < subgroup.size(); ++i) { + emitOptimizedTransfer(builder, loc, accessors[i], cbPtrs[i], isRead, + subgroup[i].tensorTy, tilesY, tilesX); + } + } // Phase 3: Replace original copy ops with dummy handle and mark handled. // Use UnrealizedConversionCastOp to preserve TransferHandleType so that @@ -720,8 +863,9 @@ static void processBlockForCopyGrouping(Block &block, Value tensor = isRead ? src : copyOp.getDst(); auto [tilesY, tilesX] = getTileGridShapeFromValue(tensor); + auto tensorTy = mlir::cast(tensor.getType()); - copies.push_back({copyOp, tilesY, tilesX, isRead}); + copies.push_back({copyOp, tilesY, tilesX, isRead, tensorTy}); } else if (isa(&op)) { // Process accumulated copies at synchronization boundary. emitGroupedCopies(copies, handledOps, builder); @@ -779,13 +923,11 @@ static LogicalResult lowerTensorToCB(CopyOp op, Value srcTensor, Value dstCB, auto [tilesY, tilesX] = getTileGridShapeFromValue(srcTensor); - // TODO(#138): Emit single block transfer for contiguous layouts instead of - // tile loop. - emitTileLoop(rewriter, loc, tilesY, tilesX, - [&](OpBuilder &b, Location bodyLoc, Value tileOffset) { - b.create(bodyLoc, tileOffset, - *srcAccessor, cbWritePtr); - }); + // Emit optimized transfer based on layout contiguity (single block, row + // blocks, or tile fallback). + auto tensorTy = mlir::cast(srcTensor.getType()); + emitOptimizedTransfer(rewriter, loc, *srcAccessor, cbWritePtr, + /*isRead=*/true, tensorTy, tilesY, tilesX); auto handle = makeZeroI32(loc, rewriter); rewriter.replaceOp(op, handle); @@ -820,13 +962,11 @@ static LogicalResult lowerCBToTensor(CopyOp op, Value srcCB, Value dstTensor, auto [tilesY, tilesX] = getTileGridShapeFromValue(dstTensor); - // TODO(#138): Emit single block transfer for contiguous layouts instead of - // tile loop. - emitTileLoop(rewriter, loc, tilesY, tilesX, - [&](OpBuilder &b, Location bodyLoc, Value tileOffset) { - b.create(bodyLoc, tileOffset, - *dstAccessor, cbReadPtr); - }); + // Emit optimized transfer based on layout contiguity (single block, row + // blocks, or tile fallback). 
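+  // This mirrors the read path in lowerTensorToCB; isRead=false selects the
+  // noc_async_write variants.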
+ auto tensorTy = mlir::cast(dstTensor.getType()); + emitOptimizedTransfer(rewriter, loc, *dstAccessor, cbReadPtr, + /*isRead=*/false, tensorTy, tilesY, tilesX); auto handle = makeZeroI32(loc, rewriter); rewriter.replaceOp(op, handle); diff --git a/lib/Dialect/Utils/CMakeLists.txt b/lib/Dialect/Utils/CMakeLists.txt new file mode 100644 index 00000000..33c46b11 --- /dev/null +++ b/lib/Dialect/Utils/CMakeLists.txt @@ -0,0 +1,7 @@ +add_mlir_dialect_library(TTLangUtils + LayoutUtils.cpp + + LINK_LIBS PUBLIC + MLIRIR + MLIRTTNNDialect +) diff --git a/lib/Dialect/Utils/LayoutUtils.cpp b/lib/Dialect/Utils/LayoutUtils.cpp new file mode 100644 index 00000000..72c48e29 --- /dev/null +++ b/lib/Dialect/Utils/LayoutUtils.cpp @@ -0,0 +1,103 @@ +// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "ttlang/Dialect/Utils/LayoutUtils.h" + +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" + +#include + +namespace mlir::tt::ttl::utils { + +LayoutContiguityInfo analyzeLayoutContiguity(RankedTensorType tensorTy) { + LayoutContiguityInfo result{}; + ArrayRef shape = tensorTy.getShape(); + + // Compute basic size info. + int64_t elemBits = tensorTy.getElementType().getIntOrFloatBitWidth(); + result.elemByteWidth = elemBits / CHAR_BIT; + result.totalElements = 1; + for (int64_t dim : shape) { + result.totalElements *= dim; + } + result.totalSizeBytes = result.totalElements * result.elemByteWidth; + + // Default to TileContiguous (current behavior) if no layout encoding. + auto encoding = tensorTy.getEncoding(); + if (!encoding) { + result.level = ContiguityLevel::TileContiguous; + result.isRowMajor = true; + result.hasPadding = false; + if (shape.size() >= 2) { + result.numRows = shape[shape.size() - 2]; + result.rowSizeBytes = shape.back() * result.elemByteWidth; + result.rowStrideBytes = result.rowSizeBytes; + } else { + result.numRows = 1; + result.rowSizeBytes = result.totalSizeBytes; + result.rowStrideBytes = result.rowSizeBytes; + } + return result; + } + + auto layout = mlir::dyn_cast(encoding); + if (!layout) { + // Unknown encoding - fall back to tile transfers. + result.level = ContiguityLevel::TileContiguous; + return result; + } + + // Check layout type. + bool isTiled = layout.isTiled(); + result.isRowMajor = !isTiled; + + // Check memory layout (interleaved vs sharded). + bool isInterleaved = true; + if (auto memLayout = layout.getMemLayout()) { + isInterleaved = !tt::ttnn::isShardedMemoryLayout(memLayout.getValue()); + } + + // Check affine map for identity (no permutation). + bool hasIdentityMap = true; + if (auto linearMap = layout.getLinear()) { + // Identity map: (d0, d1, ...) -> (d0, d1, ...) + hasIdentityMap = linearMap.isIdentity(); + } + + // Determine contiguity level based on layout properties. + if (!isTiled && isInterleaved && hasIdentityMap) { + // RowMajor + Interleaved + Identity = FullyContiguous + result.level = ContiguityLevel::FullyContiguous; + result.hasPadding = false; + } else if (!isTiled && hasIdentityMap) { + // RowMajor + Identity but sharded = RowContiguous + result.level = ContiguityLevel::RowContiguous; + result.hasPadding = true; + } else if (isTiled) { + // Tiled layout = TileContiguous (per-tile transfers) + result.level = ContiguityLevel::TileContiguous; + result.hasPadding = false; + } else { + // Complex layout (permuted map, etc) = NonContiguous + result.level = ContiguityLevel::NonContiguous; + result.hasPadding = true; + } + + // Compute row info for RowContiguous transfers. 
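+  // Only the trailing two dimensions are considered here; leading batch
+  // dimensions are not folded into numRows yet.
+  // Example: a 64x64 f32 tensor yields numRows = 64 and rowSizeBytes = 256.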
+ if (shape.size() >= 2) { + result.numRows = shape[shape.size() - 2]; + result.rowSizeBytes = shape.back() * result.elemByteWidth; + // For sharded layouts, rowStrideBytes may differ from rowSizeBytes. + // TODO(#138): Extract actual stride from sharding spec. + result.rowStrideBytes = result.rowSizeBytes; + } else { + result.numRows = 1; + result.rowSizeBytes = result.totalSizeBytes; + result.rowStrideBytes = result.rowSizeBytes; + } + + return result; +} + +} // namespace mlir::tt::ttl::utils diff --git a/test/ttlang/Conversion/TTLToTTKernel/block_transfers.mlir b/test/ttlang/Conversion/TTLToTTKernel/block_transfers.mlir new file mode 100644 index 00000000..dc14bad5 --- /dev/null +++ b/test/ttlang/Conversion/TTLToTTKernel/block_transfers.mlir @@ -0,0 +1,96 @@ +// RUN: ttlang-opt --ttl-to-ttkernel-pipeline --split-input-file %s | FileCheck %s + +// Block transfer optimization tests. +// +// Tests verify that TTL copy operations lower to appropriate transfer strategies +// based on tensor layout contiguity: +// 1. FullyContiguous (row-major + interleaved): single noc_async_read/write +// 2. RowContiguous (row-major + sharded): per-row transfers (TODO: #118) +// 3. TileContiguous (tiled layout): per-tile noc_async_read_tile/write_tile + +#dram = #ttnn.buffer_type +#layout_tiled = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x2x!ttcore.tile<32x32, f32>, #dram>, > + +// Tiled layout (TileContiguous): generates nested loops with per-tile transfers. +// CHECK-LABEL: func.func @tiled_layout_uses_tile_transfers +// CHECK: %[[CB:.*]] = ttkernel.get_compile_time_arg_val +// CHECK: %[[ACCESSOR_ARGS:.*]] = ttkernel.TensorAccessorArgs +// CHECK-NEXT: %[[ACCESSOR:.*]] = ttkernel.TensorAccessor +// CHECK-NEXT: %[[CB_PTR:.*]] = ttkernel.get_write_ptr(%[[CB]]) +// CHECK-NEXT: scf.for +// CHECK-NEXT: scf.for +// CHECK: ttkernel.noc_async_read_tile({{.*}}, %[[ACCESSOR]], %[[CB_PTR]]) +// CHECK: ttkernel.noc_async_read_barrier +// CHECK-NOT: ttkernel.noc_async_read_barrier +// CHECK-NOT: ttkernel.noc_async_write_barrier +module { + func.func @tiled_layout_uses_tile_transfers(%arg0: tensor<64x64xf32, #layout_tiled>) + attributes {ttl.kernel_thread = #ttkernel.thread} { + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %xf = ttl.copy %arg0, %cb : (tensor<64x64xf32, #layout_tiled>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout_row_major = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x64xf32, #dram>, > + +// Row-major interleaved (FullyContiguous): single block transfer for entire tensor. +// Verifies: no loops, uses noc_async_read (not _tile), single barrier, correct size. 
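+// Expected size: 64 x 64 elements x 4 bytes (f32) = 16384 bytes, matching the
+// SIZE constant checked below.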
+// CHECK-LABEL: func.func @row_major_uses_single_block_transfer +// CHECK: %[[SIZE:.*]] = arith.constant 16384 : i32 +// CHECK: %[[CB:.*]] = ttkernel.get_compile_time_arg_val +// CHECK: %[[ACCESSOR_ARGS:.*]] = ttkernel.TensorAccessorArgs +// CHECK-NEXT: %[[ACCESSOR:.*]] = ttkernel.TensorAccessor +// CHECK-NEXT: %[[CB_PTR:.*]] = ttkernel.get_write_ptr(%[[CB]]) +// CHECK-NEXT: %[[NOC_ADDR:.*]] = ttkernel.tensor_accessor.get_noc_addr(%[[ACCESSOR]] +// CHECK-NEXT: ttkernel.noc_async_read(%[[NOC_ADDR]], %[[CB_PTR]], %[[SIZE]]) +// CHECK-NEXT: ttkernel.noc_async_read_barrier +// CHECK-NOT: scf.for +// CHECK-NOT: ttkernel.noc_async_read_tile +// CHECK-NOT: ttkernel.noc_async_read_barrier +// CHECK-NOT: ttkernel.noc_async_write_barrier +module { + func.func @row_major_uses_single_block_transfer(%arg0: tensor<64x64xf32, #layout_row_major>) + attributes {ttl.kernel_thread = #ttkernel.thread} { + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %xf = ttl.copy %arg0, %cb : (tensor<64x64xf32, #layout_row_major>, !ttl.cb<[1, 1], f32, 2>) -> !ttl.transfer_handle + ttl.wait %xf : !ttl.transfer_handle + func.return + } +} + +// ----- + +#dram = #ttnn.buffer_type +#layout_row_major = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<64x64xf32, #dram>, > + +// Row-major interleaved write (FullyContiguous): single block write. +// Verifies: no loops, uses noc_async_write (not _tile), single barrier. +// CHECK-LABEL: func.func @row_major_write_uses_single_block_transfer +// CHECK: %[[SIZE:.*]] = arith.constant 16384 : i32 +// CHECK: %[[CB:.*]] = ttkernel.get_compile_time_arg_val +// CHECK: %[[ACCESSOR_ARGS:.*]] = ttkernel.TensorAccessorArgs +// CHECK-NEXT: %[[ACCESSOR:.*]] = ttkernel.TensorAccessor +// CHECK-NEXT: %[[CB_PTR:.*]] = ttkernel.get_read_ptr(%[[CB]]) +// CHECK-NEXT: %[[NOC_ADDR:.*]] = ttkernel.tensor_accessor.get_noc_addr(%[[ACCESSOR]] +// CHECK-NEXT: ttkernel.noc_async_write(%[[CB_PTR]], %[[NOC_ADDR]], %[[SIZE]]) +// CHECK-NEXT: ttkernel.noc_async_write_barrier +// CHECK-NOT: scf.for +// CHECK-NOT: ttkernel.noc_async_write_tile +// CHECK-NOT: ttkernel.noc_async_write_barrier +// CHECK-NOT: ttkernel.noc_async_read_barrier +module { + func.func @row_major_write_uses_single_block_transfer(%arg0: tensor<64x64xf32, #layout_row_major>) + attributes {ttl.kernel_thread = #ttkernel.thread} { + %cb = ttl.bind_cb {cb_index = 0, buffer_factor = 2} : !ttl.cb<[1, 1], f32, 2> + %xf = ttl.copy %cb, %arg0 : (!ttl.cb<[1, 1], f32, 2>, tensor<64x64xf32, #layout_row_major>) -> !ttl.transfer_handle + ttl.wait %xf : !ttl.transfer_handle + func.return + } +} + +// TODO(#118): Add tests for sharded row-major layouts (RowContiguous) when supported. diff --git a/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir b/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir index 12ab4de3..be651a51 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_batched_single_tile.mlir @@ -33,6 +33,7 @@ // CHECK-NEXT: noc_async_read_tile([[ZERO]], [[ACCESSOR1]], [[CB_PTR1]]); // Consecutive barriers deduplicated to single barrier. 
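+// Exactly one read barrier should remain; the added CHECK-NOT guards against
+// a second one reappearing.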
// CHECK: noc_async_read_barrier(); +// CHECK-NOT: noc_async_read_barrier // CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_1d_fused.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_1d_fused.mlir index 7f42b7b1..09c6f6b4 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_1d_fused.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_1d_fused.mlir @@ -24,32 +24,32 @@ #layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<1x2x!ttcore.tile<32x32, f32>, #dram>, > // CHECK-LABEL: // multi_tile_1d_fused -// CHECK-NEXT: #include -// CHECK-NEXT: #include "tools/profiler/kernel_profiler.hpp" -// CHECK-NEXT: #include "dataflow_api.h" -// CHECK-NEXT: void kernel_main() { -// CHECK-DAG: size_t [[TILES_X:v[0-9]+]] = 2; -// CHECK-DAG: size_t [[STEP:v[0-9]+]] = 1; -// CHECK-DAG: size_t [[LB:v[0-9]+]] = 0; +// CHECK-NEXT: #include +// CHECK-NEXT: #include "tools/profiler/kernel_profiler.hpp" +// CHECK-NEXT: #include "dataflow_api.h" +// CHECK-NEXT: void kernel_main() { +// CHECK-DAG: size_t [[TILES_X:v[0-9]+]] = 2; +// CHECK-DAG: size_t [[STEP:v[0-9]+]] = 1; +// CHECK-DAG: size_t [[LB:v[0-9]+]] = 0; // Setup: all tensor accessors and CB pointers created before tile loop -// CHECK: TensorAccessor [[ACC1:v[0-9]+]] = TensorAccessor( -// CHECK: int32_t [[PTR1:v[0-9]+]] = get_write_ptr( -// CHECK: TensorAccessor [[ACC2:v[0-9]+]] = TensorAccessor( -// CHECK: int32_t [[PTR2:v[0-9]+]] = get_write_ptr( +// CHECK: TensorAccessor [[ACC1:v[0-9]+]] = TensorAccessor( +// CHECK: int32_t [[PTR1:v[0-9]+]] = get_write_ptr( +// CHECK: TensorAccessor [[ACC2:v[0-9]+]] = TensorAccessor( +// CHECK: int32_t [[PTR2:v[0-9]+]] = get_write_ptr( // Fused tile loop: single loop for 1x2 grid with both DMAs in body // (y-loop removed by canonicalization since it only iterates once) -// CHECK: for (size_t [[TILE_X:[a-z][0-9]+]] = [[LB]]; [[TILE_X]] < [[TILES_X]]; [[TILE_X]] += [[STEP]]) { -// CHECK: noc_async_read_tile({{.*}}, [[ACC1]], [[PTR1]]); -// CHECK-NEXT: noc_async_read_tile({{.*}}, [[ACC2]], [[PTR2]]); -// CHECK: } +// CHECK: for (size_t [[TILE_X:[a-z][0-9]+]] = [[LB]]; [[TILE_X]] < [[TILES_X]]; [[TILE_X]] += [[STEP]]) { +// CHECK: noc_async_read_tile({{.*}}, [[ACC1]], [[PTR1]]); +// CHECK-NEXT: noc_async_read_tile({{.*}}, [[ACC2]], [[PTR2]]); +// CHECK: } // Consecutive barriers deduplicated to single barrier. 
-// CHECK: noc_async_read_barrier(); -// CHECK-NOT: noc_async_read_barrier(); -// CHECK-NOT: noc_async_write_barrier(); -// CHECK: return; +// CHECK: noc_async_read_barrier(); +// CHECK-NOT: noc_async_read_barrier +// CHECK-NOT: noc_async_write_barrier +// CHECK: return; // CHECK-NEXT: } module { diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_fused.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_fused.mlir index a56338cb..1daf8d5c 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_fused.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_fused.mlir @@ -29,40 +29,42 @@ #layout = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <1x1>, memref<2x2x!ttcore.tile<32x32, f32>, #dram>, > // CHECK-LABEL: // batched_multi_tile_fused -// CHECK-NEXT: #include -// CHECK-NEXT: #include "tools/profiler/kernel_profiler.hpp" -// CHECK-NEXT: #include "dataflow_api.h" -// CHECK-NEXT: void kernel_main() { -// CHECK-DAG: size_t [[TILES_BOUND:v[0-9]+]] = 2; -// CHECK-DAG: size_t [[USER_UB:v[0-9]+]] = 3; -// CHECK-DAG: size_t [[STEP:v[0-9]+]] = 1; -// CHECK-DAG: size_t [[LB:v[0-9]+]] = 0; +// CHECK-NEXT: #include +// CHECK-NEXT: #include "tools/profiler/kernel_profiler.hpp" +// CHECK-NEXT: #include "dataflow_api.h" +// CHECK-NEXT: void kernel_main() { +// CHECK-DAG: size_t [[TILES_BOUND:v[0-9]+]] = 2; +// CHECK-DAG: size_t [[USER_UB:v[0-9]+]] = 3; +// CHECK-DAG: size_t [[STEP:v[0-9]+]] = 1; +// CHECK-DAG: size_t [[LB:v[0-9]+]] = 0; // User loop from input MLIR (0..3) -// CHECK: for (size_t [[USER_ITER:[a-z][0-9]+]] = [[LB]]; [[USER_ITER]] < [[USER_UB]]; [[USER_ITER]] += [[STEP]]) { +// CHECK: for (size_t [[USER_ITER:[a-z][0-9]+]] = [[LB]]; [[USER_ITER]] < [[USER_UB]]; [[USER_ITER]] += [[STEP]]) { // Setup: all tensor accessors and CB pointers created before tile loop -// CHECK: TensorAccessor [[ACC1:v[0-9]+]] = TensorAccessor( -// CHECK: int32_t [[PTR1:v[0-9]+]] = get_write_ptr( -// CHECK: TensorAccessor [[ACC2:v[0-9]+]] = TensorAccessor( -// CHECK: int32_t [[PTR2:v[0-9]+]] = get_write_ptr( -// CHECK: TensorAccessor [[ACC3:v[0-9]+]] = TensorAccessor( -// CHECK: int32_t [[PTR3:v[0-9]+]] = get_write_ptr( +// CHECK: TensorAccessor [[ACC1:v[0-9]+]] = TensorAccessor( +// CHECK: int32_t [[PTR1:v[0-9]+]] = get_write_ptr( +// CHECK: TensorAccessor [[ACC2:v[0-9]+]] = TensorAccessor( +// CHECK: int32_t [[PTR2:v[0-9]+]] = get_write_ptr( +// CHECK: TensorAccessor [[ACC3:v[0-9]+]] = TensorAccessor( +// CHECK: int32_t [[PTR3:v[0-9]+]] = get_write_ptr( // Fused tile loops: single nested loop with all three DMAs -// CHECK: for (size_t [[TILE_Y:[a-z][0-9]+]] = [[LB]]; [[TILE_Y]] < [[TILES_BOUND]]; [[TILE_Y]] += [[STEP]]) { -// CHECK-NEXT: for (size_t [[TILE_X:[a-z][0-9]+]] = [[LB]]; [[TILE_X]] < [[TILES_BOUND]]; [[TILE_X]] += [[STEP]]) { -// CHECK: noc_async_read_tile({{.*}}, [[ACC1]], [[PTR1]]); -// CHECK-NEXT: noc_async_read_tile({{.*}}, [[ACC2]], [[PTR2]]); -// CHECK-NEXT: noc_async_read_tile({{.*}}, [[ACC3]], [[PTR3]]); -// CHECK: } -// CHECK-NEXT: } +// CHECK: for (size_t [[TILE_Y:[a-z][0-9]+]] = [[LB]]; [[TILE_Y]] < [[TILES_BOUND]]; [[TILE_Y]] += [[STEP]]) { +// CHECK-NEXT: for (size_t [[TILE_X:[a-z][0-9]+]] = [[LB]]; [[TILE_X]] < [[TILES_BOUND]]; [[TILE_X]] += [[STEP]]) { +// CHECK: noc_async_read_tile({{.*}}, [[ACC1]], [[PTR1]]); +// CHECK-NEXT: noc_async_read_tile({{.*}}, [[ACC2]], [[PTR2]]); +// CHECK-NEXT: noc_async_read_tile({{.*}}, [[ACC3]], [[PTR3]]); +// CHECK: } +// CHECK-NEXT: } // Consecutive barriers deduplicated to single barrier. 
-// CHECK: noc_async_read_barrier(); -// CHECK: } -// CHECK: return; -// CHECK-NEXT: } +// CHECK: noc_async_read_barrier(); +// CHECK-NOT: noc_async_read_barrier +// CHECK-NOT: noc_async_write_barrier +// CHECK: } +// CHECK: return; +// CHECK-NEXT: } module { func.func @batched_multi_tile_fused(%arg0: tensor<64x64xf32, #layout>, %arg1: tensor<64x64xf32, #layout>, %arg2: tensor<64x64xf32, #layout>) diff --git a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir index 376e1285..1d5e56ed 100644 --- a/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir +++ b/test/ttlang/Translate/TTLToCpp/dma_multi_tile_batched_in_user_loop.mlir @@ -54,6 +54,7 @@ // Consecutive barriers deduplicated to single barrier. // CHECK: noc_async_read_barrier(); +// CHECK-NOT: noc_async_read_barrier // CHECK: } // CHECK: return; // CHECK-NEXT: }