41 changes: 39 additions & 2 deletions include/ttlang/Dialect/Utils/LayoutUtils.h
@@ -18,6 +18,44 @@ struct ContiguousLayoutInfo {
int64_t pageSizeBytes;
};

/// Contiguity classification for tensor layouts.
/// Determines the granularity of block transfers that can be used.
enum class ContiguityLevel {
FullyContiguous, // Entire tensor is contiguous - single block transfer
RowContiguous, // Each row is contiguous - per-row block transfers
TileContiguous, // Only 32x32 tiles are contiguous - tile-level transfers
NonContiguous // Scattered/complex layout - handled via per-tile fallback
};
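
// Illustrative examples (assumed mapping, not exhaustive): a dense row-major
// interleaved tensor classifies as FullyContiguous; a row-major tensor whose
// rows are padded to a wider stride classifies as RowContiguous; a
// 32x32-tiled interleaved tensor classifies as TileContiguous.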

/// Extended layout analysis result with contiguity information.
/// Used to determine optimal data transfer strategy for ttl.copy lowering.
struct LayoutContiguityInfo {
ContiguityLevel level;
int64_t totalElements; // Total elements in tensor
int64_t totalSizeBytes; // Total size in bytes
int64_t rowSizeBytes; // Bytes per contiguous row
int64_t rowStrideBytes; // Bytes between row starts (may include padding)
int64_t numRows; // Number of rows
int64_t elemByteWidth; // Bytes per element
bool isRowMajor;
bool hasPadding;
};
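
// Worked example (illustrative): for a 64x128 f32 row-major tensor with no
// padding, totalElements = 8192, totalSizeBytes = 32768, rowSizeBytes = 512,
// rowStrideBytes = 512, numRows = 64, elemByteWidth = 4, isRowMajor = true,
// hasPadding = false, and level = FullyContiguous.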

/// Analyze tensor layout contiguity from TTNNLayoutAttr.
///
/// Determines the optimal data transfer strategy by inspecting the tensor's
/// layout encoding. The analysis checks:
/// 1. Layout type (RowMajor vs Tile)
/// 2. Memory layout (Interleaved vs Sharded)
/// 3. Affine map (identity vs permuted)
///
/// Returns:
/// - FullyContiguous: Entire tensor can be transferred as one block
/// - RowContiguous: Each row can be transferred as a block (with padding)
/// - TileContiguous: Fall back to per-tile transfers (current behavior)
/// - NonContiguous: Complex/unsupported layout
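///
/// Example usage (a sketch; the dispatch mirrors emitOptimizedTransfer in
/// ConvertTTLToTTKernel.cpp):
/// \code
///   auto info = analyzeLayoutContiguity(tensorTy);
///   if (info.level == ContiguityLevel::FullyContiguous) {
///     // transfer info.totalSizeBytes in a single NOC transaction
///   }
/// \endcode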
LayoutContiguityInfo analyzeLayoutContiguity(RankedTensorType tensorTy);

inline ContiguousLayoutInfo computeContiguousLayout(RankedTensorType tensorTy) {
ArrayRef<int64_t> shape = tensorTy.getShape();
// TODO(ttl): Replace this contiguous fallback with stride/page derivation
@@ -29,8 +67,7 @@ inline ContiguousLayoutInfo computeContiguousLayout(RankedTensorType tensorTy) {
int64_t elemBits = tensorTy.getElementType().getIntOrFloatBitWidth();
int64_t elemByteWidth = elemBits / 8;

// TODO(ttl): Derive page size from actual tiling/sharding when available.
// Issue: #83.
// TODO(#83): Derive page size from actual tiling/sharding when available.
int64_t pageSizeBytes = elemByteWidth * rowStrideElems;
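// E.g. (illustrative): a row stride of 128 f32 elements yields
// pageSizeBytes = 4 * 128 = 512.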

return {rowStrideElems, colStrideElems, elemByteWidth, pageSizeBytes};
1 change: 1 addition & 0 deletions lib/Dialect/CMakeLists.txt
@@ -4,3 +4,4 @@ add_subdirectory(D2M)
add_subdirectory(TTKernel)
add_subdirectory(TTL)
add_subdirectory(TTMetal)
add_subdirectory(Utils)
1 change: 1 addition & 0 deletions lib/Dialect/TTL/Transforms/CMakeLists.txt
@@ -23,4 +23,5 @@ add_mlir_dialect_library(TTLangTTLTransforms
MLIRTTKernelDialect
MLIRTTLDialect
TTLangTTKernelTransforms
TTLangUtils
)
196 changes: 168 additions & 28 deletions lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp
@@ -515,6 +515,126 @@ emitTileLoop(OpBuilder &builder, Location loc, int64_t tilesY, int64_t tilesX,
}
}

//===----------------------------------------------------------------------===//
// Block Transfer Optimization
//===----------------------------------------------------------------------===//

/// Emit a single block transfer for fully contiguous tensors.
/// Replaces the entire tile loop with one noc_async_read/write operation.
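/// E.g. (illustrative): a fully contiguous 64x64 bf16 tensor (four 32x32
/// tiles, 8192 bytes) moves with a single noc_async_read instead of four
/// per-tile reads.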
static void
emitSingleBlockTransfer(OpBuilder &builder, Location loc, Value tensorAccessor,
Value cbPtr, bool isRead,
const utils::LayoutContiguityInfo &layoutInfo) {
// Get NOC address for tile 0 (start of tensor).
auto zero = builder.create<arith::ConstantIntOp>(loc, 0, 32);
auto nocAddr = builder.create<ttk::TensorAccessorGetNocAddrOp>(
loc, tensorAccessor, zero, zero, /*offset=*/nullptr);

// Total size in bytes.
auto size =
builder.create<arith::ConstantIntOp>(loc, layoutInfo.totalSizeBytes, 32);

if (isRead) {
// Tensor -> CB: read(srcNocAddr, dstL1Addr, size)
builder.create<ttk::NocAsyncReadOp>(loc, nocAddr, cbPtr, size);
} else {
// CB -> Tensor: write(srcL1Addr, dstNocAddr, size)
builder.create<ttk::NocAsyncWriteOp>(loc, cbPtr, nocAddr, size);
}
}

/// Emit row-by-row block transfers for row-contiguous tensors with padding.
/// Generates a loop over rows, with one noc_async_read/write per row.
static void emitRowBlockTransfers(OpBuilder &builder, Location loc,
Value tensorAccessor, Value cbPtr,
bool isRead,
const utils::LayoutContiguityInfo &layoutInfo,
int64_t tilesPerRow) {
auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
auto one = builder.create<arith::ConstantIndexOp>(loc, 1);
auto numRows =
builder.create<arith::ConstantIndexOp>(loc, layoutInfo.numRows);
auto rowSizeBytes =
builder.create<arith::ConstantIntOp>(loc, layoutInfo.rowSizeBytes, 32);
auto tilesPerRowVal =
builder.create<arith::ConstantIndexOp>(loc, tilesPerRow);
auto tileSize = builder.create<arith::ConstantIndexOp>(loc, 32);

scf::buildLoopNest(
builder, loc, ValueRange{zero}, ValueRange{numRows}, ValueRange{one},
[&](OpBuilder &b, Location bodyLoc, ValueRange ivs) {
Value rowIdx = ivs[0];

// Compute the tile ID for the start of this row:
//   tileId = (rowIdx / 32) * tilesPerRow
Value tileRow = b.create<arith::DivUIOp>(bodyLoc, rowIdx, tileSize);
Value tileId =
b.create<arith::MulIOp>(bodyLoc, tileRow, tilesPerRowVal);
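// E.g. (illustrative): rowIdx = 40 with tilesPerRow = 4 gives
// tileRow = 40 / 32 = 1 and tileId = 4, the first tile of the
// second tile-row.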
Value tileId32 =
b.create<arith::IndexCastOp>(bodyLoc, b.getI32Type(), tileId);
auto offset = b.create<arith::ConstantIntOp>(bodyLoc, 0, 32);

// Get NOC address for this row.
auto nocAddr = b.create<ttk::TensorAccessorGetNocAddrOp>(
bodyLoc, tensorAccessor, tileId32, offset, /*offset=*/nullptr);

// Compute CB address for this row: cbPtr + (rowIdx * rowSizeBytes)
Value rowIdxI32 =
b.create<arith::IndexCastOp>(bodyLoc, b.getI32Type(), rowIdx);
Value cbRowOffset =
b.create<arith::MulIOp>(bodyLoc, rowIdxI32, rowSizeBytes);
Value cbAddrForRow =
b.create<arith::AddIOp>(bodyLoc, cbPtr, cbRowOffset);

if (isRead) {
b.create<ttk::NocAsyncReadOp>(bodyLoc, nocAddr, cbAddrForRow,
rowSizeBytes);
} else {
b.create<ttk::NocAsyncWriteOp>(bodyLoc, cbAddrForRow, nocAddr,
rowSizeBytes);
}
});
}

/// Emit optimized data transfer based on layout contiguity analysis.
/// Implements a three-level optimization hierarchy:
/// 1. FullyContiguous -> single block transfer
/// 2. RowContiguous -> per-row block transfers
/// 3. TileContiguous/NonContiguous -> per-tile transfers
static void emitOptimizedTransfer(OpBuilder &builder, Location loc,
Value tensorAccessor, Value cbPtr,
bool isRead, RankedTensorType tensorTy,
int64_t tilesY, int64_t tilesX) {
auto layoutInfo = utils::analyzeLayoutContiguity(tensorTy);

switch (layoutInfo.level) {
case utils::ContiguityLevel::FullyContiguous:
emitSingleBlockTransfer(builder, loc, tensorAccessor, cbPtr, isRead,
layoutInfo);
break;

case utils::ContiguityLevel::RowContiguous:
emitRowBlockTransfers(builder, loc, tensorAccessor, cbPtr, isRead,
layoutInfo, tilesX);
break;

case utils::ContiguityLevel::TileContiguous:
case utils::ContiguityLevel::NonContiguous:
// Existing tile loop fallback.
emitTileLoop(builder, loc, tilesY, tilesX,
[&](OpBuilder &b, Location bodyLoc, Value tileOffset) {
if (isRead) {
b.create<ttk::NocAsyncReadTileOp>(bodyLoc, tileOffset,
tensorAccessor, cbPtr);
} else {
b.create<ttk::NocAsyncWriteTileOp>(bodyLoc, tileOffset,
tensorAccessor, cbPtr);
}
});
break;
}
}

//===----------------------------------------------------------------------===//
// DMA Copy Grouping for Fused Loop Emission
//===----------------------------------------------------------------------===//
@@ -524,7 +644,8 @@ struct CopyInfo {
CopyOp op;
int64_t tilesY;
int64_t tilesX;
bool isRead; // tensor->CB (read) vs CB->tensor (write)
bool isRead; // tensor->CB (read) vs CB->tensor (write)
RankedTensorType tensorTy; // Tensor type for layout analysis
};

/// Check if all operands of a copy operation properly dominate the given
@@ -659,20 +780,42 @@ static void emitGroupedCopies(ArrayRef<CopyInfo> copies,
cbPtrs.size() != subgroup.size()) {
continue;
}
// Phase 2: Emit transfers based on layout contiguity.
// Check whether every copy in the subgroup has a TileContiguous layout
// (those benefit from loop fusion). FullyContiguous/RowContiguous copies
// are emitted as separate optimized transfers, since block transfers gain
// nothing from fusion.
bool allTileContiguous = true;
for (const CopyInfo &info : subgroup) {
auto layoutInfo = utils::analyzeLayoutContiguity(info.tensorTy);
if (layoutInfo.level != utils::ContiguityLevel::TileContiguous &&
layoutInfo.level != utils::ContiguityLevel::NonContiguous) {
allTileContiguous = false;
break;
}
}

// Phase 2: Emit single fused tile loop.
emitTileLoop(builder, loc, tilesY, tilesX,
[&](OpBuilder &b, Location bodyLoc, Value tileOffset) {
for (size_t i = 0; i < subgroup.size(); ++i) {
if (isRead) {
b.create<ttk::NocAsyncReadTileOp>(
bodyLoc, tileOffset, accessors[i], cbPtrs[i]);
} else {
b.create<ttk::NocAsyncWriteTileOp>(
bodyLoc, tileOffset, accessors[i], cbPtrs[i]);
if (allTileContiguous) {
// All copies are tile-contiguous: emit single fused tile loop.
emitTileLoop(builder, loc, tilesY, tilesX,
[&](OpBuilder &b, Location bodyLoc, Value tileOffset) {
for (size_t i = 0; i < subgroup.size(); ++i) {
if (isRead) {
b.create<ttk::NocAsyncReadTileOp>(
bodyLoc, tileOffset, accessors[i], cbPtrs[i]);
} else {
b.create<ttk::NocAsyncWriteTileOp>(
bodyLoc, tileOffset, accessors[i], cbPtrs[i]);
}
}
}
});
});
} else {
// Some copies are fully/row contiguous: emit separate optimized
// transfers (block transfers don't benefit from loop fusion).
for (size_t i = 0; i < subgroup.size(); ++i) {
emitOptimizedTransfer(builder, loc, accessors[i], cbPtrs[i], isRead,
subgroup[i].tensorTy, tilesY, tilesX);
}
}
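
// E.g. (illustrative): two tile-contiguous reads fuse into one loop that
// issues two noc_async_read_tile ops per iteration, whereas a subgroup that
// also contains a fully contiguous copy has every copy emitted separately
// via emitOptimizedTransfer above.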

// Phase 3: Replace original copy ops with dummy handle and mark handled.
// Use UnrealizedConversionCastOp to preserve TransferHandleType so that
@@ -720,8 +863,9 @@ static void processBlockForCopyGrouping(Block &block,

Value tensor = isRead ? src : copyOp.getDst();
auto [tilesY, tilesX] = getTileGridShapeFromValue(tensor);
auto tensorTy = mlir::cast<RankedTensorType>(tensor.getType());

copies.push_back({copyOp, tilesY, tilesX, isRead});
copies.push_back({copyOp, tilesY, tilesX, isRead, tensorTy});
} else if (isa<WaitOp>(&op)) {
// Process accumulated copies at synchronization boundary.
emitGroupedCopies(copies, handledOps, builder);
@@ -779,13 +923,11 @@ static LogicalResult lowerTensorToCB(CopyOp op, Value srcTensor, Value dstCB,

auto [tilesY, tilesX] = getTileGridShapeFromValue(srcTensor);

// TODO(#138): Emit single block transfer for contiguous layouts instead of
// tile loop.
emitTileLoop(rewriter, loc, tilesY, tilesX,
[&](OpBuilder &b, Location bodyLoc, Value tileOffset) {
b.create<ttk::NocAsyncReadTileOp>(bodyLoc, tileOffset,
*srcAccessor, cbWritePtr);
});
// Emit optimized transfer based on layout contiguity (single block, row
// blocks, or tile fallback).
auto tensorTy = mlir::cast<RankedTensorType>(srcTensor.getType());
emitOptimizedTransfer(rewriter, loc, *srcAccessor, cbWritePtr,
/*isRead=*/true, tensorTy, tilesY, tilesX);

auto handle = makeZeroI32(loc, rewriter);
rewriter.replaceOp(op, handle);
@@ -820,13 +962,11 @@ static LogicalResult lowerCBToTensor(CopyOp op, Value srcCB, Value dstTensor,

auto [tilesY, tilesX] = getTileGridShapeFromValue(dstTensor);

// TODO(#138): Emit single block transfer for contiguous layouts instead of
// tile loop.
emitTileLoop(rewriter, loc, tilesY, tilesX,
[&](OpBuilder &b, Location bodyLoc, Value tileOffset) {
b.create<ttk::NocAsyncWriteTileOp>(bodyLoc, tileOffset,
*dstAccessor, cbReadPtr);
});
// Emit optimized transfer based on layout contiguity (single block, row
// blocks, or tile fallback).
auto tensorTy = mlir::cast<RankedTensorType>(dstTensor.getType());
emitOptimizedTransfer(rewriter, loc, *dstAccessor, cbReadPtr,
/*isRead=*/false, tensorTy, tilesY, tilesX);

auto handle = makeZeroI32(loc, rewriter);
rewriter.replaceOp(op, handle);
7 changes: 7 additions & 0 deletions lib/Dialect/Utils/CMakeLists.txt
@@ -0,0 +1,7 @@
add_mlir_dialect_library(TTLangUtils
LayoutUtils.cpp

LINK_LIBS PUBLIC
MLIRIR
MLIRTTNNDialect
)