Merged
Commits (33)
- a434030: initial singlecore matmul in metal and tt-lang (arichinsTT, Dec 2, 2025)
- 7f8c435: blk size for cb shape (arichinsTT, Dec 3, 2025)
- 58ef33c: readme (arichinsTT, Dec 3, 2025)
- 5877772: tensor accessor and fixing writer cb usgae (arichinsTT, Dec 3, 2025)
- e7fd315: remvoing tensor accessor and setting up imports and pre-commit (arichinsTT, Dec 4, 2025)
- 13e706f: precommit (arichinsTT, Dec 4, 2025)
- e1e5c34: real precommit (arichinsTT, Dec 4, 2025)
- bb36ede: cleanup (arichinsTT, Dec 4, 2025)
- 5175deb: test name cleanup (arichinsTT, Dec 4, 2025)
- aa01663: fixing wait handles on copys and some nits (arichinsTT, Dec 4, 2025)
- d168a3a: nits and fixes (arichinsTT, Dec 4, 2025)
- c8c46f7: tt-lang program grid, nit (arichinsTT, Dec 4, 2025)
- 77998a4: tilized slicing (arichinsTT, Dec 8, 2025)
- 030ff8b: multicore and directory reconfiguring (arichinsTT, Dec 9, 2025)
- 7cd5454: pre-commit and cleanup (arichinsTT, Dec 9, 2025)
- 272df23: missed files (arichinsTT, Dec 9, 2025)
- 7fc5102: fixing cbs to be tile sized based (arichinsTT, Dec 10, 2025)
- 5cc4ea2: removing larg params util (arichinsTT, Dec 10, 2025)
- 6fca739: cleanup (arichinsTT, Dec 10, 2025)
- 287cc50: simpler slicing and compute store (arichinsTT, Dec 11, 2025)
- 0f5c27f: nits n dead code (arichinsTT, Dec 12, 2025)
- 07fc451: simplifying work grid split (arichinsTT, Dec 12, 2025)
- e3e8d19: pre-commit oops (arichinsTT, Dec 12, 2025)
- 2def98f: removing future files related to multicore reuse bcast matmul (arichinsTT, Dec 12, 2025)
- dd400e7: moving ulp assert and new split work to cores that is grid sepcific t… (arichinsTT, Dec 16, 2025)
- b3dd0f6: nit fix (arichinsTT, Dec 16, 2025)
- c2faf68: ghost change and acc store (arichinsTT, Dec 17, 2025)
- 1448820: removing new (arichinsTT, Dec 18, 2025)
- ba119e8: nits, and envs where I can use them (arichinsTT, Dec 23, 2025)
- 090ef52: Merge branch 'main' into arichins/singleCoreMatmul (arichinsTT, Dec 23, 2025)
- fd26167: Merge branch 'main' into arichins/singleCoreMatmul (arichinsTT, Dec 23, 2025)
- 5f1400b: unified under tt-lang and tt-mlir env, and nits (arichinsTT, Dec 29, 2025)
- 00cb9c9: Merge branch 'main' into arichins/singleCoreMatmul (arichinsTT, Dec 29, 2025)
2 changes: 1 addition & 1 deletion examples/custom_dm_matmul.py
@@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0
from ttlang.ttl_api import *
- from utils import assert_allclose
+ from ttlang.utils.correctness import assert_allclose
import torch


11 changes: 11 additions & 0 deletions examples/metal_examples/README.MD
@@ -0,0 +1,11 @@
# To Run Metal Examples
Examples are meant to be run on a machine with a single n150 card. In a multi-device setting, set the environment variable TT_VISIBLE_DEVICES to the PCIe card you want to use.
A manual build of tt-mlir is needed to source the ttnn module: in the tt-mlir repository, run `source env/activate`.
Then go to tt-lang, run `source build/env/activate`, and run the desired Metal kernels, such as the single-core matmul kernel:
```bash
pytest ./examples/metal_examples/singlecore_matmul/metal/singlecore_matmul.py
```

# TT-Lang Examples
Any tt-lang example in this folder is written to spec, but is currently not guaranteed to compile or execute.
3 changes: 3 additions & 0 deletions examples/metal_examples/__init__.py
@@ -0,0 +1,3 @@
# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
#
# SPDX-License-Identifier: Apache-2.0
New file (67 additions): single-core matmul compute kernel
@@ -0,0 +1,67 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "compute_kernel_api/matmul.h"
#include "compute_kernel_api/tile_move_copy.h"
#include "hostdevcommon/kernel_structs.h"
#include <cstdint>

using std::uint32_t;

namespace NAMESPACE {
void MAIN {
uint32_t num_output_tiles =
get_arg_val<uint32_t>(0); // number of output tiles to produce
uint32_t Kt = get_arg_val<uint32_t>(
1); // number of tiles in K dimension for dot product

constexpr tt::CBIndex cb_in0 = tt::CBIndex::c_0;
constexpr tt::CBIndex cb_in1 = tt::CBIndex::c_1;
constexpr tt::CBIndex cb_out = tt::CBIndex::c_16;

// Set up the FPU (matrix engine) for the matmul operation and specify the
// input and output circular buffers.
mm_init(cb_in0, cb_in1, cb_out);

// The simplest possible version of a blocked matmul: for each output tile,
// the reader is expected to stream in the corresponding tile row of A and
// tile column of B.
for (uint32_t i = 0; i < num_output_tiles; ++i) {
// Make sure registers can be used for the output tile. This also sets the
// registers to zero.
tile_regs_acquire();
for (uint32_t kt = 0; kt < Kt; kt++) {
// Wait for the input tiles to be available in the input circular buffers.
cb_wait_front(cb_in0, 1);
cb_wait_front(cb_in1, 1);

// Perform the matrix multiplication for the current tile.
// NOTE: This function also accumulates the result into the destination
// tile.
matmul_tiles(cb_in0, cb_in1, 0, 0, 0, false);

// Mark the input tiles as used by popping them from the front of the
// circular buffers.
cb_pop_front(cb_in0, 1);
cb_pop_front(cb_in1, 1);
}

// Commit the registers and wait until they are populated with the results
// from the FPU.
tile_regs_commit();
tile_regs_wait();

// Ensure the output circular buffer has space for the result tile.
cb_reserve_back(cb_out, 1);
// Pack the result tile into the output circular buffer.
pack_tile(0, cb_out);
// Mark the output tile as ready so the writer can read it.
cb_push_back(cb_out, 1);

// We don't need the registers anymore, so we can release them and prepare
// for the next output tile.
tile_regs_release();
}
}
} // namespace NAMESPACE
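
To make the compute kernel's accumulate-then-pack structure concrete, here is a small standalone sketch (plain host C++, not device code; `num_output_tiles`, `Kt`, and the tile values are made up for illustration). Each tile is modeled as a single float, the two circular buffers are modeled as queues filled in the order the reader pushes tiles, and one accumulator plays the role of the destination register.

```cpp
// Standalone sketch (plain C++, not device code): each "tile" is modeled as a
// single float so the accumulate-then-pack structure of the compute kernel is
// easy to follow. num_output_tiles and Kt are hypothetical example values.
#include <cstdint>
#include <cstdio>
#include <queue>
#include <vector>

int main() {
  const uint32_t num_output_tiles = 8; // e.g. Mt = 2, Nt = 4
  const uint32_t Kt = 3;

  // Model cb_in0 / cb_in1: the reader pushes one A tile and one B tile per
  // inner iteration, in exactly the order the compute kernel consumes them.
  std::queue<float> cb_in0, cb_in1;
  for (uint32_t i = 0; i < num_output_tiles * Kt; ++i) {
    cb_in0.push(1.0f); // stand-in for an A tile
    cb_in1.push(2.0f); // stand-in for a B tile
  }

  // The compute loop: acquire a zeroed accumulator, consume Kt tile pairs,
  // and "pack" the result once per output tile.
  std::vector<float> cb_out;
  for (uint32_t i = 0; i < num_output_tiles; ++i) {
    float dst = 0.0f;                          // tile_regs_acquire()
    for (uint32_t kt = 0; kt < Kt; ++kt) {
      dst += cb_in0.front() * cb_in1.front();  // matmul_tiles() accumulates into dst
      cb_in0.pop();                            // cb_pop_front(cb_in0, 1)
      cb_in1.pop();                            // cb_pop_front(cb_in1, 1)
    }
    cb_out.push_back(dst);                     // pack_tile(0, cb_out) + cb_push_back
  }

  std::printf("output tile 0 = %f (expected %f)\n", cb_out[0], 1.0f * 2.0f * Kt);
  return 0;
}
```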
New file (71 additions): reader data-movement kernel
@@ -0,0 +1,71 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#include "dataflow_api.h"
#include <cstdint>

#include "debug/dprint.h"

void kernel_main() {
// same arg indices as in reader_binary_diff_lengths for compat
uint32_t src0_addr = get_arg_val<uint32_t>(0);
uint32_t src1_addr = get_arg_val<uint32_t>(1);
uint32_t Mt = get_arg_val<uint32_t>(2);
uint32_t Kt = get_arg_val<uint32_t>(3);
uint32_t Nt = get_arg_val<uint32_t>(4);
uint32_t output_tile_start_id =
get_arg_val<uint32_t>(5); // starting tile ID for output tiles
uint32_t num_output_tiles =
get_arg_val<uint32_t>(6); // number of output tiles to read

constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0;
constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1;

// Tile sizes of the source matrices. The host code configures the circular
// buffers and the DRAM buffers with the same data format and page size, so
// the CB tile size can be reused as the DRAM page size for the accessors
// below.
const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0);
const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1);

constexpr auto a_args = TensorAccessorArgs<0>();
const auto a = TensorAccessor(a_args, src0_addr, in0_tile_bytes);

constexpr auto b_args =
TensorAccessorArgs<a_args.next_compile_time_args_offset()>();
const auto b = TensorAccessor(b_args, src1_addr, in1_tile_bytes);

// Simple 2D matmul: A[Mt, Kt] @ B[Kt, Nt] = C[Mt, Nt]
for (uint32_t output_tile = 0; output_tile < num_output_tiles;
output_tile++) {
uint32_t current_tile_id = output_tile_start_id + output_tile;

// Convert linear output tile ID to 2D coordinates
uint32_t out_row = current_tile_id / Nt; // Which row in output
uint32_t out_col = current_tile_id % Nt; // Which col in output

// Read all K tiles for this output position
for (uint32_t k = 0; k < Kt; k++) {
// Read A's tile at (out_row, k)
uint32_t tile_A = out_row * Kt + k; // A is MK, so we stride by Kt
{
cb_reserve_back(cb_id_in0, 1);
uint32_t l1_write_addr_in0 = get_write_ptr(cb_id_in0);
noc_async_read_tile(tile_A, a, l1_write_addr_in0);
noc_async_read_barrier();
cb_push_back(cb_id_in0, 1);
}

// Read B's tile at (k, out_col)
uint32_t tile_B = k * Nt + out_col; // B is KN, so we stride by Nt
{
cb_reserve_back(cb_id_in1, 1);
uint32_t l1_write_addr_in1 = get_write_ptr(cb_id_in1);
noc_async_read_tile(tile_B, b, l1_write_addr_in1);
noc_async_read_barrier();
cb_push_back(cb_id_in1, 1);
}
}
}
}
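
As a quick check of the index arithmetic in the reader, the standalone snippet below (plain C++, hypothetical Mt/Kt/Nt values, not part of the kernel) prints which A and B tiles would be fetched for each output tile, using the same out_row/out_col and stride formulas as above.

```cpp
// Standalone sketch of the reader's tile-index math. Mt, Kt, Nt and the
// single-core work split (start id 0, all output tiles) are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t Mt = 2, Kt = 3, Nt = 4;
  const uint32_t output_tile_start_id = 0;
  const uint32_t num_output_tiles = Mt * Nt;

  for (uint32_t output_tile = 0; output_tile < num_output_tiles; ++output_tile) {
    uint32_t current_tile_id = output_tile_start_id + output_tile;
    uint32_t out_row = current_tile_id / Nt;   // row of C in tile units
    uint32_t out_col = current_tile_id % Nt;   // column of C in tile units
    for (uint32_t k = 0; k < Kt; ++k) {
      uint32_t tile_A = out_row * Kt + k;      // A is Mt x Kt, row-major in tiles
      uint32_t tile_B = k * Nt + out_col;      // B is Kt x Nt, row-major in tiles
      std::printf("C tile %u: read A tile %u, B tile %u\n",
                  current_tile_id, tile_A, tile_B);
    }
  }
  return 0;
}
```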
New file (44 additions): writer data-movement kernel
@@ -0,0 +1,44 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#include "dataflow_api.h"

void kernel_main() {
// Runtime arguments to write data back into the output buffer.
uint32_t dst_addr = get_arg_val<uint32_t>(0);
uint32_t num_tiles =
get_arg_val<uint32_t>(1); // number of output tiles to write
uint32_t start_id =
get_arg_val<uint32_t>(2); // starting tile ID for output tiles

constexpr uint32_t cb_id_out = tt::CBIndex::c_16;

// Create the accessor for the output buffer. Because the host code uses the
// same configuration parameters (e.g. the same data type and page size) for
// the DRAM buffer and the circular buffer, we can read the tile size from the
// circular buffer and use it for the DRAM buffer as well.
constexpr uint32_t onetile = 1; // single-tile ublocks
const uint32_t tile_bytes = get_tile_size(cb_id_out);

constexpr auto c_args = TensorAccessorArgs<0>();
const auto c = TensorAccessor(c_args, dst_addr, tile_bytes);

// Loop through the tile indices and write each tile to DRAM in order.
uint32_t end_id = start_id + num_tiles;
for (uint32_t i = start_id; i < end_id; ++i) {
// Wait for the kernel to produce an output tile
cb_wait_front(cb_id_out, onetile);
// Write the output tile to DRAM.
uint32_t l1_read_addr = get_read_ptr(cb_id_out);
noc_async_write_tile(i, c, l1_read_addr);
noc_async_write_barrier(); // This waits until the write is done. As an
// alternative, noc_async_write_flushed() can be faster because it only waits
// until the write request has been sent. In that case, call
// noc_async_write_barrier() at least once at the end of the data-movement
// kernel to make sure all writes are done.
cb_pop_front(cb_id_out, onetile);
}
}
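
The comment above describes a lighter-weight variant: flush after each tile and place a single barrier at the end of the kernel. A sketch of that variant is below (illustrative and untested; it reuses the runtime arguments, circular buffer, and TensorAccessor setup of the writer above, and in some tt-metal versions the flush call is spelled noc_async_writes_flushed()).

```cpp
// Variant of the write loop from the comment above: per-tile flush, one
// barrier at the end. Illustrative sketch only.
#include "dataflow_api.h"

void kernel_main() {
  uint32_t dst_addr = get_arg_val<uint32_t>(0);
  uint32_t num_tiles = get_arg_val<uint32_t>(1);
  uint32_t start_id = get_arg_val<uint32_t>(2);

  constexpr uint32_t cb_id_out = tt::CBIndex::c_16;
  const uint32_t tile_bytes = get_tile_size(cb_id_out);

  constexpr auto c_args = TensorAccessorArgs<0>();
  const auto c = TensorAccessor(c_args, dst_addr, tile_bytes);

  for (uint32_t i = start_id; i < start_id + num_tiles; ++i) {
    cb_wait_front(cb_id_out, 1);
    noc_async_write_tile(i, c, get_read_ptr(cb_id_out));
    // Wait only until the write request has been sent before recycling the
    // CB slot (spelled noc_async_writes_flushed() in some tt-metal versions).
    noc_async_write_flushed();
    cb_pop_front(cb_id_out, 1);
  }
  // One barrier at the end ensures all outstanding writes have completed
  // before the kernel exits.
  noc_async_write_barrier();
}
```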