Merged
Commits (33)
- a434030: initial singlecore matmul in metal and tt-lang (arichinsTT, Dec 2, 2025)
- 7f8c435: blk size for cb shape (arichinsTT, Dec 3, 2025)
- 58ef33c: readme (arichinsTT, Dec 3, 2025)
- 5877772: tensor accessor and fixing writer cb usgae (arichinsTT, Dec 3, 2025)
- e7fd315: remvoing tensor accessor and setting up imports and pre-commit (arichinsTT, Dec 4, 2025)
- 13e706f: precommit (arichinsTT, Dec 4, 2025)
- e1e5c34: real precommit (arichinsTT, Dec 4, 2025)
- bb36ede: cleanup (arichinsTT, Dec 4, 2025)
- 5175deb: test name cleanup (arichinsTT, Dec 4, 2025)
- aa01663: fixing wait handles on copys and some nits (arichinsTT, Dec 4, 2025)
- d168a3a: nits and fixes (arichinsTT, Dec 4, 2025)
- c8c46f7: tt-lang program grid, nit (arichinsTT, Dec 4, 2025)
- 77998a4: tilized slicing (arichinsTT, Dec 8, 2025)
- 030ff8b: multicore and directory reconfiguring (arichinsTT, Dec 9, 2025)
- 7cd5454: pre-commit and cleanup (arichinsTT, Dec 9, 2025)
- 272df23: missed files (arichinsTT, Dec 9, 2025)
- 7fc5102: fixing cbs to be tile sized based (arichinsTT, Dec 10, 2025)
- 5cc4ea2: removing larg params util (arichinsTT, Dec 10, 2025)
- 6fca739: cleanup (arichinsTT, Dec 10, 2025)
- 287cc50: simpler slicing and compute store (arichinsTT, Dec 11, 2025)
- 0f5c27f: nits n dead code (arichinsTT, Dec 12, 2025)
- 07fc451: simplifying work grid split (arichinsTT, Dec 12, 2025)
- e3e8d19: pre-commit oops (arichinsTT, Dec 12, 2025)
- 2def98f: removing future files related to multicore reuse bcast matmul (arichinsTT, Dec 12, 2025)
- dd400e7: moving ulp assert and new split work to cores that is grid sepcific t… (arichinsTT, Dec 16, 2025)
- b3dd0f6: nit fix (arichinsTT, Dec 16, 2025)
- c2faf68: ghost change and acc store (arichinsTT, Dec 17, 2025)
- 1448820: removing new (arichinsTT, Dec 18, 2025)
- ba119e8: nits, and envs where I can use them (arichinsTT, Dec 23, 2025)
- 090ef52: Merge branch 'main' into arichins/singleCoreMatmul (arichinsTT, Dec 23, 2025)
- fd26167: Merge branch 'main' into arichins/singleCoreMatmul (arichinsTT, Dec 23, 2025)
- 5f1400b: unified under tt-lang and tt-mlir env, and nits (arichinsTT, Dec 29, 2025)
- 00cb9c9: Merge branch 'main' into arichins/singleCoreMatmul (arichinsTT, Dec 29, 2025)
2 changes: 1 addition & 1 deletion examples/custom_dm_matmul.py
@@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0
from ttlang.ttl_api import *
- from utils import assert_allclose
+ from ttlang.utils.correctness import assert_allclose
import torch


11 changes: 11 additions & 0 deletions examples/metal_examples/README.MD
@@ -0,0 +1,11 @@
# To Run Metal Examples
Examples are meant to be run on a machine with a single n150 card. In a multi-device setting, set the environment variable TT_VISIBLE_DEVICES to the PCIe card you want to use.
A manual build of tt-mlir is needed to source the ttnn module: in the tt-mlir repository, run `source env/activate`.
Then go to tt-lang, run `source build/env/activate`, and run the desired Metal kernels, such as the single-core matmul kernel:
```bash
pytest ./examples/metal_examples/singlecore_matmul/metal/singlecore_matmul.py
```

# TT-Lang Examples
Any tt-lang example in this folder is written to spec, but is currently not guaranteed to compile or execute.
3 changes: 3 additions & 0 deletions examples/metal_examples/__init__.py
@@ -0,0 +1,3 @@
# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
#
# SPDX-License-Identifier: Apache-2.0
New file (67 additions): single-core matmul compute kernel
@@ -0,0 +1,67 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "compute_kernel_api/matmul.h"
#include "compute_kernel_api/tile_move_copy.h"
#include "hostdevcommon/kernel_structs.h"
#include <cstdint>

using std::uint32_t;

namespace NAMESPACE {
void MAIN {
uint32_t num_output_tiles =
get_arg_val<uint32_t>(0); // number of output tiles to produce
uint32_t Kt = get_arg_val<uint32_t>(
1); // number of tiles in K dimension for dot product

constexpr tt::CBIndex cb_in0 = tt::CBIndex::c_0;
constexpr tt::CBIndex cb_in1 = tt::CBIndex::c_1;
constexpr tt::CBIndex cb_out = tt::CBIndex::c_16;

// Set up the FPU (matrix engine) for the matmul operation and specify the
// input and output circular buffers.
mm_init(cb_in0, cb_in1, cb_out);

// The simplest possible version of a blocked matmul: for each output tile,
// the reader is expected to stream in the corresponding tile row of A and
// tile column of B.
for (uint32_t i = 0; i < num_output_tiles; ++i) {
// Make sure registers can be used for the output tile. This also sets the
// registers to zero.
tile_regs_acquire();
for (uint32_t kt = 0; kt < Kt; kt++) {
// Wait for the input tiles to be available in the input circular buffers.
cb_wait_front(cb_in0, 1);
cb_wait_front(cb_in1, 1);

// Perform the matrix multiplication for the current tile.
// NOTE: This function also accumulates the result into the destination
// tile.
matmul_tiles(cb_in0, cb_in1, 0, 0, 0, false);

// Mark the input tiles as used by popping them from the front of the
// circular buffers.
cb_pop_front(cb_in0, 1);
cb_pop_front(cb_in1, 1);
}

// Commit the registers and wait until they are populated with the results
// from the FPU.
tile_regs_commit();
tile_regs_wait();

// Ensure the output circular buffer has space for the result tile.
cb_reserve_back(cb_out, 1);
// Pack the result tile into the output circular buffer.
pack_tile(0, cb_out);
// Mark the output tile as ready so the writer can read it.
cb_push_back(cb_out, 1);

// We don't need the registers anymore, so we can release them and prepare
// for the next output tile.
tile_regs_release();
}
}
} // namespace NAMESPACE
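
To make the compute kernel's accumulate-then-pack structure concrete, here is a small standalone sketch (plain host C++, not device code; `num_output_tiles`, `Kt`, and the tile values are made up for illustration). Each tile is modeled as a single float, the two circular buffers are modeled as queues filled in the order the reader pushes tiles, and one accumulator plays the role of the destination register.

```cpp
// Standalone sketch (plain C++, not device code): each "tile" is modeled as a
// single float so the accumulate-then-pack structure of the compute kernel is
// easy to follow. num_output_tiles and Kt are hypothetical example values.
#include <cstdint>
#include <cstdio>
#include <queue>
#include <vector>

int main() {
  const uint32_t num_output_tiles = 8; // e.g. Mt = 2, Nt = 4
  const uint32_t Kt = 3;

  // Model cb_in0 / cb_in1: the reader pushes one A tile and one B tile per
  // inner iteration, in exactly the order the compute kernel consumes them.
  std::queue<float> cb_in0, cb_in1;
  for (uint32_t i = 0; i < num_output_tiles * Kt; ++i) {
    cb_in0.push(1.0f); // stand-in for an A tile
    cb_in1.push(2.0f); // stand-in for a B tile
  }

  // The compute loop: acquire a zeroed accumulator, consume Kt tile pairs,
  // and "pack" the result once per output tile.
  std::vector<float> cb_out;
  for (uint32_t i = 0; i < num_output_tiles; ++i) {
    float dst = 0.0f;                          // tile_regs_acquire()
    for (uint32_t kt = 0; kt < Kt; ++kt) {
      dst += cb_in0.front() * cb_in1.front();  // matmul_tiles() accumulates into dst
      cb_in0.pop();                            // cb_pop_front(cb_in0, 1)
      cb_in1.pop();                            // cb_pop_front(cb_in1, 1)
    }
    cb_out.push_back(dst);                     // pack_tile(0, cb_out) + cb_push_back
  }

  std::printf("output tile 0 = %f (expected %f)\n", cb_out[0], 1.0f * 2.0f * Kt);
  return 0;
}
```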
New file (71 additions): reader data-movement kernel
@@ -0,0 +1,71 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#include "dataflow_api.h"
#include <cstdint>

#include "debug/dprint.h"

void kernel_main() {
// same arg indices as in reader_binary_diff_lengths for compat
uint32_t src0_addr = get_arg_val<uint32_t>(0);
uint32_t src1_addr = get_arg_val<uint32_t>(1);
uint32_t Mt = get_arg_val<uint32_t>(2);
uint32_t Kt = get_arg_val<uint32_t>(3);
uint32_t Nt = get_arg_val<uint32_t>(4);
uint32_t output_tile_start_id =
get_arg_val<uint32_t>(5); // starting tile ID for output tiles
uint32_t num_output_tiles =
get_arg_val<uint32_t>(6); // number of output tiles to read

constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0;
constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1;

// Tile sizes of the source matrices. The host code configures the circular
// buffers and the DRAM buffers with the same data format and page size, so
// the CB tile size can be reused as the DRAM page size for the accessors
// below.
const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0);
const uint32_t in1_tile_bytes = get_tile_size(cb_id_in1);

constexpr auto a_args = TensorAccessorArgs<0>();
const auto a = TensorAccessor(a_args, src0_addr, in0_tile_bytes);

constexpr auto b_args =
TensorAccessorArgs<a_args.next_compile_time_args_offset()>();
const auto b = TensorAccessor(b_args, src1_addr, in1_tile_bytes);

// Simple 2D matmul: A[Mt, Kt] @ B[Kt, Nt] = C[Mt, Nt]
for (uint32_t output_tile = 0; output_tile < num_output_tiles;
output_tile++) {
uint32_t current_tile_id = output_tile_start_id + output_tile;

// Convert linear output tile ID to 2D coordinates
uint32_t out_row = current_tile_id / Nt; // Which row in output
uint32_t out_col = current_tile_id % Nt; // Which col in output

// Read all K tiles for this output position
for (uint32_t k = 0; k < Kt; k++) {
// Read A's tile at (out_row, k)
uint32_t tile_A = out_row * Kt + k; // A is MK, so we stride by Kt
{
cb_reserve_back(cb_id_in0, 1);
uint32_t l1_write_addr_in0 = get_write_ptr(cb_id_in0);
noc_async_read_tile(tile_A, a, l1_write_addr_in0);
noc_async_read_barrier();
cb_push_back(cb_id_in0, 1);
}

// Read B's tile at (k, out_col)
uint32_t tile_B = k * Nt + out_col; // B is KN, so we stride by Nt
{
cb_reserve_back(cb_id_in1, 1);
uint32_t l1_write_addr_in1 = get_write_ptr(cb_id_in1);
noc_async_read_tile(tile_B, b, l1_write_addr_in1);
noc_async_read_barrier();
cb_push_back(cb_id_in1, 1);
}
}
}
}
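
As a quick check of the index arithmetic in the reader, the standalone snippet below (plain C++, hypothetical Mt/Kt/Nt values, not part of the kernel) prints which A and B tiles would be fetched for each output tile, using the same out_row/out_col and stride formulas as above.

```cpp
// Standalone sketch of the reader's tile-index math. Mt, Kt, Nt and the
// single-core work split (start id 0, all output tiles) are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t Mt = 2, Kt = 3, Nt = 4;
  const uint32_t output_tile_start_id = 0;
  const uint32_t num_output_tiles = Mt * Nt;

  for (uint32_t output_tile = 0; output_tile < num_output_tiles; ++output_tile) {
    uint32_t current_tile_id = output_tile_start_id + output_tile;
    uint32_t out_row = current_tile_id / Nt;   // row of C in tile units
    uint32_t out_col = current_tile_id % Nt;   // column of C in tile units
    for (uint32_t k = 0; k < Kt; ++k) {
      uint32_t tile_A = out_row * Kt + k;      // A is Mt x Kt, row-major in tiles
      uint32_t tile_B = k * Nt + out_col;      // B is Kt x Nt, row-major in tiles
      std::printf("C tile %u: read A tile %u, B tile %u\n",
                  current_tile_id, tile_A, tile_B);
    }
  }
  return 0;
}
```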
New file (44 additions): writer data-movement kernel
@@ -0,0 +1,44 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#include "dataflow_api.h"

void kernel_main() {
// Runtime arguments to write data back into the output buffer.
uint32_t dst_addr = get_arg_val<uint32_t>(0);
uint32_t num_tiles =
get_arg_val<uint32_t>(1); // number of output tiles to write
uint32_t start_id =
get_arg_val<uint32_t>(2); // starting tile ID for output tiles

constexpr uint32_t cb_id_out = tt::CBIndex::c_16;

// Create the accessor for the output buffer. Because the host code uses the
// same configuration parameters (e.g. the same data type and page size) for
// the DRAM buffer and the circular buffer, we can read the tile size from the
// circular buffer and use it for the DRAM buffer as well.
constexpr uint32_t onetile = 1; // single-tile ublocks
const uint32_t tile_bytes = get_tile_size(cb_id_out);

constexpr auto c_args = TensorAccessorArgs<0>();
const auto c = TensorAccessor(c_args, dst_addr, tile_bytes);

// Loop through the tile indices and write each tile to DRAM in order.
uint32_t end_id = start_id + num_tiles;
for (uint32_t i = start_id; i < end_id; ++i) {
// Wait for the kernel to produce an output tile
cb_wait_front(cb_id_out, onetile);
// Write the output tile to DRAM.
uint32_t l1_read_addr = get_read_ptr(cb_id_out);
noc_async_write_tile(i, c, l1_read_addr);
noc_async_write_barrier(); // This waits until the write is done. As an
// alternative, noc_async_write_flushed() can be faster because it only waits
// until the write request has been sent. In that case, call
// noc_async_write_barrier() at least once at the end of the data-movement
// kernel to make sure all writes are done.
cb_pop_front(cb_id_out, onetile);
}
}
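
The comment above describes a lighter-weight variant: flush after each tile and place a single barrier at the end of the kernel. A sketch of that variant is below (illustrative and untested; it reuses the runtime arguments, circular buffer, and TensorAccessor setup of the writer above, and in some tt-metal versions the flush call is spelled noc_async_writes_flushed()).

```cpp
// Variant of the write loop from the comment above: per-tile flush, one
// barrier at the end. Illustrative sketch only.
#include "dataflow_api.h"

void kernel_main() {
  uint32_t dst_addr = get_arg_val<uint32_t>(0);
  uint32_t num_tiles = get_arg_val<uint32_t>(1);
  uint32_t start_id = get_arg_val<uint32_t>(2);

  constexpr uint32_t cb_id_out = tt::CBIndex::c_16;
  const uint32_t tile_bytes = get_tile_size(cb_id_out);

  constexpr auto c_args = TensorAccessorArgs<0>();
  const auto c = TensorAccessor(c_args, dst_addr, tile_bytes);

  for (uint32_t i = start_id; i < start_id + num_tiles; ++i) {
    cb_wait_front(cb_id_out, 1);
    noc_async_write_tile(i, c, get_read_ptr(cb_id_out));
    // Wait only until the write request has been sent before recycling the
    // CB slot (spelled noc_async_writes_flushed() in some tt-metal versions).
    noc_async_write_flushed();
    cb_pop_front(cb_id_out, 1);
  }
  // One barrier at the end ensures all outstanding writes have completed
  // before the kernel exits.
  noc_async_write_barrier();
}
```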