From d98df2c63ac4ac5d7e15f1518d98f96f74f85b31 Mon Sep 17 00:00:00 2001
From: Ryan Zambrotta <ryanzam@cs.washington.edu>
Date: Fri, 23 Aug 2024 13:53:46 -0700
Subject: [PATCH 1/2] Adds Neon ISA example

This CL adds the Neon equivalent of the x86 AVX example. It changes the
Makefile to build the Neon example, and it modifies the provided main.c
so that it works with either the generated Neon or AVX code.
---
 examples/Makefile      | 20 +++++++--
 examples/arm_matmul.py | 97 ++++++++++++++++++++++++++++++++++++++++++
 examples/main.c        |  8 +++-
 3 files changed, 120 insertions(+), 5 deletions(-)
 create mode 100644 examples/arm_matmul.py

diff --git a/examples/Makefile b/examples/Makefile
index 28acb78fb..832e5a997 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -1,13 +1,25 @@
 CFLAGS ?= -march=native
 
-avx2_matmul: avx2_matmul.o main.o
+.PHONY: x86
+x86: avx2_matmul
 
-avx2_matmul.c: x86_matmul.py
+# x86 build
+avx2_matmul: avx2_matmul.o main.o
+avx2_matmul.h avx2_matmul.c: x86_matmul.py
 	exocc -o . --stem $(*F) $^
 
-main.c: avx2_matmul.c
+.PHONY: neon
+neon: neon_matmul
+
+# ARM (Neon) build
+neon_matmul: neon_matmul.o main.o
+neon_matmul.h neon_matmul.c: arm_matmul.py
+	exocc -o . --stem $(*F) $^
 
 .PHONY: clean
 clean:
-	$(RM) avx2_matmul avx2_matmul.* *.o exo_demo
+	$(RM) *.o exo_demo
 	$(RM) -r __pycache__/
+	$(RM) avx2_matmul avx2_matmul.* 
+	$(RM) neon_matmul neon_matmul.*
+	
diff --git a/examples/arm_matmul.py b/examples/arm_matmul.py
new file mode 100644
index 000000000..78bf588c1
--- /dev/null
+++ b/examples/arm_matmul.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+import os
+import sys
+
+from exo import proc
+from exo.platforms.neon import *
+from exo.stdlib.scheduling import *
+
+# Hide output when running through exocc.
+if __name__ != "__main__" and hasattr(os, "devnull"):
+    sys.stdout = open(os.devnull, "w")
+
+
+# Algorithm definition
+@proc
+def rank_k_reduce_6x16(
+    K: size, A: f32[6, K] @ DRAM, B: f32[K, 16] @ DRAM, C: f32[6, 16] @ DRAM
+):
+    for i in seq(0, 6):
+        for j in seq(0, 16):
+            for k in seq(0, K):
+                C[i, j] += A[i, k] * B[k, j]
+
+print("=============Original Matmul==============")
+print(rank_k_reduce_6x16)
+
+# neon only supports vectors of width 4 for f32
+# x86 supports either 4 or 8 wide
+# vec_reg_width = 8
+vec_reg_width = 4
+
+# print("=============Original algorithm==============")
+# print(rank_k_reduce_6x16)
+
+# The first step is thinking about the output memory.
+# In this example, we want the computation to be "output stationary", which
+# means we want to preallocate all the output registers at the start.
+neon = rename(rank_k_reduce_6x16, "rank_k_reduce_6x16_scheduled")
+print(neon)
+neon = reorder_loops(neon, "j k")
+neon = reorder_loops(neon, "i k")
+
+# Staging the 6x16 C tile in 4-wide vectors consumes 6 * 16 / 4 = 24 registers
+neon = divide_loop(neon, "for j in _: _", vec_reg_width, ["jo", "ji"], perfect=True)
+neon = stage_mem(neon, "for k in _:_", "C[0:6, 0:16]", "C_reg")
+neon = simplify(neon)
+
+# Reshape C_reg so we can map it into vector registers
+neon = divide_dim(neon, "C_reg:_", 1, vec_reg_width)
+neon = repeat(divide_loop)(neon, "for i1 in _: _", vec_reg_width, ["i2", "i3"], perfect=True)
+neon = simplify(neon)
+
+# Map C_reg operations to vector instructions
+neon = set_memory(neon, "C_reg:_", Neon)
+print(neon)
+# mm256_loadu_ps loads 8 floats per register, but Neon vectors hold only 4 f32
+# neon = replace_all(neon, mm256_loadu_ps)
+neon = replace_all(neon, neon_vld_4xf32)
+# neon = replace_all(neon, mm256_storeu_ps)
+neon = replace_all(neon, neon_vst_4xf32)
+neon = simplify(neon)
+
+# Now, the rest of the compute needs to work with the constraint that
+# only the registers left over after staging C are available here.
+
+# B is easy: one row of 16 floats is just four 4-wide vector loads
+neon = stage_mem(neon, "for i in _:_", "B[k, 0:16]", "B_reg")
+neon = simplify(neon)
+neon = divide_loop(neon, "for i0 in _: _ #1", vec_reg_width, ["io", "ii"], perfect=True)
+neon = divide_dim(neon, "B_reg:_", 0, vec_reg_width)
+neon = set_memory(neon, "B_reg:_", Neon)
+neon = simplify(neon)
+# neon = replace_all(neon, mm256_loadu_ps)
+neon = replace_all(neon, neon_vld_4xf32)
+neon = simplify(neon)
+
+# Now we've used up four more vector registers (B_reg).
+# The final part is staging A
+# avx = stage_mem(avx, 'for jo in _:_', 'A[i, k]', 'A_reg')
+neon = bind_expr(neon, "A[i, k]", "A_reg")
+neon = expand_dim(neon, "A_reg", vec_reg_width, "ji")
+neon = lift_alloc(neon, "A_reg", n_lifts=2)
+neon = fission(neon, neon.find("A_reg[ji] = _").after(), n_lifts=2)
+neon = remove_loop(neon, "for jo in _: _")
+neon = set_memory(neon, "A_reg:_", Neon)
+# neon = replace_all(neon, mm256_broadcast_ss)
+neon = replace_all(neon, neon_broadcast_4xf32)
+
+# DO THE COMPUTE!!!
+# neon = replace_all(neon, mm256_fmadd_ps)
+neon = replace_all(neon, neon_vfmadd_4xf32_4xf32)
+neon = simplify(neon)
+
+print("============= Rewritten ==============")
+print(neon)
+
diff --git a/examples/main.c b/examples/main.c
index 1d2606ac6..e0e98dc68 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -1,7 +1,11 @@
 #include <stdio.h>
 #include <time.h>
+#include <stdint.h>
+
+// generated from exo
+void rank_k_reduce_6x16( void *ctxt, int_fast32_t K, const float* A, const float* B, float* C );
+void rank_k_reduce_6x16_scheduled( void *ctxt, int_fast32_t K, const float* A, const float* B, float* C );
 
-#include "avx2_matmul.h"
 
 #define K 2048
 static float A[6 * K];
@@ -31,6 +35,8 @@ int main() {
   clock_t start, end;
   int msec;
 
+  initialize();
+
   // Calling original matmul
   start = clock();
   for (int i = 0; i < 1000; i++)

From 75deca45b476777a9b2047068c11a28319537374 Mon Sep 17 00:00:00 2001
From: Ryan Zambrotta <ryanzam@cs.washington.edu>
Date: Fri, 23 Aug 2024 20:58:03 +0000
Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=A4=96=20apply=20linter=20changes=20(?=
 =?UTF-8?q?will=20not=20trigger=20CI)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/arm_matmul.py | 6 ++++--
 examples/main.c        | 9 +++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/examples/arm_matmul.py b/examples/arm_matmul.py
index 78bf588c1..72a5b7bfa 100644
--- a/examples/arm_matmul.py
+++ b/examples/arm_matmul.py
@@ -22,6 +22,7 @@ def rank_k_reduce_6x16(
             for k in seq(0, K):
                 C[i, j] += A[i, k] * B[k, j]
 
+
 print("=============Original Matmul==============")
 print(rank_k_reduce_6x16)
 
@@ -48,7 +49,9 @@ def rank_k_reduce_6x16(
 
 # Reshape C_reg so we can map it into vector registers
 neon = divide_dim(neon, "C_reg:_", 1, vec_reg_width)
-neon = repeat(divide_loop)(neon, "for i1 in _: _", vec_reg_width, ["i2", "i3"], perfect=True)
+neon = repeat(divide_loop)(
+    neon, "for i1 in _: _", vec_reg_width, ["i2", "i3"], perfect=True
+)
 neon = simplify(neon)
 
 # Map C_reg operations to vector instructions
@@ -94,4 +97,3 @@ def rank_k_reduce_6x16(
 
 print("============= Rewritten ==============")
 print(neon)
-
diff --git a/examples/main.c b/examples/main.c
index e0e98dc68..dc39c947e 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -1,11 +1,12 @@
+#include <stdint.h>
 #include <stdio.h>
 #include <time.h>
-#include <stdint.h>
 
 // generated from exo
-void rank_k_reduce_6x16( void *ctxt, int_fast32_t K, const float* A, const float* B, float* C );
-void rank_k_reduce_6x16_scheduled( void *ctxt, int_fast32_t K, const float* A, const float* B, float* C );
-
+void rank_k_reduce_6x16(
+    void *ctxt, int_fast32_t K, const float *A, const float *B, float *C);
+void rank_k_reduce_6x16_scheduled(
+    void *ctxt, int_fast32_t K, const float *A, const float *B, float *C);
 
 #define K 2048
 static float A[6 * K];