diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 415df11..510dd59 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -12,8 +12,8 @@ jobs:
   build:
     strategy:
       matrix:
-        go-version: [1.22.x, 1.23.x, 1.24.x]
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        go-version: [1.23.x, 1.24.x, 1.25.x]
+        os: [ubuntu-latest, macos-latest, windows-latest, ubuntu-24.04-arm]
     env:
       CGO_ENABLED: 0
     runs-on: ${{ matrix.os }}
@@ -60,7 +60,7 @@ jobs:
     - name: Set up Go
       uses: actions/setup-go@v6.1.0
       with:
-        go-version: 1.23.x
+        go-version: 1.25.x
 
     - name: Checkout code
       uses: actions/checkout@v6.0.1
@@ -82,7 +82,7 @@ jobs:
     - name: Set up Go
       uses: actions/setup-go@v6.1.0
       with:
-        go-version: 1.24.x
+        go-version: 1.25.x
 
     - name: Checkout code
       uses: actions/checkout@v6.0.1
@@ -97,10 +97,10 @@ jobs:
       run: go build github.com/minio/minlz/cmd/mz&&./mz c -verify -o=comp.mz mz&&./mz d -rm comp.mz&&rm ./mz&&rm comp
 
     - name: goreleaser deprecation
-      run: curl -sfL https://git.io/goreleaser | VERSION=v2.3.2 sh -s -- check
+      run: curl -sfL https://git.io/goreleaser | VERSION=v2.13.2 sh -s -- check
 
     - name: goreleaser snapshot
-      run: curl -sL https://git.io/goreleaser | VERSION=v2.3.2 sh -s -- --snapshot --clean
+      run: curl -sL https://git.io/goreleaser | VERSION=v2.13.2 sh -s -- --snapshot --clean
 
     - name: Test GOAMD64 v3
       env:
@@ -117,16 +117,17 @@ jobs:
   fuzz:
     env:
       CGO_ENABLED: 0
-    runs-on: ubuntu-latest
     strategy:
       matrix:
-        tags: [ 'nounsafe', '"noasm,nounsafe"' ]
+        tags: [ '', 'nounsafe', '"noasm,nounsafe"' ]
         tests: ['FuzzEncodingBlocks', 'FuzzDecode', 'FuzzStreamEncode', 'FuzzStreamDecode', 'FuzzLZ4Block']
+        os: [ubuntu-latest, ubuntu-24.04-arm]
+    runs-on: ${{ matrix.os }}
     steps:
       - name: Set up Go
         uses: actions/setup-go@v6.1.0
         with:
-          go-version: 1.24.x
+          go-version: 1.25.x
 
       - name: Checkout code
         uses: actions/checkout@v6.0.1
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 66f03d8..695c9cf 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -21,12 +21,12 @@ jobs:
         name: Set up Go
         uses: actions/setup-go@4dc6199c7b1a012772edbd06daecab0f50c9053c # v5.2.0
         with:
-          go-version: 1.24.x
+          go-version: 1.25.x
       -
         name: Run GoReleaser
         uses: goreleaser/goreleaser-action@e435ccd777264be153ace6237001ef4d979d3a7a # v6.4.0
         with:
-          version: 2.3.2
+          version: 2.13.2
           args: release --clean
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
index a09c56d..1e4f4e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /.idea
+/minlz_arm64.test
diff --git a/.goreleaser.yaml b/.goreleaser.yaml
index 79c6de8..d50260d 100644
--- a/.goreleaser.yaml
+++ b/.goreleaser.yaml
@@ -37,7 +37,7 @@ archives:
     name_template: "minlz-{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}"
     format_overrides:
       - goos: windows
-        format: zip
+        formats: zip
     files:
       - README.md
       - LICENSE
diff --git a/asm_arm64.go b/asm_arm64.go
new file mode 100644
index 0000000..9191998
--- /dev/null
+++ b/asm_arm64.go
@@ -0,0 +1,23 @@
+// Copyright 2026 MinIO Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build arm64 && !appengine && !noasm && gc && !purego
+
+package minlz
+
+// decodeBlockAsm decodes a non-empty src to a guaranteed-large-enough dst.
+// It assumes that the varint-encoded length of the decompressed bytes has already been read.
+//
+//go:noescape
+func decodeBlockAsm(dst []byte, src []byte) int
diff --git a/asm_arm64.s b/asm_arm64.s
new file mode 100644
index 0000000..6ab815c
--- /dev/null
+++ b/asm_arm64.s
@@ -0,0 +1,987 @@
+// Copyright 2026 MinIO Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build arm64 && !appengine && !noasm && gc && !purego
+
+#include "textflag.h"
+
+// Register allocation:
+// R0:  return value (0=success, 1=corrupt)
+// R1:  dst base (parameter, then dst current pointer)
+// R2:  dst len (parameter)
+// R3:  src base (parameter, then src current pointer)
+// R4:  src len (parameter)
+// R5:  dstEnd (dst base + dst len)
+// R6:  srcEnd (src base + src len)
+// R7:  offset (last copy offset, init to 1)
+// R8:  dstPos (current position in dst for bounds checking)
+// R9:  scratch / length
+// R10: scratch / tag value
+// R11: scratch / value (tag >> 2)
+// R12: srcLimit (srcEnd - margin)
+// R13: dstLimit (dstEnd - margin)
+// R14: copySrc pointer for copies
+// R15: scratch
+// R16: scratch
+// R17: scratch
+// R19-R28: callee-saved (we save/restore as needed)
+
+// Constants
+#define tagLiteral 0x00
+#define tagCopy1   0x01
+#define tagCopy2   0x02
+#define tagCopy3   0x03
+#define minCopy2Offset 64
+#define minCopy3Offset 65536
+
+// func decodeBlockAsm(dst []byte, src []byte) int
+TEXT ·decodeBlockAsm(SB), NOSPLIT, $0-56
+	// Load parameters
+	// dst is at 0(FP): base=0, len=8, cap=16
+	// src is at 24(FP): base=24, len=32, cap=40
+	MOVD dst_base+0(FP), R1     // dst base -> R1 (current dst ptr)
+	MOVD dst_len+8(FP), R2      // dst len
+	MOVD src_base+24(FP), R3    // src base -> R3 (current src ptr)
+	MOVD src_len+32(FP), R4     // src len
+
+	// Save dst base for final validation
+	MOVD R1, R19                // save original dst base
+
+	// Calculate end pointers
+	ADD R1, R2, R5              // dstEnd = dst + len
+	ADD R3, R4, R6              // srcEnd = src + len
+
+	// Initialize offset to 1 (for repeat operations)
+	MOVD $1, R7
+
+	// Initialize dstPos to 0
+	MOVD ZR, R8
+
+	// Check if we have enough data for fast loop margins
+	// Skip directly to slow loop if srcLen < 36 or dstLen < 32
+	// We need 36-byte src margin (32 for NEON loads + 4 for max tag header size)
+	// We need 32-byte dst margin for safe NEON over-writes
+	CMP $36, R4
+	BLO decode_remain_loop      // srcLen < 36, use slow loop
+	CMP $32, R2
+	BLO decode_remain_loop      // dstLen < 32, use slow loop
+
+	// Calculate limits with margins for fast loop
+	// srcLimit = srcEnd - 36 (ensures safe 32-byte reads after 4-byte tag consumption)
+	// dstLimit = dstEnd - 32
+	SUB $36, R6, R12            // srcLimit
+	SUB $32, R5, R13            // dstLimit
+
+	// ============================================
+	// FAST LOOP - with 36-byte src / 32-byte dst margins
+	// ============================================
+decode_fast_loop:
+	// Note: Go ARM64 CMP Rm, Rn computes Rn - Rm, so CMP Ra, Rb; BHS branches when Rb >= Ra
+	CMP R12, R3
+	BHS decode_remain_entry     // if src >= srcLimit (R3 - R12 >= 0), switch to slow loop
+	CMP R13, R1
+	BHS decode_remain_entry     // if dst >= dstLimit (R1 - R13 >= 0), switch to slow loop
+
+	// Read tag byte
+	MOVBU (R3), R10             // tag byte
+	MOVD R10, R11
+	LSR $2, R11, R11            // value = tag >> 2
+
+	// Check tag type (lower 2 bits)
+	AND $0x03, R10, R16
+	CBNZ R16, decode_fast_copy  // if tag != 0, it's a copy
+
+	// ----------------------------------------
+	// TAG 0: Literals or Repeat
+	// ----------------------------------------
+decode_fast_lits:
+	// Extract length field (bits 3-7)
+	LSR $1, R11, R9             // length field = value >> 1
+	CMP $29, R9
+	BLT decode_fast_lit_short   // 0-28: short literal
+	BEQ decode_fast_lit_1byte   // 29: 1 byte length
+	CMP $30, R9
+	BEQ decode_fast_lit_2byte   // 30: 2 byte length
+	B decode_fast_lit_3byte     // 31: 3 byte length
+
+decode_fast_lit_short:
+	// Length = field + 1, single byte tag
+	ADD $1, R9, R9
+	ADD $1, R3, R3              // consume tag byte
+	B decode_fast_lit_check_repeat
+
+decode_fast_lit_1byte:
+	// Length = 30 + next byte
+	MOVBU 1(R3), R9
+	ADD $30, R9, R9
+	ADD $2, R3, R3
+	B decode_fast_lit_check_repeat
+
+decode_fast_lit_2byte:
+	// Length = 30 + next 2 bytes (little endian)
+	MOVHU 1(R3), R9
+	ADD $30, R9, R9
+	ADD $3, R3, R3
+	B decode_fast_lit_check_repeat
+
+decode_fast_lit_3byte:
+	// Length = 30 + next 3 bytes (little endian)
+	// Load 4 bytes and mask to 24 bits
+	MOVWU (R3), R9
+	LSR $8, R9, R9              // shift out the tag byte
+	ADD $30, R9, R9
+	ADD $4, R3, R3
+
+decode_fast_lit_check_repeat:
+	// Check if this is a repeat (bit 2 of original tag)
+	TBZ $0, R11, decode_fast_lit_copy // if bit 0 of value (bit 2 of tag) is 0, do literal
+
+	// This is a REPEAT - use stored offset
+	B decode_fast_copy_exec
+
+decode_fast_lit_copy:
+	// Bounds check: dst + length <= dstEnd
+	ADD R1, R9, R16
+	CMP R5, R16
+	BHI corrupt                 // if dst + length > dstEnd, corrupt
+
+	// Bounds check: src + length <= srcEnd
+	ADD R3, R9, R16
+	CMP R6, R16
+	BHI corrupt                 // if src + length > srcEnd, corrupt
+
+	// Copy literals
+	CMP $16, R9
+	BLE decode_fast_lit_copy_16
+	CMP $32, R9
+	BLE decode_fast_lit_copy_32
+	B decode_fast_lit_copy_long
+
+decode_fast_lit_copy_16:
+	// Copy up to 16 bytes using NEON
+	VLD1 (R3), [V0.B16]
+	VST1 [V0.B16], (R1)
+	ADD R9, R3, R3
+	ADD R9, R1, R1
+	ADD R9, R8, R8
+	B decode_fast_loop
+
+decode_fast_lit_copy_32:
+	// Copy up to 32 bytes
+	VLD1 (R3), [V0.B16, V1.B16]
+	VST1 [V0.B16, V1.B16], (R1)
+	ADD R9, R3, R3
+	ADD R9, R1, R1
+	ADD R9, R8, R8
+	B decode_fast_loop
+
+decode_fast_lit_copy_long:
+	// Copy 64+ bytes or handle 33-64
+	CMP $64, R9
+	BLE decode_fast_lit_copy_64
+
+	// AMD64-style long literal copy using overlapping writes
+	// This avoids byte-by-byte remainder handling:
+	// 1. Load first 32 bytes into V0,V1
+	// 2. Load last 32 bytes into V2,V3
+	// 3. Loop through middle 32-byte chunks
+	// 4. Write first 32 and last 32 bytes (overlapping handles remainder)
+
+	// Load first 32 bytes
+	VLD1 (R3), [V0.B16, V1.B16]
+
+	// Load last 32 bytes
+	SUB $32, R9, R16              // R16 = length - 32
+	ADD R16, R3, R17              // R17 = src + length - 32
+	VLD1 (R17), [V2.B16, V3.B16]
+
+	// Calculate middle length (bytes 32 to length-32)
+	// Middle exists if length > 64 (length - 64 > 0)
+	MOVD R9, R15                  // R15 = length
+	SUB $64, R15, R15             // R15 = middle length
+
+	// Setup middle pointers
+	ADD $32, R3, R14              // R14 = src + 32 (middle src pointer)
+	ADD $32, R1, R17              // R17 = dst + 32 (middle dst pointer)
+
+	// Copy middle bytes in 32-byte chunks
+	CMP ZR, R15
+	BLE decode_fast_lit_copy_long_finish
+
+decode_fast_lit_copy_long_loop:
+	VLD1.P 32(R14), [V4.B16, V5.B16]
+	VST1.P [V4.B16, V5.B16], 32(R17)
+	SUB $32, R15, R15
+	CMP ZR, R15
+	BGT decode_fast_lit_copy_long_loop
+
+decode_fast_lit_copy_long_finish:
+	// Write first 32 bytes at dst
+	VST1 [V0.B16, V1.B16], (R1)
+
+	// Write last 32 bytes at dst + length - 32
+	SUB $32, R9, R16              // R16 = length - 32
+	ADD R16, R1, R17              // R17 = dst + length - 32
+	VST1 [V2.B16, V3.B16], (R17)
+
+	// Advance pointers by full length
+	ADD R9, R3, R3
+	ADD R9, R1, R1
+	ADD R9, R8, R8
+	B decode_fast_loop
+
+decode_fast_lit_copy_64:
+	// Copy 33-64 bytes
+	VLD1 (R3), [V0.B16, V1.B16]
+	SUB $32, R9, R16
+	ADD R16, R3, R17            // src + length - 32
+	VLD1 (R17), [V2.B16, V3.B16]
+	VST1 [V0.B16, V1.B16], (R1)
+	ADD R16, R1, R17            // dst + length - 32
+	VST1 [V2.B16, V3.B16], (R17)
+	ADD R9, R3, R3
+	ADD R9, R1, R1
+	ADD R9, R8, R8
+	B decode_fast_loop
+
+	// ----------------------------------------
+	// Copy operations
+	// ----------------------------------------
+decode_fast_copy:
+	CMP $2, R16
+	BLT decode_fast_copy1
+	BEQ decode_fast_copy2
+	B decode_fast_copy3
+
+	// ----------------------------------------
+	// TAG 1: Copy with 10-bit offset
+	// ----------------------------------------
+decode_fast_copy1:
+	// Format: [offset_lo:2 | length:4 | tag:2] [offset_hi:8]
+	// Length: 4-18 (0-14 inline) or 18+byte (15 extended)
+	// Offset: 1-1024 (stored as 0-1023)
+
+	// Read 2 bytes for tag+offset
+	MOVHU (R3), R16             // load 2 bytes
+
+	// Extract length (bits 2-5 of first byte)
+	AND $0x0F, R11, R9          // length field = (tag >> 2) & 0x0F
+
+	// Extract offset
+	LSR $6, R16, R7             // offset = word >> 6
+	ADD $1, R7, R7              // offset += 1 (min offset is 1)
+
+	// Check if extended length
+	CMP $15, R9
+	BEQ decode_fast_copy1_extended
+
+	// Short length: 4 + field
+	ADD $4, R9, R9
+	ADD $2, R3, R3              // consume 2 bytes
+	B decode_fast_copy_exec
+
+decode_fast_copy1_extended:
+	// Extended length: 18 + next byte
+	MOVBU 2(R3), R9
+	ADD $18, R9, R9
+	ADD $3, R3, R3              // consume 3 bytes
+	B decode_fast_copy_exec
+
+	// ----------------------------------------
+	// TAG 2: Copy with 16-bit offset
+	// ----------------------------------------
+decode_fast_copy2:
+	// Format: [length:6 | tag:2] [offset:16]
+	// Length: 4-64 (0-60 inline) or 64+1/2/3 bytes (61/62/63 extended)
+	// Offset: 64-65599 (stored + 64)
+
+	// value already has tag >> 2 = length field
+	MOVD R11, R9                // length field
+
+	CMP $61, R9
+	BGE decode_fast_copy2_extended
+
+	// Short length: 4 + field
+	ADD $4, R9, R9
+
+	// Read 16-bit offset
+	MOVHU 1(R3), R7
+	ADD $minCopy2Offset, R7, R7
+	ADD $3, R3, R3
+	B decode_fast_copy_exec
+
+decode_fast_copy2_extended:
+	BEQ decode_fast_copy2_ext1
+	CMP $62, R9
+	BEQ decode_fast_copy2_ext2
+	// 63: 3 byte length extension
+
+	// Read offset first
+	MOVHU 1(R3), R7
+	ADD $minCopy2Offset, R7, R7
+
+	// Read 3-byte length (load 4, shift)
+	MOVWU 2(R3), R9
+	LSR $8, R9, R9
+	ADD $64, R9, R9
+	ADD $6, R3, R3
+	B decode_fast_copy_exec
+
+decode_fast_copy2_ext2:
+	// 2 byte length extension
+	MOVHU 1(R3), R7
+	ADD $minCopy2Offset, R7, R7
+	MOVHU 3(R3), R9
+	ADD $64, R9, R9
+	ADD $5, R3, R3
+	B decode_fast_copy_exec
+
+decode_fast_copy2_ext1:
+	// 1 byte length extension
+	MOVHU 1(R3), R7
+	ADD $minCopy2Offset, R7, R7
+	MOVBU 3(R3), R9
+	ADD $64, R9, R9
+	ADD $4, R3, R3
+	B decode_fast_copy_exec
+
+	// ----------------------------------------
+	// TAG 3: Fused Copy2/3
+	// ----------------------------------------
+decode_fast_copy3:
+	// Load 4 bytes for full analysis
+	MOVWU (R3), R16
+
+	// Check if Copy3 (bit 2 set) or Fused Copy2 (bit 2 clear)
+	TBZ $2, R16, decode_fast_copy2_fused
+
+	// ---- Copy3 with optional fused literals ----
+	// Extract literal count (bits 3-4)
+	LSR $3, R16, R17
+	AND $0x03, R17, R14         // litLen
+
+	// Extract length field (bits 5-10)
+	LSR $5, R16, R9
+	AND $0x3F, R9, R9
+
+	// Extract offset (bits 11-31, 21 bits) + 65536
+	LSR $11, R16, R7
+	ADD $minCopy3Offset, R7, R7
+
+	ADD $4, R3, R3              // consume base 4 bytes
+
+	// Check for extended length
+	CMP $61, R9
+	BGE decode_fast_copy3_extended
+
+	// Short length: 4 + field
+	ADD $4, R9, R9
+	B decode_fast_copy3_lits
+
+decode_fast_copy3_extended:
+	BEQ decode_fast_copy3_ext1
+	CMP $62, R9
+	BEQ decode_fast_copy3_ext2
+
+	// 3 byte length extension
+	MOVWU -1(R3), R9            // reload from offset-1 to get bytes
+	LSR $8, R9, R9
+	ADD $64, R9, R9
+	ADD $3, R3, R3
+	B decode_fast_copy3_lits
+
+decode_fast_copy3_ext2:
+	MOVHU (R3), R9
+	ADD $64, R9, R9
+	ADD $2, R3, R3
+	B decode_fast_copy3_lits
+
+decode_fast_copy3_ext1:
+	MOVBU (R3), R9
+	ADD $64, R9, R9
+	ADD $1, R3, R3
+
+decode_fast_copy3_lits:
+	// Handle fused literals (0-3 bytes)
+	CBZ R14, decode_fast_copy_exec
+
+	// Copy fused literals (1-3 bytes)
+	// Bounds check
+	ADD R1, R14, R16
+	CMP R5, R16
+	BHI corrupt
+
+	// Load and store up to 4 bytes (we have margin)
+	MOVWU (R3), R16
+	MOVW R16, (R1)
+	ADD R14, R3, R3
+	ADD R14, R1, R1
+	ADD R14, R8, R8
+	B decode_fast_copy_exec
+
+decode_fast_copy2_fused:
+	// Fused Copy2: [len:3 | litLen:2 | 0 | tag:2] [offset:16] [lits:1-4]
+	// litLen is 0-3, but actual count is litLen+1 (1-4)
+	// length is 4-11
+
+	// Extract literal count (bits 3-4) + 1
+	LSR $3, R16, R14
+	AND $0x03, R14, R14
+	ADD $1, R14, R14            // litLen = field + 1
+
+	// Extract length (bits 5-7) + 4
+	LSR $5, R16, R9
+	AND $0x07, R9, R9
+	ADD $4, R9, R9
+
+	// Extract offset (bits 8-23) + 64
+	LSR $8, R16, R7
+	AND $0xFFFF, R7, R7
+	ADD $minCopy2Offset, R7, R7
+
+	ADD $3, R3, R3              // consume 3 bytes (tag + offset)
+
+	// Copy fused literals (1-4 bytes)
+	// Bounds check
+	ADD R1, R14, R16
+	CMP R5, R16
+	BHI corrupt
+
+	// Load and store 4 bytes (we have margin for over-read)
+	MOVWU (R3), R16
+	MOVW R16, (R1)
+	ADD R14, R3, R3
+	ADD R14, R1, R1
+	ADD R14, R8, R8
+	// Fall through to copy exec
+
+	// ----------------------------------------
+	// Execute copy operation
+	// R7 = offset, R9 = length
+	// ----------------------------------------
+decode_fast_copy_exec:
+	// Bounds check: offset <= dstPos
+	// CMP R8, R7 computes R7 - R8; BHI branches when R7 > R8 (offset > dstPos)
+	CMP R8, R7
+	BHI corrupt
+
+	// Bounds check: dst + length <= dstEnd
+	ADD R1, R9, R16
+	CMP R5, R16
+	BHI corrupt
+
+	// Calculate source pointer
+	SUB R7, R1, R14             // copySrc = dst - offset
+
+	// Check for overlap: if offset < length, we have overlap
+	// CMP R9, R7 computes R7 - R9; BLO branches when R7 < R9 (offset < length)
+	CMP R9, R7
+	BLO decode_fast_copy_overlap
+
+	// No overlap - can use fast copy
+	CMP $16, R9
+	BLE decode_fast_copy_16
+	CMP $32, R9
+	BLE decode_fast_copy_32
+	CMP $64, R9
+	BLE decode_fast_copy_64
+	B decode_fast_copy_long_nool
+
+decode_fast_copy_16:
+	VLD1 (R14), [V0.B16]
+	VST1 [V0.B16], (R1)
+	ADD R9, R1, R1
+	ADD R9, R8, R8
+	B decode_fast_loop
+
+decode_fast_copy_32:
+	VLD1 (R14), [V0.B16, V1.B16]
+	VST1 [V0.B16, V1.B16], (R1)
+	ADD R9, R1, R1
+	ADD R9, R8, R8
+	B decode_fast_loop
+
+decode_fast_copy_64:
+	// For 33-64 bytes, use overlapping reads/writes (like decode_fast_lit_copy_64)
+	// This avoids over-writing past dst+length
+	VLD1 (R14), [V0.B16, V1.B16]      // load 32 bytes at copySrc
+	SUB $32, R9, R16                   // R16 = length - 32
+	ADD R16, R14, R17                  // R17 = copySrc + length - 32
+	VLD1 (R17), [V2.B16, V3.B16]      // load 32 bytes at copySrc + length - 32
+	VST1 [V0.B16, V1.B16], (R1)       // store 32 bytes at dst
+	ADD R16, R1, R17                   // R17 = dst + length - 32
+	VST1 [V2.B16, V3.B16], (R17)      // store 32 bytes at dst + length - 32
+	ADD R9, R1, R1
+	ADD R9, R8, R8
+	B decode_fast_loop
+
+decode_fast_copy_long_nool:
+	// Long copy without overlap - loop
+	MOVD R9, R15
+
+decode_fast_copy_long_loop:
+	VLD1.P 32(R14), [V0.B16, V1.B16]
+	VST1.P [V0.B16, V1.B16], 32(R1)
+	SUB $32, R15, R15
+	CMP $32, R15
+	BGE decode_fast_copy_long_loop
+
+	// Handle remaining 0-31 bytes
+	CBZ R15, decode_fast_copy_long_done
+
+	// Copy remainder byte-by-byte for correctness
+decode_fast_copy_long_remainder:
+	MOVBU (R14), R16
+	MOVB R16, (R1)
+	ADD $1, R14, R14
+	ADD $1, R1, R1
+	SUB $1, R15, R15
+	CBNZ R15, decode_fast_copy_long_remainder
+
+decode_fast_copy_long_done:
+	ADD R9, R8, R8
+	B decode_fast_loop
+
+decode_fast_copy_overlap:
+	// Overlapping copy - need byte-by-byte or special handling
+	// Check for special small offsets
+
+	CMP $1, R7
+	BEQ decode_fast_copy_overlap_1
+	CMP $2, R7
+	BEQ decode_fast_copy_overlap_2
+	CMP $3, R7
+	BEQ decode_fast_copy_overlap_3
+	B decode_fast_copy_overlap_4plus
+
+decode_fast_copy_overlap_1:
+	// Offset 1: RLE - repeat single byte
+	MOVBU (R14), R16
+	ADD R9, R8, R8
+
+	// For RLE, use simple byte loop - it's not that slow for small lengths
+	// and avoids complex SIMD setup
+decode_overlap1_byte_loop:
+	MOVB R16, (R1)
+	ADD $1, R1, R1
+	SUB $1, R9, R9
+	CBNZ R9, decode_overlap1_byte_loop
+	B decode_fast_loop
+
+decode_fast_copy_overlap_2:
+	// Offset 2: repeat 2-byte pattern
+	MOVHU (R14), R16
+	ADD R9, R8, R8
+
+decode_overlap2_word_loop:
+	// Write 2 bytes at a time as long as we have >= 2 bytes left
+	CMP $2, R9
+	BLT decode_overlap2_finish
+	MOVH R16, (R1)
+	ADD $2, R1, R1
+	SUB $2, R9, R9
+	B decode_overlap2_word_loop
+
+decode_overlap2_finish:
+	// Handle remaining 0 or 1 byte at the END
+	CBZ R9, decode_fast_loop
+	MOVB R16, (R1)
+	ADD $1, R1, R1
+	B decode_fast_loop
+
+decode_fast_copy_overlap_3:
+	// Offset 3: repeat 3-byte pattern
+	MOVWU (R14), R16            // load 4 bytes
+	AND $0xFFFFFF, R16, R16     // keep only 3 bytes
+
+	ADD R9, R8, R8
+
+decode_overlap3_loop:
+	// Check if we have at least 3 bytes left
+	CMP $3, R9
+	BLT decode_overlap3_finish
+	// Write 3 bytes (actually writes 4, but we only advance by 3)
+	MOVW R16, (R1)
+	ADD $3, R1, R1
+	SUB $3, R9, R9
+	B decode_overlap3_loop
+
+decode_overlap3_finish:
+	// Handle remaining 1-2 bytes
+	CBZ R9, decode_fast_loop
+	CMP $1, R9
+	BEQ decode_overlap3_1byte
+	// 2 bytes
+	MOVH R16, (R1)
+	ADD $2, R1, R1
+	B decode_fast_loop
+
+decode_overlap3_1byte:
+	MOVB R16, (R1)
+	ADD $1, R1, R1
+	B decode_fast_loop
+
+decode_fast_copy_overlap_4plus:
+	// Offset 4+: general overlapping copy
+	ADD R9, R8, R8
+
+decode_overlap4_loop:
+	CMP $4, R9
+	BLT decode_overlap4_remainder
+	MOVWU (R14), R16
+	MOVW R16, (R1)
+	ADD $4, R14, R14
+	ADD $4, R1, R1
+	SUB $4, R9, R9
+	B decode_overlap4_loop
+
+decode_overlap4_remainder:
+	CBZ R9, decode_fast_loop
+
+decode_overlap4_byte_loop:
+	MOVBU (R14), R16
+	MOVB R16, (R1)
+	ADD $1, R14, R14
+	ADD $1, R1, R1
+	SUB $1, R9, R9
+	CBNZ R9, decode_overlap4_byte_loop
+	B decode_fast_loop
+
+	// ============================================
+	// SLOW LOOP - no margins, full bounds checking
+	// ============================================
+decode_remain_entry:
+	// Entry point after fast loop
+
+decode_remain_loop:
+	// Check if we're done
+	// Note: Go ARM64 CMP Rm, Rn computes Rn - Rm
+	CMP R6, R3
+	BHS decode_end              // if src >= srcEnd (R3 - R6 >= 0), done
+
+	// Read tag byte
+	MOVBU (R3), R10
+	MOVD R10, R11
+	LSR $2, R11, R11            // value = tag >> 2
+
+	// Check tag type
+	AND $0x03, R10, R16
+	CBNZ R16, decode_remain_copy
+
+	// ----------------------------------------
+	// TAG 0: Literals or Repeat (slow path)
+	// ----------------------------------------
+	LSR $1, R11, R9             // length field
+	CMP $29, R9
+	BLT decode_remain_lit_short
+	BEQ decode_remain_lit_1byte
+	CMP $30, R9
+	BEQ decode_remain_lit_2byte
+	B decode_remain_lit_3byte
+
+decode_remain_lit_short:
+	ADD $1, R9, R9
+	ADD $1, R3, R3
+	B decode_remain_lit_check
+
+decode_remain_lit_1byte:
+	ADD $2, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVBU -1(R3), R9
+	ADD $30, R9, R9
+	B decode_remain_lit_check
+
+decode_remain_lit_2byte:
+	ADD $3, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVHU -2(R3), R9
+	ADD $30, R9, R9
+	B decode_remain_lit_check
+
+decode_remain_lit_3byte:
+	ADD $4, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVWU -4(R3), R9
+	LSR $8, R9, R9
+	ADD $30, R9, R9
+
+decode_remain_lit_check:
+	// Check repeat bit
+	TBZ $0, R11, decode_remain_lit_copy
+
+	// REPEAT - use stored offset
+	B decode_remain_copy_exec
+
+decode_remain_lit_copy:
+	// Bounds check
+	ADD R1, R9, R16
+	CMP R5, R16
+	BHI corrupt
+	ADD R3, R9, R16
+	CMP R6, R16
+	BHI corrupt
+
+	// Copy literals (slow path, byte by byte for simplicity)
+decode_remain_lit_loop:
+	CBZ R9, decode_remain_loop
+	MOVBU (R3), R16
+	MOVB R16, (R1)
+	ADD $1, R3, R3
+	ADD $1, R1, R1
+	ADD $1, R8, R8
+	SUB $1, R9, R9
+	B decode_remain_lit_loop
+
+	// ----------------------------------------
+	// Copy operations (slow path)
+	// ----------------------------------------
+decode_remain_copy:
+	CMP $2, R16
+	BLT decode_remain_copy1
+	BEQ decode_remain_copy2
+	B decode_remain_copy3
+
+decode_remain_copy1:
+	ADD $2, R3, R3
+	CMP R6, R3
+	BHI corrupt
+
+	MOVHU -2(R3), R16
+	AND $0x0F, R11, R9
+	LSR $6, R16, R7
+	ADD $1, R7, R7
+
+	CMP $15, R9
+	BNE decode_remain_copy1_short
+
+	ADD $1, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVBU -1(R3), R9
+	ADD $18, R9, R9
+	B decode_remain_copy_exec
+
+decode_remain_copy1_short:
+	ADD $4, R9, R9
+	B decode_remain_copy_exec
+
+decode_remain_copy2:
+	ADD $3, R3, R3
+	CMP R6, R3
+	BHI corrupt
+
+	MOVBU -3(R3), R9
+	LSR $2, R9, R9
+	MOVHU -2(R3), R7
+
+	CMP $61, R9
+	BGE decode_remain_copy2_ext
+
+	ADD $4, R9, R9
+	ADD $minCopy2Offset, R7, R7
+	B decode_remain_copy_exec
+
+decode_remain_copy2_ext:
+	// Dispatch based on R9 (from CMP $61, R9 above)
+	// Note: Don't modify flags before these branches!
+	BEQ decode_remain_copy2_ext1    // R9 == 61
+	CMP $62, R9
+	BEQ decode_remain_copy2_ext2    // R9 == 62
+	// Fall through for R9 == 63
+
+	// 3 byte extension
+	ADD $minCopy2Offset, R7, R7     // Add offset base
+	ADD $3, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVWU -4(R3), R9
+	LSR $8, R9, R9
+	ADD $64, R9, R9
+	B decode_remain_copy_exec
+
+decode_remain_copy2_ext2:
+	ADD $minCopy2Offset, R7, R7     // Add offset base
+	ADD $2, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVHU -2(R3), R9
+	ADD $64, R9, R9
+	B decode_remain_copy_exec
+
+decode_remain_copy2_ext1:
+	ADD $minCopy2Offset, R7, R7     // Add offset base
+	ADD $1, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVBU -1(R3), R9
+	ADD $64, R9, R9
+	B decode_remain_copy_exec
+
+decode_remain_copy3:
+	ADD $4, R3, R3
+	CMP R6, R3
+	BHI corrupt
+
+	MOVWU -4(R3), R16
+	TBZ $2, R16, decode_remain_copy2_fused
+
+	// Copy3
+	LSR $3, R16, R17
+	AND $0x03, R17, R14         // litLen
+
+	LSR $5, R16, R9
+	AND $0x3F, R9, R9
+
+	LSR $11, R16, R7
+	ADD $minCopy3Offset, R7, R7
+
+	CMP $61, R9
+	BGE decode_remain_copy3_ext
+
+	ADD $4, R9, R9
+	B decode_remain_copy3_lits
+
+decode_remain_copy3_ext:
+	BEQ decode_remain_copy3_ext1
+	CMP $62, R9
+	BEQ decode_remain_copy3_ext2
+
+	// 3 byte ext
+	ADD $3, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVWU -4(R3), R9
+	LSR $8, R9, R9
+	ADD $64, R9, R9
+	B decode_remain_copy3_lits
+
+decode_remain_copy3_ext2:
+	ADD $2, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVHU -2(R3), R9
+	ADD $64, R9, R9
+	B decode_remain_copy3_lits
+
+decode_remain_copy3_ext1:
+	ADD $1, R3, R3
+	CMP R6, R3
+	BHI corrupt
+	MOVBU -1(R3), R9
+	ADD $64, R9, R9
+
+decode_remain_copy3_lits:
+	CBZ R14, decode_remain_copy_exec
+
+	// Bounds check and copy fused literals
+	ADD R1, R14, R16
+	CMP R5, R16
+	BHI corrupt
+	ADD R3, R14, R16
+	CMP R6, R16
+	BHI corrupt
+
+decode_remain_copy3_lit_loop:
+	CBZ R14, decode_remain_copy_exec
+	MOVBU (R3), R16
+	MOVB R16, (R1)
+	ADD $1, R3, R3
+	ADD $1, R1, R1
+	ADD $1, R8, R8
+	SUB $1, R14, R14
+	B decode_remain_copy3_lit_loop
+
+decode_remain_copy2_fused:
+	// Fused Copy2
+	LSR $3, R16, R14
+	AND $0x03, R14, R14
+	ADD $1, R14, R14
+
+	LSR $5, R16, R9
+	AND $0x07, R9, R9
+	ADD $4, R9, R9
+
+	LSR $8, R16, R7
+	AND $0xFFFF, R7, R7
+	ADD $minCopy2Offset, R7, R7
+
+	SUB $1, R3, R3              // back up 1 byte (we consumed 4, need 3)
+
+	// Bounds check and copy fused literals
+	ADD R1, R14, R16
+	CMP R5, R16
+	BHI corrupt
+	ADD R3, R14, R16
+	CMP R6, R16
+	BHI corrupt
+
+decode_remain_copy2_fused_lit_loop:
+	CBZ R14, decode_remain_copy_exec
+	MOVBU (R3), R16
+	MOVB R16, (R1)
+	ADD $1, R3, R3
+	ADD $1, R1, R1
+	ADD $1, R8, R8
+	SUB $1, R14, R14
+	B decode_remain_copy2_fused_lit_loop
+
+decode_remain_copy_exec:
+	// Bounds check: offset <= dstPos
+	// CMP R8, R7 computes R7 - R8; BHI branches when R7 > R8 (offset > dstPos)
+	CMP R8, R7
+	BHI corrupt
+
+	ADD R1, R9, R16
+	CMP R5, R16
+	BHI corrupt
+
+	// Calculate source
+	SUB R7, R1, R14
+
+	// Simple byte-by-byte copy for slow path
+	ADD R9, R8, R8
+
+decode_remain_copy_loop:
+	CBZ R9, decode_remain_loop
+	MOVBU (R14), R16
+	MOVB R16, (R1)
+	ADD $1, R14, R14
+	ADD $1, R1, R1
+	SUB $1, R9, R9
+	B decode_remain_copy_loop
+
+	// ============================================
+	// END - Validation
+	// ============================================
+decode_end:
+	// Validate we consumed all input and filled all output
+	// dst should equal dstEnd (original dst base + dst len)
+	// src should equal srcEnd
+
+	CMP R5, R1
+	BNE corrupt                 // dst != dstEnd
+	CMP R6, R3
+	BNE corrupt                 // src != srcEnd
+
+	// Success
+	MOVD ZR, ret+48(FP)
+	RET
+
+corrupt:
+	MOVD $1, R0
+	MOVD R0, ret+48(FP)
+	RET
diff --git a/decode_arm64.go b/decode_arm64.go
new file mode 100644
index 0000000..4cfe413
--- /dev/null
+++ b/decode_arm64.go
@@ -0,0 +1,36 @@
+// Copyright 2026 MinIO Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build arm64 && !appengine && !noasm && gc && !purego
+
+package minlz
+
+import (
+	"github.com/minio/minlz/internal/race"
+)
+
+// minLZDecode writes the decoding of src to dst. It assumes that the varint-encoded
+// length of the decompressed bytes has already been read, and that len(dst)
+// equals that length.
+//
+// It returns 0 on success or a decodeErrCodeXxx error code on failure.
+func minLZDecode(dst, src []byte) int {
+	if dst == nil {
+		panic("nil dst")
+	}
+
+	race.ReadSlice(src)
+	race.WriteSlice(dst)
+	return decodeBlockAsm(dst, src)
+}
diff --git a/decode_asm_test.go b/decode_asm_test.go
new file mode 100644
index 0000000..ca97d56
--- /dev/null
+++ b/decode_asm_test.go
@@ -0,0 +1,416 @@
+// Copyright 2026 MinIO Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build (amd64 || arm64) && !appengine && !noasm && gc && !purego
+
+package minlz
+
+import (
+	"archive/zip"
+	"bytes"
+	"io"
+	"os"
+	"testing"
+)
+
+// TestCompareDecoders compares the Go and ASM decoders
+func TestCompareDecoders(t *testing.T) {
+	// Test data that triggers specific patterns
+	testCases := []struct {
+		name string
+		data []byte
+	}{
+		{"small", []byte("hello world")},
+		{"repeated_small", bytes.Repeat([]byte("abcd"), 100)},
+		{"repeated_large", bytes.Repeat([]byte("abcdefghijklmnop"), 10000)},
+		{"random_like", generateTestData(100000)},
+		{"very_random", generateTestData(500000)},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			testDecoderComparison(t, tc.data)
+		})
+	}
+}
+
+// TestCompareDecodersWithRegressionData uses actual regression test data
+func TestCompareDecodersWithRegressionData(t *testing.T) {
+	data, err := os.ReadFile("testdata/enc_regressions.zip")
+	if err != nil {
+		t.Skip("enc_regressions.zip not found")
+		return
+	}
+	zr, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, f := range zr.File {
+		t.Run(f.Name, func(t *testing.T) {
+			r, err := f.Open()
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer r.Close()
+			fileData, err := io.ReadAll(r)
+			if err != nil {
+				t.Fatal(err)
+			}
+			testDecoderComparison(t, fileData)
+		})
+	}
+}
+
+// testDecoderComparison compares ASM and Go decoder outputs.
+func testDecoderComparison(t *testing.T, data []byte) {
+	for level := LevelFastest; level <= LevelSmallest; level++ {
+		levelName := []string{"fastest", "balanced", "smallest"}[level-1]
+		t.Run(levelName, func(t *testing.T) {
+			encoded, err := Encode(nil, data, level)
+			if err != nil {
+				t.Fatalf("Encode failed: %v", err)
+			}
+
+			// Get decoded length
+			_, _, block, dLen, err := isMinLZ(encoded)
+			if err != nil {
+				t.Fatalf("isMinLZ failed: %v", err)
+			}
+
+			// Decode with ASM
+			dstAsm := make([]byte, dLen)
+			errAsm := decodeBlockAsm(dstAsm, block)
+
+			// Decode with Go
+			dstGo := make([]byte, dLen)
+			errGo := minLZDecodeGo(dstGo, block)
+
+			if errAsm != errGo {
+				t.Errorf("error mismatch: ASM=%d, Go=%d", errAsm, errGo)
+				if errAsm != 0 && errGo == 0 {
+					t.Logf("ASM decoder returned error, Go decoder succeeded")
+					t.Logf("Block size: %d, decoded size: %d", len(block), dLen)
+					binarySearchFailure(t, block, dLen)
+				} else if errAsm == 0 && errGo == 1 {
+					t.Logf("Go decoder returned error, ASM decoder succeeded - unexpected")
+				}
+				return
+			}
+
+			if errAsm == 0 && !bytes.Equal(dstAsm, dstGo) {
+				for i := range dstAsm {
+					if i >= len(dstGo) || dstAsm[i] != dstGo[i] {
+						t.Errorf("content mismatch at byte %d: ASM=%02x, Go=%02x",
+							i, dstAsm[i], dstGo[i])
+
+						start := i - 20
+						if start < 0 {
+							start = 0
+						}
+						end := i + 20
+						if end > len(dstGo) {
+							end = len(dstGo)
+						}
+						t.Logf("Go output around mismatch [%d:%d]: %x", start, end, dstGo[start:end])
+						t.Logf("ASM output around mismatch [%d:%d]: %x", start, end, dstAsm[start:end])
+
+						mismatchCount := 0
+						firstMismatch := i
+						lastMismatch := i
+						for j := i; j < len(dstAsm) && j < len(dstGo); j++ {
+							if dstAsm[j] != dstGo[j] {
+								mismatchCount++
+								lastMismatch = j
+							}
+						}
+						t.Logf("Total mismatches from byte %d onwards: %d (last at %d)", i, mismatchCount, lastMismatch)
+
+						if mismatchCount == 4 && lastMismatch-firstMismatch == 3 {
+							t.Logf("This looks like a 2-byte pattern swap!")
+						}
+
+						t.Logf("Encoded block size: %d bytes", len(block))
+						break
+					}
+				}
+			}
+		})
+	}
+}
+
+// binarySearchFailure finds the minimum prefix of the block that causes ASM to fail
+func binarySearchFailure(t *testing.T, block []byte, fullDLen int) {
+	t.Helper()
+
+	dstAsm := make([]byte, fullDLen)
+	if decodeBlockAsm(dstAsm, block) == 0 {
+		t.Log("Full block actually passes ASM decoder")
+		return
+	}
+
+	t.Logf("Searching for failure point in %d byte block...", len(block))
+
+	if len(block) > 40 {
+		margin := len(block) - 20
+		t.Logf("Bytes near fast/slow boundary (pos %d): ...%x...", margin-10, block[margin-10:margin+10])
+	}
+
+	if len(block) > 50 {
+		t.Logf("Last 50 bytes: %x", block[len(block)-50:])
+	}
+}
+
+// TestSpecificTagSequences tests specific tag patterns that might trigger bugs
+func TestSpecificTagSequences(t *testing.T) {
+	testCases := []struct {
+		name string
+		data []byte
+	}{
+		{"large_offset_copy2", makeLargeOffsetData(100000, 65000)},
+		{"large_offset_copy3", makeLargeOffsetData(200000, 100000)},
+		{"fused_lits_small", makeFusedLitData(10000)},
+		{"fused_lits_large", makeFusedLitData(100000)},
+		{"long_literals", makeLongLiteralData(100000)},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			testDecoderComparison(t, tc.data)
+		})
+	}
+}
+
+func makeLargeOffsetData(size, minOffset int) []byte {
+	data := make([]byte, size)
+	for i := range data {
+		data[i] = byte(i % 251)
+	}
+	pattern := []byte("LARGEPAT")
+	if minOffset < size-len(pattern)*2 {
+		copy(data[0:], pattern)
+		copy(data[minOffset:], pattern)
+	}
+	return data
+}
+
+func makeFusedLitData(size int) []byte {
+	data := make([]byte, size)
+	for i := range data {
+		data[i] = byte((i * 3) % 256)
+	}
+	pattern := []byte("ABCD")
+	for i := 100; i < size-10; i += 500 {
+		copy(data[i:], pattern)
+		data[i+len(pattern)] = byte(i % 256)
+		data[i+len(pattern)+1] = byte((i + 1) % 256)
+	}
+	return data
+}
+
+func makeLongLiteralData(size int) []byte {
+	data := make([]byte, size)
+	for i := range data {
+		data[i] = byte((i*17 + i*i) % 256)
+	}
+	return data
+}
+
+// TestOffset2Pattern tests the specific pattern swap issue
+func TestOffset2Pattern(t *testing.T) {
+	sizes := []int{65549, 70000, 60000, 50000}
+
+	for _, size := range sizes {
+		t.Run("size_"+string(rune('0'+size/10000)), func(t *testing.T) {
+			data := make([]byte, size)
+			for i := range data {
+				if i%2 == 0 {
+					data[i] = 0x35
+				} else {
+					data[i] = 0x7a
+				}
+			}
+			for i := 1000; i < size-100; i += 3000 {
+				copy(data[i:], []byte("UNIQUE_MARKER"))
+			}
+
+			testDecoderComparison(t, data)
+		})
+	}
+}
+
+// TestShortRepeats tests short repeat patterns (offset 1-4)
+func TestShortRepeats(t *testing.T) {
+	testCases := []struct {
+		name   string
+		offset int
+		length int
+	}{
+		{"offset1_len4", 1, 4},
+		{"offset2_len4", 2, 4},
+		{"offset2_len10", 2, 10},
+		{"offset3_len9", 3, 9},
+		{"offset4_len16", 4, 16},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			data := make([]byte, 10000)
+			for i := range data {
+				data[i] = byte((i * 7) % 256)
+			}
+			pattern := make([]byte, tc.offset)
+			for i := range pattern {
+				pattern[i] = byte('A' + i)
+			}
+			copy(data[1000:], pattern)
+			for i := 0; i < tc.length; i++ {
+				data[1000+tc.offset+i] = pattern[i%tc.offset]
+			}
+
+			testDecoderComparison(t, data)
+		})
+	}
+}
+
+// TestLargeDataWith35_7aPattern tests the specific pattern seen in s2fail.txt
+func TestLargeDataWith35_7aPattern(t *testing.T) {
+	sizes := []int{65549, 66000, 60000}
+	for _, size := range sizes {
+		t.Run("size_"+string(rune('0'+size/10000)), func(t *testing.T) {
+			data := make([]byte, size)
+
+			for i := range data {
+				switch i % 10 {
+				case 0:
+					data[i] = '3'
+				case 1:
+					data[i] = '5'
+				case 2, 3:
+					data[i] = 'z'
+				case 4:
+					data[i] = '1'
+				default:
+					data[i] = byte('0' + (i % 10))
+				}
+			}
+
+			for i := 0; i < size-20; i += 3000 {
+				copy(data[i:], []byte("UNIQUE_MARKER_"))
+			}
+
+			testDecoderComparison(t, data)
+		})
+	}
+}
+
+// TestVeryLargeOffset2 tests offset-2 copies at various large positions
+func TestVeryLargeOffset2(t *testing.T) {
+	positions := []int{50000, 55000, 56000, 56500, 56600, 56620, 57000}
+
+	for _, pos := range positions {
+		t.Run("pos_"+string(rune('0'+pos/10000)), func(t *testing.T) {
+			size := pos + 1000
+			data := make([]byte, size)
+
+			for i := range data {
+				data[i] = byte((i*13 + i/7) % 256)
+			}
+
+			data[pos-100] = 0x35
+			data[pos-99] = 0x7a
+			data[pos-98] = 0x35
+			data[pos-97] = 0x7a
+			data[pos-96] = 0x35
+			data[pos-95] = 0x7a
+
+			testDecoderComparison(t, data)
+		})
+	}
+}
+
+func generateTestData(size int) []byte {
+	data := make([]byte, size)
+	for i := range data {
+		data[i] = byte((i*7 + i/13) % 256)
+	}
+	pattern := []byte("PATTERN_DATA_HERE")
+	for i := 1000; i < size-len(pattern); i += 5000 {
+		copy(data[i:], pattern)
+	}
+	return data
+}
+
+// TestSrcMarginBoundary tests the source buffer margin boundary.
+// This is a regression test for a security issue where the fast loop
+// could over-read the source buffer when a 4-byte tag header was consumed
+// just before a 32-byte NEON load, requiring 36-byte margin instead of 32.
+func TestSrcMarginBoundary(t *testing.T) {
+	// Test various small sizes that stress the margin boundary
+	// The fast loop requires srcLen >= 36 bytes (32 for NEON + 4 for tag header)
+	sizes := []int{30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 50, 60, 70, 80}
+
+	for _, size := range sizes {
+		t.Run("size_"+string(rune('0'+size/10))+string(rune('0'+size%10)), func(t *testing.T) {
+			// Test with various data patterns
+			patterns := [][]byte{
+				bytes.Repeat([]byte{'a'}, size),                           // compressible
+				bytes.Repeat([]byte{'a', 'b'}, size/2+1)[:size],           // 2-byte pattern
+				bytes.Repeat([]byte{'a', 'b', 'c', 'd'}, size/4+1)[:size], // 4-byte pattern
+			}
+
+			// Add random-like pattern
+			random := make([]byte, size)
+			for i := range random {
+				random[i] = byte((i*17 + i*i) % 256)
+			}
+			patterns = append(patterns, random)
+
+			for i, data := range patterns {
+				for level := LevelFastest; level <= LevelSmallest; level++ {
+					encoded, err := Encode(nil, data, level)
+					if err != nil {
+						t.Fatalf("pattern %d, level %d: Encode failed: %v", i, level, err)
+					}
+
+					// Get block info
+					_, _, block, dLen, err := isMinLZ(encoded)
+					if err != nil {
+						t.Fatalf("pattern %d, level %d: isMinLZ failed: %v", i, level, err)
+					}
+
+					// Decode with ASM
+					dstAsm := make([]byte, dLen)
+					errAsm := decodeBlockAsm(dstAsm, block)
+
+					// Decode with Go
+					dstGo := make([]byte, dLen)
+					errGo := minLZDecodeGo(dstGo, block)
+
+					if errAsm != errGo {
+						t.Errorf("pattern %d, level %d: error mismatch: ASM=%d, Go=%d", i, level, errAsm, errGo)
+						continue
+					}
+
+					if errAsm == 0 && !bytes.Equal(dstAsm, dstGo) {
+						t.Errorf("pattern %d, level %d: content mismatch", i, level)
+					}
+
+					if errAsm == 0 && !bytes.Equal(dstAsm, data) {
+						t.Errorf("pattern %d, level %d: decoded data doesn't match original", i, level)
+					}
+				}
+			}
+		})
+	}
+}
diff --git a/decode_other.go b/decode_other.go
index 4d4f556..a79734e 100644
--- a/decode_other.go
+++ b/decode_other.go
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//go:build !amd64 || appengine || !gc || noasm || purego
+//go:build (!amd64 && !arm64) || appengine || !gc || noasm || purego
 
 package minlz
 
diff --git a/decode_test.go b/decode_test.go
new file mode 100644
index 0000000..666dec6
--- /dev/null
+++ b/decode_test.go
@@ -0,0 +1,198 @@
+// Copyright 2026 MinIO Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package minlz
+
+import (
+	"bytes"
+	"math/rand"
+	"testing"
+)
+
+// TestDecodeBlock tests the decoder with various data patterns.
+func TestDecodeBlock(t *testing.T) {
+	testCases := []struct {
+		name string
+		data []byte
+	}{
+		{"empty", []byte{}},
+		{"small_literal", []byte("hello world")},
+		{"repeated", bytes.Repeat([]byte("abcd"), 1000)},
+		{"mixed", append(bytes.Repeat([]byte("x"), 100), bytes.Repeat([]byte("y"), 100)...)},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			for level := LevelFastest; level <= LevelSmallest; level++ {
+				encoded, err := Encode(nil, tc.data, level)
+				if err != nil {
+					t.Fatalf("Encode failed: %v", err)
+				}
+
+				decoded, err := Decode(nil, encoded)
+				if err != nil {
+					t.Fatalf("Decode failed: %v", err)
+				}
+
+				if !bytes.Equal(tc.data, decoded) {
+					t.Errorf("level %d: decode mismatch: got %d bytes, want %d bytes",
+						level, len(decoded), len(tc.data))
+				}
+			}
+		})
+	}
+}
+
+// TestDecodeBlockRandom tests with random data of various sizes.
+func TestDecodeBlockRandom(t *testing.T) {
+	rng := rand.New(rand.NewSource(42))
+
+	sizes := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000}
+	if testing.Short() {
+		sizes = sizes[:5]
+	}
+
+	for _, size := range sizes {
+		t.Run("size_"+string(rune('0'+size%10)), func(t *testing.T) {
+			data := make([]byte, size)
+			rng.Read(data)
+
+			for level := LevelFastest; level <= LevelSmallest; level++ {
+				encoded, err := Encode(nil, data, level)
+				if err != nil {
+					t.Fatalf("Encode failed: %v", err)
+				}
+
+				decoded, err := Decode(nil, encoded)
+				if err != nil {
+					t.Fatalf("Decode failed for size %d, level %d: %v", size, level, err)
+				}
+
+				if !bytes.Equal(data, decoded) {
+					t.Errorf("size %d, level %d: decode mismatch", size, level)
+				}
+			}
+		})
+	}
+}
+
+// TestDecodeBlockOverlapping tests overlapping copy scenarios.
+func TestDecodeBlockOverlapping(t *testing.T) {
+	// Test various RLE-like patterns that exercise overlapping copies
+	patterns := [][]byte{
+		bytes.Repeat([]byte{'a'}, 1000),                              // offset 1 RLE
+		bytes.Repeat([]byte{'a', 'b'}, 500),                          // offset 2 pattern
+		bytes.Repeat([]byte{'a', 'b', 'c'}, 333),                     // offset 3 pattern
+		bytes.Repeat([]byte{'a', 'b', 'c', 'd'}, 250),                // offset 4 pattern
+		bytes.Repeat([]byte{'a', 'b', 'c', 'd', 'e', 'f', 'g'}, 143), // offset 7 pattern
+	}
+
+	for i, pattern := range patterns {
+		t.Run("pattern_"+string(rune('0'+i)), func(t *testing.T) {
+			for level := LevelFastest; level <= LevelSmallest; level++ {
+				encoded, err := Encode(nil, pattern, level)
+				if err != nil {
+					t.Fatalf("Encode failed: %v", err)
+				}
+
+				decoded, err := Decode(nil, encoded)
+				if err != nil {
+					t.Fatalf("Decode failed: %v", err)
+				}
+
+				if !bytes.Equal(pattern, decoded) {
+					t.Errorf("pattern %d, level %d: decode mismatch", i, level)
+				}
+			}
+		})
+	}
+}
+
+// TestDecodeBlockLongOffsets tests long offset copies (Copy2/Copy3).
+func TestDecodeBlockLongOffsets(t *testing.T) {
+	// Create data with patterns at various offsets
+	size := 200000
+	data := make([]byte, size)
+	rng := rand.New(rand.NewSource(123))
+
+	// Fill with semi-random data
+	for i := range data {
+		data[i] = byte(rng.Intn(256))
+	}
+
+	// Insert repeated patterns at various offsets to trigger Copy2/Copy3
+	pattern := []byte("REPEATED_PATTERN_DATA")
+	offsets := []int{100, 1000, 10000, 65000, 100000}
+	for _, off := range offsets {
+		if off+len(pattern) < size {
+			copy(data[off:], pattern)
+			// Copy the pattern later to trigger backreference
+			if off*2+len(pattern) < size {
+				copy(data[off*2:], pattern)
+			}
+		}
+	}
+
+	for level := LevelFastest; level <= LevelSmallest; level++ {
+		t.Run("level_"+string(rune('0'+level)), func(t *testing.T) {
+			encoded, err := Encode(nil, data, level)
+			if err != nil {
+				t.Fatalf("Encode failed: %v", err)
+			}
+
+			decoded, err := Decode(nil, encoded)
+			if err != nil {
+				t.Fatalf("Decode failed: %v", err)
+			}
+
+			if !bytes.Equal(data, decoded) {
+				t.Errorf("level %d: decode mismatch", level)
+			}
+		})
+	}
+}
+
+// BenchmarkDecodeBlock benchmarks the decoder.
+func BenchmarkDecodeBlock(b *testing.B) {
+	sizes := []int{1000, 10000, 100000, 1000000}
+
+	for _, size := range sizes {
+		data := make([]byte, size)
+		rand.Read(data)
+
+		encoded, _ := Encode(nil, data, LevelFastest)
+		dst := make([]byte, size)
+
+		b.Run("random_"+string(rune('0'+size%10)), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				Decode(dst, encoded)
+			}
+		})
+	}
+
+	// Benchmark highly compressible data
+	data := bytes.Repeat([]byte("abcdefghij"), 100000)
+	encoded, _ := Encode(nil, data, LevelFastest)
+	dst := make([]byte, len(data))
+
+	b.Run("compressible", func(b *testing.B) {
+		b.SetBytes(int64(len(data)))
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			Decode(dst, encoded)
+		}
+	})
+}
diff --git a/run-test-arm64.cmd b/run-test-arm64.cmd
new file mode 100644
index 0000000..fd7eed7
--- /dev/null
+++ b/run-test-arm64.cmd
@@ -0,0 +1,23 @@
+@echo off
+REM Build ARM64 test binary on host and run in Docker container
+REM Usage: run-test-arm64.cmd [test flags]
+REM Example: run-test-arm64.cmd -test.run=TestSrcMarginBoundary -test.v
+
+echo Building ARM64 test binary...
+set GOOS=linux
+set GOARCH=arm64
+go test -c -o minlz_arm64.test
+
+if %ERRORLEVEL% neq 0 (
+    echo Build failed!
+    SET GOOS=windows
+    SET GOARCH=amd64
+    exit /b %ERRORLEVEL%
+)
+
+SET GOOS=windows
+SET GOARCH=amd64
+
+echo Running tests in ARM64 container...
+docker run --rm --platform linux/arm64 -e GOGC=20 -e GOMEMLIMIT=2GiB -v "%cd%:/work" -w /work arm64v8/alpine ./minlz_arm64.test %*
+