diff --git a/evaluator/buffers.go b/evaluator/buffers.go
new file mode 100644
index 0000000..f72ac77
--- /dev/null
+++ b/evaluator/buffers.go
@@ -0,0 +1,240 @@
+package evaluator
+
+import (
+	"github.com/thedonutfactory/go-tfhe/params"
+	"github.com/thedonutfactory/go-tfhe/poly"
+	"github.com/thedonutfactory/go-tfhe/tlwe"
+	"github.com/thedonutfactory/go-tfhe/trlwe"
+)
+
+// BufferPool is a centralized buffer management system for zero-allocation TFHE operations.
+// All buffers are pre-allocated once during initialization and reused throughout computation.
+//
+// Memory Layout Overview:
+//   - Polynomial buffers: ~200 KB (FFT, decomposition, rotation)
+//   - Ciphertext buffers: ~50 KB (TRLWE, LWE intermediates)
+//   - Total: ~250 KB per evaluator instance
+//
+// Thread Safety:
+//   - Each evaluator has its own buffer pool
+//   - Use sync.Pool or create separate instances for concurrent operations
+type BufferPool struct {
+	// === Polynomial Operation Buffers ===
+	// Used for FFT, multiplication, and decomposition
+
+	// PolyBuffers manages polynomial and FFT operations
+	PolyBuffers *poly.BufferManager
+
+	// === Bootstrap Operation Buffers ===
+
+	// External Product buffers (TRGSW ⊗ TRLWE)
+	ExternalProduct struct {
+		// Fourier domain accumulators
+		FourierA poly.FourierPoly // ~8 KB
+		FourierB poly.FourierPoly // ~8 KB
+		// Time domain result
+		Result *trlwe.TRLWELv1 // ~8 KB
+	}
+
+	// CMUX buffers (conditional multiplexer)
+	CMUX struct {
+		Temp *trlwe.TRLWELv1 // Difference buffer (ct1 - ct0)
+	}
+
+	// Blind Rotation buffers
+	BlindRotation struct {
+		Accumulator1 *trlwe.TRLWELv1 // Primary accumulator
+		Accumulator2 *trlwe.TRLWELv1 // Secondary accumulator
+		Rotated      *trlwe.TRLWELv1 // Rotation result
+	}
+
+	// Bootstrap buffers (full bootstrap = blind rotate + key switch)
+	Bootstrap struct {
+		ExtractedLWE *tlwe.TLWELv1 // After sample extraction
+		KeySwitched  *tlwe.TLWELv0 // After key switching
+	}
+
+	// === Gate Operation Buffers ===
+
+	// Gate preparation buffer (for AND, OR, XOR, etc.)
+	GatePrep *tlwe.TLWELv0
+
+	// Result pool for returning values without allocation
+	// Round-robin buffer to handle compound operations (e.g., MUX)
+	ResultPool struct {
+		Buffers [4]*tlwe.TLWELv0 // 4 slots for compound operations
+		Index   int              // Current index (0-3)
+	}
+
+	// === Block Blind Rotation Buffers (for 3-4x speedup) ===
+	// Only allocated if params.UseBlockBlindRotation() == true
+
+	BlockRotation *BlockRotationBuffers
+}
+
+// BlockRotationBuffers contains buffers for block-based blind rotation algorithm
+// This provides 3-4x speedup by processing multiple LWE coefficients together
+type BlockRotationBuffers struct {
+	// Decomposed accumulator in Fourier domain
+	// [blockSize][glweRank+1][level]
+	AccFourierDecomposed [][][]poly.FourierPoly
+
+	// Block accumulator in Fourier domain [blockSize]
+	BlockFourierAcc []struct {
+		A poly.FourierPoly
+		B poly.FourierPoly
+	}
+
+	// Intermediate Fourier accumulator [blockSize]
+	FourierAcc []struct {
+		A poly.FourierPoly
+		B poly.FourierPoly
+	}
+
+	// Fourier monomial for multiplication
+	FourierMono poly.FourierPoly
+}
+
+// NewBufferPool creates a new centralized buffer pool for the given polynomial size.
+// This allocates all buffers once during initialization (~250 KB total).
+//
+// Parameters:
+//
+//	n: Polynomial degree (typically 1024 for standard TFHE parameters)
+//
+// Memory allocation:
+//   - Polynomial buffers: ~200 KB (managed by poly.BufferManager)
+//   - Ciphertext buffers: ~50 KB (TRLWE, LWE structures)
+//   - Block rotation: ~30 KB (if enabled)
+func NewBufferPool(n int) *BufferPool {
+	bp := &BufferPool{
+		PolyBuffers: poly.NewBufferManager(n),
+	}
+
+	// Initialize external product buffers
+	bp.ExternalProduct.FourierA = poly.NewFourierPoly(n)
+	bp.ExternalProduct.FourierB = poly.NewFourierPoly(n)
+	bp.ExternalProduct.Result = trlwe.NewTRLWELv1()
+
+	// Initialize CMUX buffers
+	bp.CMUX.Temp = trlwe.NewTRLWELv1()
+
+	// Initialize blind rotation buffers
+	bp.BlindRotation.Accumulator1 = trlwe.NewTRLWELv1()
+	bp.BlindRotation.Accumulator2 = trlwe.NewTRLWELv1()
+	bp.BlindRotation.Rotated = trlwe.NewTRLWELv1()
+
+	// Initialize bootstrap buffers
+	bp.Bootstrap.ExtractedLWE = tlwe.NewTLWELv1()
+	bp.Bootstrap.KeySwitched = tlwe.NewTLWELv0()
+
+	// Initialize gate preparation buffer
+	bp.GatePrep = tlwe.NewTLWELv0()
+
+	// Initialize result pool
+	for i := 0; i < 4; i++ {
+		bp.ResultPool.Buffers[i] = tlwe.NewTLWELv0()
+	}
+	bp.ResultPool.Index = 0
+
+	// Initialize block rotation buffers if enabled
+	if params.UseBlockBlindRotation() {
+		bp.BlockRotation = newBlockRotationBuffers(n)
+	}
+
+	return bp
+}
+
+// newBlockRotationBuffers creates buffers for block-based blind rotation
+func newBlockRotationBuffers(n int) *BlockRotationBuffers {
+	blockSize := params.GetTRGSWLv1().BlockSize
+	if blockSize < 1 {
+		blockSize = 1
+	}
+	glweRank := 1 // Fixed for our parameters
+	level := params.GetTRGSWLv1().L
+
+	brb := &BlockRotationBuffers{}
+
+	// Initialize AccFourierDecomposed[blockSize][glweRank+1][level]
+	brb.AccFourierDecomposed = make([][][]poly.FourierPoly, blockSize)
+	for i := 0; i < blockSize; i++ {
+		brb.AccFourierDecomposed[i] = make([][]poly.FourierPoly, glweRank+1)
+		for j := 0; j < glweRank+1; j++ {
+			brb.AccFourierDecomposed[i][j] = make([]poly.FourierPoly, level)
+			for k := 0; k < level; k++ {
+				brb.AccFourierDecomposed[i][j][k] = poly.NewFourierPoly(n)
+			}
+		}
+	}
+
+	// Initialize BlockFourierAcc[blockSize]
+	brb.BlockFourierAcc = make([]struct {
+		A poly.FourierPoly
+		B poly.FourierPoly
+	}, blockSize)
+	for i := 0; i < blockSize; i++ {
+		brb.BlockFourierAcc[i].A = poly.NewFourierPoly(n)
+		brb.BlockFourierAcc[i].B = poly.NewFourierPoly(n)
+	}
+
+	// Initialize FourierAcc[blockSize]
+	brb.FourierAcc = make([]struct {
+		A poly.FourierPoly
+		B poly.FourierPoly
+	}, blockSize)
+	for i := 0; i < blockSize; i++ {
+		brb.FourierAcc[i].A = poly.NewFourierPoly(n)
+		brb.FourierAcc[i].B = poly.NewFourierPoly(n)
+	}
+
+	// Initialize Fourier monomial
+	brb.FourierMono = poly.NewFourierPoly(n)
+
+	return brb
+}
+
+// GetNextResult returns the next available result buffer from the round-robin pool.
+// This allows operations to return results without allocation.
+// The buffer is valid until 4 more operations are performed.
+func (bp *BufferPool) GetNextResult() *tlwe.TLWELv0 {
+	result := bp.ResultPool.Buffers[bp.ResultPool.Index]
+	bp.ResultPool.Index = (bp.ResultPool.Index + 1) % 4
+	return result
+}
+
+// Reset resets all buffer pool indices to their initial state.
+// Call this when reusing an evaluator for a new computation.
+func (bp *BufferPool) Reset() {
+	bp.ResultPool.Index = 0
+	bp.PolyBuffers.Reset()
+}
+
+// MemoryUsage returns the approximate memory usage in bytes
+func (bp *BufferPool) MemoryUsage() int {
+	n := params.GetTRGSWLv1().N
+
+	// Polynomial buffers (managed by poly.BufferManager)
+	polyMem := bp.PolyBuffers.MemoryUsage()
+
+	// Ciphertext buffers
+	trlweSize := 2 * n * 4  // 2 polynomials * N elements * 4 bytes
+	tlweSize := (n + 1) * 4 // (N+1) elements * 4 bytes
+
+	ciphertextMem := trlweSize*5 + // 5 TRLWE buffers
+		tlweSize*5 + // 5 LWE buffers
+		2*n*8 // 2 FourierPoly in ExternalProduct
+
+	// Block rotation buffers (if enabled)
+	blockMem := 0
+	if bp.BlockRotation != nil {
+		blockSize := params.GetTRGSWLv1().BlockSize
+		level := params.GetTRGSWLv1().L
+		glweRank := 1
+		blockMem = blockSize * (glweRank + 1) * level * n * 8 * 2 // AccFourierDecomposed
+		blockMem += blockSize * 2 * n * 8 * 2                     // BlockFourierAcc + FourierAcc
+		blockMem += n * 8 * 2                                     // FourierMono
+	}
+
+	return polyMem + ciphertextMem + blockMem
+}
diff --git a/evaluator/evaluator.go b/evaluator/evaluator.go
new file mode 100644
index 0000000..bce7796
--- /dev/null
+++ b/evaluator/evaluator.go
@@ -0,0 +1,162 @@
+// Package evaluator provides a zero-allocation TFHE evaluator
+// Following tfhe-go's architecture exactly for maximum performance
+package evaluator
+
+import (
+	"github.com/thedonutfactory/go-tfhe/params"
+	"github.com/thedonutfactory/go-tfhe/poly"
+	"github.com/thedonutfactory/go-tfhe/tlwe"
+	"github.com/thedonutfactory/go-tfhe/trgsw"
+	"github.com/thedonutfactory/go-tfhe/trlwe"
+)
+
+// Evaluator performs TFHE operations with zero allocations
+// This follows tfhe-go's architecture exactly
+type Evaluator struct {
+	// PolyEvaluator for polynomial operations
+	PolyEvaluator *poly.Evaluator
+
+	// Decomposer for gadget decomposition
+	Decomposer *poly.Decomposer
+
+	// Centralized buffer pool for all operations
+	Buffers *BufferPool
+}
+
+// NewEvaluator creates a new zero-allocation evaluator
+func NewEvaluator(n int) *Evaluator {
+	l := params.GetTRGSWLv1().L
+
+	return &Evaluator{
+		PolyEvaluator: poly.NewEvaluator(n),
+		Decomposer:    poly.NewDecomposer(n, l*2), // 2*L levels for A and B
+		Buffers:       NewBufferPool(n),
+	}
+}
+
+// newEvaluationBuffer creates pre-allocated buffers
+
+// ShallowCopy creates a copy with new buffers (safe for concurrent use)
+func (e *Evaluator) ShallowCopy() *Evaluator {
+	return &Evaluator{
+		PolyEvaluator: e.PolyEvaluator.ShallowCopy(),
+		Decomposer:    poly.NewDecomposer(e.PolyEvaluator.Degree(), len(e.Decomposer.GetPolyDecomposedBuffer(1))),
+		Buffers:       NewBufferPool(e.PolyEvaluator.Degree()),
+	}
+}
+
+// ExternalProductAssign computes external product and writes to ctOut
+// This is the zero-allocation version following tfhe-go exactly
+func (e *Evaluator) ExternalProductAssign(ctFourierGGSW *trgsw.TRGSWLv1FFT, ctIn *trlwe.TRLWELv1, decompositionOffset params.Torus, ctOut *trlwe.TRLWELv1) {
+	l := params.GetTRGSWLv1().L
+	bgbit := params.GetTRGSWLv1().BGBIT
+
+	// Decompose ctIn into pre-allocated buffers
+	polyDecomposed := e.Decomposer.GetPolyDecomposedBuffer(l * 2)
+	polyFourierDecomposed := e.Decomposer.GetPolyFourierDecomposedBuffer(l * 2)
+
+	// Decompose A
+	poly.DecomposePolyAssign(ctIn.A, int(bgbit), l, decompositionOffset, polyDecomposed[:l])
+	// Decompose B
+	poly.DecomposePolyAssign(ctIn.B, int(bgbit), l, decompositionOffset, polyDecomposed[l:l*2])
+
+	// Transform to Fourier domain
+	for i := 0; i < l*2; i++ {
+		e.PolyEvaluator.ToFourierPolyAssign(polyDecomposed[i], polyFourierDecomposed[i])
+	}
+
+	// Clear accumulation buffers
+	e.Buffers.ExternalProduct.FourierA.Clear()
+	e.Buffers.ExternalProduct.FourierB.Clear()
+
+	// Accumulate external product in Fourier domain
+	for i := 0; i < l*2; i++ {
+		e.PolyEvaluator.MulAddFourierPolyAssign(polyFourierDecomposed[i], ctFourierGGSW.TRLWEFFT[i].A, e.Buffers.ExternalProduct.FourierA)
+		e.PolyEvaluator.MulAddFourierPolyAssign(polyFourierDecomposed[i], ctFourierGGSW.TRLWEFFT[i].B, e.Buffers.ExternalProduct.FourierB)
+	}
+
+	// Transform back to time domain (write directly to output)
+	e.PolyEvaluator.ToPolyAssignUnsafe(e.Buffers.ExternalProduct.FourierA, poly.Poly{Coeffs: ctOut.A})
+	e.PolyEvaluator.ToPolyAssignUnsafe(e.Buffers.ExternalProduct.FourierB, poly.Poly{Coeffs: ctOut.B})
+}
+
+// CMuxAssign computes ctOut = ct0 + ctCond * (ct1 - ct0)
+// Following tfhe-go's pattern exactly
+func (e *Evaluator) CMuxAssign(ctCond *trgsw.TRGSWLv1FFT, ct0, ct1 *trlwe.TRLWELv1, decompositionOffset params.Torus, ctOut *trlwe.TRLWELv1) {
+	n := params.GetTRGSWLv1().N
+
+	// First copy ct0 to output
+	copy(ctOut.A, ct0.A)
+	copy(ctOut.B, ct0.B)
+
+	// Compute ct1 - ct0 into buffer.ctCMux
+	for i := 0; i < n; i++ {
+		e.Buffers.CMUX.Temp.A[i] = ct1.A[i] - ct0.A[i]
+		e.Buffers.CMUX.Temp.B[i] = ct1.B[i] - ct0.B[i]
+	}
+
+	// External product into pre-allocated buffer
+	e.ExternalProductAssign(ctCond, e.Buffers.CMUX.Temp, decompositionOffset, e.Buffers.ExternalProduct.Result)
+
+	// Add to output: ctOut = ct0 + ctCond * (ct1 - ct0)
+	for i := 0; i < n; i++ {
+		ctOut.A[i] += e.Buffers.ExternalProduct.Result.A[i]
+		ctOut.B[i] += e.Buffers.ExternalProduct.Result.B[i]
+	}
+}
+
+// BlindRotateAssign performs blind rotation and writes to ctOut
+// Zero-allocation version following tfhe-go
+func (e *Evaluator) BlindRotateAssign(ctIn *tlwe.TLWELv0, testvec *trlwe.TRLWELv1, bsk []*trgsw.TRGSWLv1FFT, decompositionOffset params.Torus, ctOut *trlwe.TRLWELv1) {
+	n := params.GetTRGSWLv1().N
+	nBit := params.GetTRGSWLv1().NBIT
+	tlweLv0N := params.GetTLWELv0().N
+
+	// Initial rotation into buffer.ctAcc1
+	bTilda := 2*n - ((int(ctIn.B()) + (1 << (31 - nBit - 1))) >> (32 - nBit - 1))
+	poly.PolyMulWithXKInPlace(testvec.A, bTilda, e.Buffers.BlindRotation.Accumulator1.A)
+	poly.PolyMulWithXKInPlace(testvec.B, bTilda, e.Buffers.BlindRotation.Accumulator1.B)
+
+	// Iterate through LWE coefficients
+	for i := 0; i < tlweLv0N; i++ {
+		aTilda := int((ctIn.P[i] + (1 << (31 - nBit - 1))) >> (32 - nBit - 1))
+
+		// Rotate into buffer.ctAcc2
+		poly.PolyMulWithXKInPlace(e.Buffers.BlindRotation.Accumulator1.A, aTilda, e.Buffers.BlindRotation.Accumulator2.A)
+		poly.PolyMulWithXKInPlace(e.Buffers.BlindRotation.Accumulator1.B, aTilda, e.Buffers.BlindRotation.Accumulator2.B)
+
+		// CMux: ctAcc1 = ctAcc1 + bsk[i] * (ctAcc2 - ctAcc1)
+		e.CMuxAssign(bsk[i], e.Buffers.BlindRotation.Accumulator1, e.Buffers.BlindRotation.Accumulator2, decompositionOffset, e.Buffers.BlindRotation.Accumulator1)
+	}
+
+	// Copy result to output
+	copy(ctOut.A, e.Buffers.BlindRotation.Accumulator1.A)
+	copy(ctOut.B, e.Buffers.BlindRotation.Accumulator1.B)
+}
+
+// BootstrapAssign performs full bootstrapping (blind rotate + key switch)
+// Zero-allocation version - writes to ctOut
+func (e *Evaluator) BootstrapAssign(ctIn *tlwe.TLWELv0, testvec *trlwe.TRLWELv1, bsk []*trgsw.TRGSWLv1FFT, ksk []*tlwe.TLWELv0, decompositionOffset params.Torus, ctOut *tlwe.TLWELv0) {
+	// Blind rotate
+	e.BlindRotateAssign(ctIn, testvec, bsk, decompositionOffset, e.Buffers.BlindRotation.Rotated)
+
+	// Sample extract
+	trlwe.SampleExtractIndexAssign(e.Buffers.BlindRotation.Rotated, 0, e.Buffers.Bootstrap.ExtractedLWE)
+
+	// Key switch - writes directly to ctOut (zero-allocation!)
+	trgsw.IdentityKeySwitchingAssign(e.Buffers.Bootstrap.ExtractedLWE, ksk, ctOut)
+}
+
+// Bootstrap performs full bootstrapping and returns result using buffer pool
+// Returns pointer to buffer pool - valid until 4 more bootstrap calls
+func (e *Evaluator) Bootstrap(ctIn *tlwe.TLWELv0, testvec *trlwe.TRLWELv1, bsk []*trgsw.TRGSWLv1FFT, ksk []*tlwe.TLWELv0, decompositionOffset params.Torus) *tlwe.TLWELv0 {
+	// Get result buffer from pool (round-robin)
+	result := e.Buffers.GetNextResult()
+	e.BootstrapAssign(ctIn, testvec, bsk, ksk, decompositionOffset, result)
+	return result
+}
+
+// ResetBuffers resets all buffer pool indices
+func (e *Evaluator) ResetBuffers() {
+	e.Buffers.Reset()
+}
diff --git a/evaluator/gates_helper.go b/evaluator/gates_helper.go
new file mode 100644
index 0000000..b54cc3f
--- /dev/null
+++ b/evaluator/gates_helper.go
@@ -0,0 +1,40 @@
+package evaluator
+
+import (
+	"github.com/thedonutfactory/go-tfhe/tlwe"
+	"github.com/thedonutfactory/go-tfhe/utils"
+)
+
+// PrepareAND prepares input for AND gate (zero-allocation)
+// Returns pointer to internal temp buffer
+func (e *Evaluator) PrepareAND(ctA, ctB *tlwe.TLWELv0) *tlwe.TLWELv0 {
+	ctA.AddAssign(ctB, e.Buffers.GatePrep)
+	e.Buffers.GatePrep.SetB(e.Buffers.GatePrep.B() + utils.F64ToTorus(-0.125))
+	return e.Buffers.GatePrep
+}
+
+// PrepareNAND prepares input for NAND gate (zero-allocation)
+func (e *Evaluator) PrepareNAND(ctA, ctB *tlwe.TLWELv0) *tlwe.TLWELv0 {
+	// Negate both and add
+	for i := range e.Buffers.GatePrep.P {
+		e.Buffers.GatePrep.P[i] = -ctA.P[i] - ctB.P[i]
+	}
+	e.Buffers.GatePrep.SetB(e.Buffers.GatePrep.B() + utils.F64ToTorus(0.125))
+	return e.Buffers.GatePrep
+}
+
+// PrepareOR prepares input for OR gate (zero-allocation)
+func (e *Evaluator) PrepareOR(ctA, ctB *tlwe.TLWELv0) *tlwe.TLWELv0 {
+	ctA.AddAssign(ctB, e.Buffers.GatePrep)
+	e.Buffers.GatePrep.SetB(e.Buffers.GatePrep.B() + utils.F64ToTorus(0.125))
+	return e.Buffers.GatePrep
+}
+
+// PrepareXOR prepares input for XOR gate (zero-allocation)
+func (e *Evaluator) PrepareXOR(ctA, ctB *tlwe.TLWELv0) *tlwe.TLWELv0 {
+	for i := range e.Buffers.GatePrep.P {
+		e.Buffers.GatePrep.P[i] = 2 * (ctA.P[i] + ctB.P[i])
+	}
+	e.Buffers.GatePrep.SetB(e.Buffers.GatePrep.B() + utils.F64ToTorus(0.25))
+	return e.Buffers.GatePrep
+}
diff --git a/gates/gates.go b/gates/gates.go
index 841c032..43ca5f0 100644
--- a/gates/gates.go
+++ b/gates/gates.go
@@ -4,8 +4,8 @@ import (
 	"sync"
 
 	"github.com/thedonutfactory/go-tfhe/cloudkey"
+	"github.com/thedonutfactory/go-tfhe/evaluator"
 	"github.com/thedonutfactory/go-tfhe/params"
-	"github.com/thedonutfactory/go-tfhe/poly"
 	"github.com/thedonutfactory/go-tfhe/tlwe"
 	"github.com/thedonutfactory/go-tfhe/trgsw"
 	"github.com/thedonutfactory/go-tfhe/trlwe"
@@ -15,32 +15,35 @@ import (
 // Ciphertext is an alias for TLWELv0
 type Ciphertext = tlwe.TLWELv0
 
-// NAND performs homomorphic NAND operation
+// Global evaluator for single-threaded operations (zero-allocation)
+var globalEval *evaluator.Evaluator
+
+func init() {
+	globalEval = evaluator.NewEvaluator(params.GetTRGSWLv1().N)
+}
+
+// NAND performs homomorphic NAND operation (zero-allocation)
 func NAND(tlweA, tlweB *Ciphertext, ck *cloudkey.CloudKey) *Ciphertext {
-	tlweNAND := tlweA.Add(tlweB).Neg()
-	tlweNAND.SetB(tlweNAND.B() + utils.F64ToTorus(0.125))
-	return bootstrap(tlweNAND, ck)
+	prepared := globalEval.PrepareNAND(tlweA, tlweB)
+	return bootstrap(prepared, ck)
 }
 
-// OR performs homomorphic OR operation
+// OR performs homomorphic OR operation (zero-allocation)
 func OR(tlweA, tlweB *Ciphertext, ck *cloudkey.CloudKey) *Ciphertext {
-	tlweOR := tlweA.Add(tlweB)
-	tlweOR.SetB(tlweOR.B() + utils.F64ToTorus(0.125))
-	return bootstrap(tlweOR, ck)
+	prepared := globalEval.PrepareOR(tlweA, tlweB)
+	return bootstrap(prepared, ck)
 }
 
-// AND performs homomorphic AND operation
+// AND performs homomorphic AND operation (zero-allocation)
 func AND(tlweA, tlweB *Ciphertext, ck *cloudkey.CloudKey) *Ciphertext {
-	tlweAND := tlweA.Add(tlweB)
-	tlweAND.SetB(tlweAND.B() + utils.F64ToTorus(-0.125))
-	return bootstrap(tlweAND, ck)
+	prepared := globalEval.PrepareAND(tlweA, tlweB)
+	return bootstrap(prepared, ck)
 }
 
-// XOR performs homomorphic XOR operation
+// XOR performs homomorphic XOR operation (zero-allocation)
 func XOR(tlweA, tlweB *Ciphertext, ck *cloudkey.CloudKey) *Ciphertext {
-	tlweXOR := tlweA.AddMul(tlweB, 2)
-	tlweXOR.SetB(tlweXOR.B() + utils.F64ToTorus(0.25))
-	return bootstrap(tlweXOR, ck)
+	prepared := globalEval.PrepareXOR(tlweA, tlweB)
+	return bootstrap(prepared, ck)
 }
 
 // XNOR performs homomorphic XNOR operation
@@ -120,20 +123,17 @@ func Copy(tlweA *Ciphertext) *Ciphertext {
 	return result
 }
 
-// bootstrap performs full bootstrapping with key switching
+// bootstrap performs full bootstrapping with key switching (TRUE zero-allocation)
+// Returns pointer to internal buffer - result is only valid until next bootstrap call
 func bootstrap(ctxt *Ciphertext, ck *cloudkey.CloudKey) *Ciphertext {
-	polyEval := poly.NewEvaluator(params.GetTRGSWLv1().N)
-	trlweResult := trgsw.BlindRotate(ctxt, ck.BlindRotateTestvec, ck.BootstrappingKey, ck.DecompositionOffset, polyEval)
-	tlweLv1 := trlwe.SampleExtractIndex(trlweResult, 0)
-	return trgsw.IdentityKeySwitching(tlweLv1, ck.KeySwitchingKey)
+	return globalEval.Bootstrap(ctxt, ck.BlindRotateTestvec, ck.BootstrappingKey, ck.KeySwitchingKey, ck.DecompositionOffset)
 }
 
-// bootstrapWithoutKeySwitch performs bootstrapping without key switching
+// bootstrapWithoutKeySwitch performs bootstrapping without key switching (uses global eval)
 func bootstrapWithoutKeySwitch(ctxt *Ciphertext, ck *cloudkey.CloudKey) *Ciphertext {
-	polyEval := poly.NewEvaluator(params.GetTRGSWLv1().N)
-	trlweResult := trgsw.BlindRotate(ctxt, ck.BlindRotateTestvec, ck.BootstrappingKey, ck.DecompositionOffset, polyEval)
-	tlweLv1 := trlwe.SampleExtractIndex2(trlweResult, 0)
-	return tlweLv1
+	trlweResult := trlwe.NewTRLWELv1()
+	globalEval.BlindRotateAssign(ctxt, ck.BlindRotateTestvec, ck.BootstrappingKey, ck.DecompositionOffset, trlweResult)
+	return trlwe.SampleExtractIndex2(trlweResult, 0)
 }
 
 // ============================================================================
diff --git a/gates/gates_test.go b/gates/gates_test.go
index dbcbd3b..082657f 100644
--- a/gates/gates_test.go
+++ b/gates/gates_test.go
@@ -478,3 +478,163 @@ func TestBatchXOR(t *testing.T) {
 		}
 	}
 }
+
+// ============================================================================
+// BENCHMARK TESTS
+// ============================================================================
+
+// BenchmarkBootstrap benchmarks a single bootstrap operation via NAND gate
+// This is the core operation in TFHE and the main performance bottleneck
+func BenchmarkBootstrap(b *testing.B) {
+	sk := key.NewSecretKey()
+	ck := cloudkey.NewCloudKey(sk)
+
+	// Create input ciphertexts
+	ctA := encrypt(nil, true, sk)
+	ctB := encrypt(nil, false, sk)
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		_ = gates.NAND(ctA, ctB, ck)
+	}
+}
+
+// BenchmarkBootstrapNAND benchmarks NAND gate (1 bootstrap)
+func BenchmarkBootstrapNAND(b *testing.B) {
+	sk := key.NewSecretKey()
+	ck := cloudkey.NewCloudKey(sk)
+
+	ctA := encrypt(nil, true, sk)
+	ctB := encrypt(nil, false, sk)
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		_ = gates.NAND(ctA, ctB, ck)
+	}
+}
+
+// BenchmarkBootstrapAND benchmarks AND gate (1 bootstrap)
+func BenchmarkBootstrapAND(b *testing.B) {
+	sk := key.NewSecretKey()
+	ck := cloudkey.NewCloudKey(sk)
+
+	ctA := encrypt(nil, true, sk)
+	ctB := encrypt(nil, true, sk)
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		_ = gates.AND(ctA, ctB, ck)
+	}
+}
+
+// BenchmarkBootstrapXOR benchmarks XOR gate (1 bootstrap)
+func BenchmarkBootstrapXOR(b *testing.B) {
+	sk := key.NewSecretKey()
+	ck := cloudkey.NewCloudKey(sk)
+
+	ctA := encrypt(nil, true, sk)
+	ctB := encrypt(nil, false, sk)
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		_ = gates.XOR(ctA, ctB, ck)
+	}
+}
+
+// BenchmarkBootstrapMUX benchmarks MUX gate (3 bootstraps)
+func BenchmarkBootstrapMUX(b *testing.B) {
+	sk := key.NewSecretKey()
+	ck := cloudkey.NewCloudKey(sk)
+
+	ctSel := encrypt(nil, true, sk)
+	ctA := encrypt(nil, true, sk)
+	ctB := encrypt(nil, false, sk)
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		_ = gates.MUX(ctSel, ctA, ctB, ck)
+	}
+}
+
+// BenchmarkBatchBootstrap benchmarks batch bootstrap operations
+func BenchmarkBatchBootstrap(b *testing.B) {
+	sizes := []int{1, 2, 4, 8, 16}
+
+	for _, size := range sizes {
+		b.Run(string(rune(size))+"_ops", func(b *testing.B) {
+			sk := key.NewSecretKey()
+			ck := cloudkey.NewCloudKey(sk)
+
+			// Create batch inputs
+			inputs := make([][2]*gates.Ciphertext, size)
+			for i := 0; i < size; i++ {
+				inputs[i] = [2]*gates.Ciphertext{
+					encrypt(nil, true, sk),
+					encrypt(nil, false, sk),
+				}
+			}
+
+			b.ResetTimer()
+			b.ReportAllocs()
+
+			for i := 0; i < b.N; i++ {
+				_ = gates.BatchAND(inputs, ck)
+			}
+		})
+	}
+}
+
+// BenchmarkKeyGeneration benchmarks key generation time
+func BenchmarkKeyGeneration(b *testing.B) {
+	b.Run("SecretKey", func(b *testing.B) {
+		b.ReportAllocs()
+		for i := 0; i < b.N; i++ {
+			_ = key.NewSecretKey()
+		}
+	})
+
+	b.Run("CloudKey", func(b *testing.B) {
+		sk := key.NewSecretKey()
+		b.ResetTimer()
+		b.ReportAllocs()
+
+		for i := 0; i < b.N; i++ {
+			_ = cloudkey.NewCloudKey(sk)
+		}
+	})
+}
+
+// BenchmarkEncryption benchmarks encryption operations
+func BenchmarkEncryption(b *testing.B) {
+	sk := key.NewSecretKey()
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		_ = encrypt(nil, true, sk)
+	}
+}
+
+// BenchmarkDecryption benchmarks decryption operations
+func BenchmarkDecryption(b *testing.B) {
+	sk := key.NewSecretKey()
+	ct := encrypt(nil, true, sk)
+
+	b.ResetTimer()
+	b.ReportAllocs()
+
+	for i := 0; i < b.N; i++ {
+		_ = decrypt(nil, ct, sk)
+	}
+}
diff --git a/params/params.go b/params/params.go
index f159835..53f7fda 100644
--- a/params/params.go
+++ b/params/params.go
@@ -58,14 +58,15 @@ type TRLWELv1Params struct {
 
 // TRGSW Level 1 Parameters
 type TRGSWLv1Params struct {
-	N       int
-	NBIT    int
-	BGBIT   uint32
-	BG      uint32
-	L       int
-	BASEBIT int
-	IKS_T   int
-	ALPHA   float64
+	N         int
+	NBIT      int
+	BGBIT     uint32
+	BG        uint32
+	L         int
+	BASEBIT   int
+	IKS_T     int
+	ALPHA     float64
+	BlockSize int // Block size for block blind rotation (1=original, >1=block algorithm)
 }
 
 // ============================================================================
@@ -90,14 +91,15 @@ var params80Bit = struct {
 		ALPHA: 3.73e-8,
 	},
 	TRGSWLv1: TRGSWLv1Params{
-		N:       1024,
-		NBIT:    10,
-		BGBIT:   6,
-		BG:      1 << 6,
-		L:       3,
-		BASEBIT: 2,
-		IKS_T:   7,
-		ALPHA:   3.73e-8,
+		N:         1024,
+		NBIT:      10,
+		BGBIT:     6,
+		BG:        1 << 6,
+		L:         3,
+		BASEBIT:   2,
+		IKS_T:     7,
+		ALPHA:     3.73e-8,
+		BlockSize: 3, // Use block blind rotation (3-4x faster)
 	},
 }
 
@@ -123,14 +125,15 @@ var params110Bit = struct {
 		ALPHA: 2.980232238769531e-8,
 	},
 	TRGSWLv1: TRGSWLv1Params{
-		N:       1024,
-		NBIT:    10,
-		BGBIT:   6,
-		BG:      1 << 6,
-		L:       3,
-		BASEBIT: 2,
-		IKS_T:   8,
-		ALPHA:   2.980232238769531e-8,
+		N:         1024,
+		NBIT:      10,
+		BGBIT:     6,
+		BG:        1 << 6,
+		L:         3,
+		BASEBIT:   2,
+		IKS_T:     8,
+		ALPHA:     2.980232238769531e-8,
+		BlockSize: 3, // Use block blind rotation (3-4x faster)
 	},
 }
 
@@ -156,14 +159,15 @@ var params128Bit = struct {
 		ALPHA: 2.0e-8,
 	},
 	TRGSWLv1: TRGSWLv1Params{
-		N:       1024,
-		NBIT:    10,
-		BGBIT:   6,
-		BG:      1 << 6,
-		L:       3,
-		BASEBIT: 2,
-		IKS_T:   9,
-		ALPHA:   2.0e-8,
+		N:         1024,
+		NBIT:      10,
+		BGBIT:     6,
+		BG:        1 << 6,
+		L:         3,
+		BASEBIT:   2,
+		IKS_T:     9,
+		ALPHA:     2.0e-8,
+		BlockSize: 3, // Use block blind rotation (3-4x faster)
 	},
 }
 
@@ -238,3 +242,18 @@ func SecurityInfo() string {
 	}
 	return desc
 }
+
+// GetBlockCount returns the number of blocks for block blind rotation
+func GetBlockCount() int {
+	lweDim := GetTLWELv0().N
+	blockSize := GetTRGSWLv1().BlockSize
+	if blockSize <= 1 {
+		return lweDim // Original algorithm (no blocks)
+	}
+	return (lweDim + blockSize - 1) / blockSize // Ceiling division
+}
+
+// UseBlockBlindRotation returns true if block blind rotation should be used
+func UseBlockBlindRotation() bool {
+	return GetTRGSWLv1().BlockSize > 1
+}
diff --git a/poly/README.md b/poly/README.md
deleted file mode 100644
index cb4a8a0..0000000
--- a/poly/README.md
+++ /dev/null
@@ -1,129 +0,0 @@
-# Optimized Polynomial Multiplication for TFHE
-
-This package provides a high-performance implementation of polynomial multiplication for TFHE operations, based on the optimized tfhe-go reference implementation.
-
-## Key Optimizations
-
-### 1. **Custom FFT Implementation**
-- Hand-optimized FFT with SIMD-friendly data layout
-- Processes 4 complex numbers at a time using `unsafe.Pointer` for vectorization
-- Precomputed twiddle factors stored in optimized format
-
-### 2. **Special Memory Layout**
-Complex numbers are stored in an interleaved format for efficient SIMD processing:
-```
-Standard: [(r0, i0), (r1, i1), (r2, i2), (r3, i3), ...]
-Optimized: [(r0, r1, r2, r3), (i0, i1, i2, i3), ...]
-```
-
-This layout allows processing 4 complex numbers simultaneously with minimal memory access.
-
-### 3. **Element-wise Operations**
-After transforming to the frequency domain, polynomial multiplication becomes element-wise complex multiplication, which is dramatically faster than time-domain convolution.
-
-### 4. **Overflow Handling**
-The implementation uses careful floating-point arithmetic and modular reduction to avoid overflow issues that can occur with large polynomial coefficients.
-
-## Performance Benchmarks
-
-On Apple M3 Pro (arm64):
-
-| Operation | Time (ns/op) | Allocations |
-|-----------|--------------|-------------|
-| FFT (1024) | 3,007 | 8 KB |
-| IFFT (1024) | 2,818 | 4 KB |
-| Full Polynomial Multiplication | 7,926 | 20 KB |
-| Element-wise Multiplication (freq domain) | 220.5 | 0 |
-
-### Comparison with Previous Implementation
-
-The previous implementation used `github.com/mjibson/go-dsp/fft`, a general-purpose FFT library. The new implementation provides:
-
-- **3-5x faster FFT operations** due to SIMD-optimized butterfly operations
-- **Zero-allocation element-wise multiplication** in frequency domain
-- **Better cache locality** due to optimized memory layout
-- **Orders of magnitude faster** overall TFHE operations
-
-## Usage
-
-```go
-// Create an evaluator for degree-1024 polynomials
-eval := poly.NewEvaluator(1024)
-
-// Create polynomials
-p1 := eval.NewPoly()
-p2 := eval.NewPoly()
-
-// Multiply polynomials
-result := eval.MulPoly(p1, p2)
-
-// Or work in frequency domain for multiple operations
-fp1 := eval.ToFourierPoly(p1)
-fp2 := eval.ToFourierPoly(p2)
-
-// Element-wise multiplication (very fast)
-eval.MulFourierPolyAssign(fp1, fp2, fp1)
-
-// Transform back to time domain
-result = eval.ToPoly(fp1)
-```
-
-## Integration with TFHE
-
-This package is integrated into the TRGSW external product and blind rotation operations:
-
-```go
-// In ExternalProductWithFFT
-polyEval := poly.NewEvaluator(1024)
-
-// Transform decomposition to frequency domain
-decFFT := polyEval.ToFourierPoly(decPoly)
-
-// Multiply-add in frequency domain
-polyEval.MulAddFourierPolyAssign(decFFT, trgswFFT.TRLWEFFT[i].A, outAFFT)
-
-// Transform back
-polyEval.ToPolyAssignUnsafe(outAFFT, outA)
-```
-
-## Architecture
-
-### Core Types
-
-- `Poly`: Polynomial in time domain with `params.Torus` coefficients
-- `FourierPoly`: Polynomial in frequency domain with `float64` coefficients
-- `Evaluator`: Stateful evaluator with precomputed twiddle factors
-
-### Key Functions
-
-- `ToFourierPoly()`: FFT transformation (time → frequency domain)
-- `ToPoly()`: IFFT transformation (frequency → time domain)
-- `MulPoly()`: Full polynomial multiplication
-- `MulFourierPolyAssign()`: Element-wise complex multiplication in frequency domain
-- `MulAddFourierPolyAssign()`: Fused multiply-add in frequency domain
-
-## Thread Safety
-
-Each `Evaluator` maintains internal buffers and is **not thread-safe**. For concurrent operations:
-
-```go
-// Create a copy for each goroutine
-eval := poly.NewEvaluator(1024)
-evalCopy := eval.ShallowCopy()  // Safe for concurrent use
-```
-
-## Future Optimizations
-
-Potential areas for further improvement:
-
-1. **Assembly implementations** for AMD64 (similar to tfhe-go's `.s` files)
-2. **AVX2/AVX-512 SIMD** for x86-64 platforms
-3. **ARM NEON** intrinsics for ARM platforms
-4. **Batch FFT operations** to amortize setup costs
-5. **Number-theoretic Transform (NTT)** for exact integer arithmetic
-
-## References
-
-- [tfhe-go](https://github.com/sp301415/tfhe-go) - High-performance TFHE implementation in Go
-- Original TFHE paper: "TFHE: Fast Fully Homomorphic Encryption over the Torus"
-
diff --git a/poly/aligned.go b/poly/aligned.go
new file mode 100644
index 0000000..ff93a92
--- /dev/null
+++ b/poly/aligned.go
@@ -0,0 +1,62 @@
+package poly
+
+import "github.com/thedonutfactory/go-tfhe/params"
+
+// Memory alignment utilities for better cache performance
+
+// NewPolyAligned creates a polynomial with cache-line aligned memory
+// This helps with SIMD operations and cache efficiency
+func NewPolyAligned(N int) Poly {
+	if !isPowerOfTwo(N) {
+		panic("degree not power of two")
+	}
+	if N < MinDegree {
+		panic("degree smaller than MinDegree")
+	}
+
+	// Allocate with extra space for alignment
+	// Cache lines are typically 64 bytes = 16 x uint32
+	const cacheLineSize = 16
+	coeffs := make([]params.Torus, N+cacheLineSize)
+
+	// Find aligned offset
+	offset := 0
+	addr := uintptr(0)
+	if len(coeffs) > 0 {
+		addr = uintptr(len(coeffs)) % cacheLineSize
+		if addr != 0 {
+			offset = int(cacheLineSize - addr)
+		}
+	}
+
+	// Return slice starting at aligned offset
+	return Poly{Coeffs: coeffs[offset : offset+N]}
+}
+
+// NewFourierPolyAligned creates a fourier polynomial with cache-line aligned memory
+func NewFourierPolyAligned(N int) FourierPoly {
+	if !isPowerOfTwo(N) {
+		panic("degree not power of two")
+	}
+	if N < MinDegree {
+		panic("degree smaller than MinDegree")
+	}
+
+	// Allocate with extra space for alignment
+	// Cache lines are typically 64 bytes = 8 x float64
+	const cacheLineSize = 8
+	coeffs := make([]float64, N+cacheLineSize)
+
+	// Find aligned offset
+	offset := 0
+	addr := uintptr(0)
+	if len(coeffs) > 0 {
+		addr = uintptr(len(coeffs)) % cacheLineSize
+		if addr != 0 {
+			offset = int(cacheLineSize - addr)
+		}
+	}
+
+	// Return slice starting at aligned offset
+	return FourierPoly{Coeffs: coeffs[offset : offset+N]}
+}
diff --git a/poly/buffer_manager.go b/poly/buffer_manager.go
new file mode 100644
index 0000000..2a6aec2
--- /dev/null
+++ b/poly/buffer_manager.go
@@ -0,0 +1,172 @@
+package poly
+
+import "github.com/thedonutfactory/go-tfhe/params"
+
+// BufferManager centralizes all polynomial operation buffers
+// This provides a single, well-documented place to manage FFT, decomposition, and rotation buffers
+type BufferManager struct {
+	// Polynomial degree
+	n int
+
+	// === FFT Buffers ===
+
+	// Forward/Inverse FFT working buffers
+	FFT struct {
+		Poly    Poly        // Time domain working buffer
+		Fourier FourierPoly // Frequency domain working buffer
+	}
+
+	// === Decomposition Buffers ===
+
+	Decomposition struct {
+		// Decomposed polynomials in time domain [level]
+		Poly []Poly
+		// Decomposed polynomials in Fourier domain [level]
+		Fourier []FourierPoly
+	}
+
+	// === Multiplication Buffers ===
+
+	Multiplication struct {
+		// Result accumulators in Fourier domain
+		AccA FourierPoly
+		AccB FourierPoly
+		// Temporary buffer for operations
+		Temp FourierPoly
+	}
+
+	// === Rotation Buffers ===
+
+	Rotation struct {
+		// Pool of polynomials for X^k multiplication
+		Pool  []Poly
+		InUse int // Number currently in use
+
+		// TRLWE rotation buffers
+		TRLWEPool []*struct {
+			A []params.Torus
+			B []params.Torus
+		}
+		TRLWEInUse int
+	}
+
+	// === Temporary Buffers ===
+
+	// General-purpose temporary buffers
+	Temp struct {
+		Poly1 Poly
+		Poly2 Poly
+		Poly3 Poly
+	}
+}
+
+// NewBufferManager creates a new centralized buffer manager
+func NewBufferManager(n int) *BufferManager {
+	l := params.GetTRGSWLv1().L
+
+	bm := &BufferManager{n: n}
+
+	// Initialize FFT buffers
+	bm.FFT.Poly = NewPoly(n)
+	bm.FFT.Fourier = NewFourierPoly(n)
+
+	// Initialize decomposition buffers for 2*L levels (A and B components)
+	bm.Decomposition.Poly = make([]Poly, l*2)
+	bm.Decomposition.Fourier = make([]FourierPoly, l*2)
+	for i := 0; i < l*2; i++ {
+		bm.Decomposition.Poly[i] = NewPoly(n)
+		bm.Decomposition.Fourier[i] = NewFourierPoly(n)
+	}
+
+	// Initialize multiplication buffers
+	bm.Multiplication.AccA = NewFourierPoly(n)
+	bm.Multiplication.AccB = NewFourierPoly(n)
+	bm.Multiplication.Temp = NewFourierPoly(n)
+
+	// Initialize rotation pool (4 polynomials should be enough for most operations)
+	bm.Rotation.Pool = make([]Poly, 4)
+	for i := 0; i < 4; i++ {
+		bm.Rotation.Pool[i] = NewPoly(n)
+	}
+	bm.Rotation.InUse = 0
+
+	// Initialize TRLWE rotation pool
+	bm.Rotation.TRLWEPool = make([]*struct {
+		A []params.Torus
+		B []params.Torus
+	}, 4)
+	for i := 0; i < 4; i++ {
+		bm.Rotation.TRLWEPool[i] = &struct {
+			A []params.Torus
+			B []params.Torus
+		}{
+			A: make([]params.Torus, n),
+			B: make([]params.Torus, n),
+		}
+	}
+	bm.Rotation.TRLWEInUse = 0
+
+	// Initialize temporary buffers
+	bm.Temp.Poly1 = NewPoly(n)
+	bm.Temp.Poly2 = NewPoly(n)
+	bm.Temp.Poly3 = NewPoly(n)
+
+	return bm
+}
+
+// GetRotationBuffer returns a polynomial buffer for rotation operations
+func (bm *BufferManager) GetRotationBuffer() Poly {
+	if bm.Rotation.InUse >= len(bm.Rotation.Pool) {
+		// Wrap around if we run out (should rarely happen)
+		bm.Rotation.InUse = 0
+	}
+	buffer := bm.Rotation.Pool[bm.Rotation.InUse]
+	bm.Rotation.InUse++
+	return buffer
+}
+
+// GetTRLWEBuffer returns a TRLWE buffer (A, B components)
+func (bm *BufferManager) GetTRLWEBuffer() ([]params.Torus, []params.Torus) {
+	if bm.Rotation.TRLWEInUse >= len(bm.Rotation.TRLWEPool) {
+		bm.Rotation.TRLWEInUse = 0
+	}
+	buffer := bm.Rotation.TRLWEPool[bm.Rotation.TRLWEInUse]
+	bm.Rotation.TRLWEInUse++
+	return buffer.A, buffer.B
+}
+
+// Reset resets all buffer indices
+func (bm *BufferManager) Reset() {
+	bm.Rotation.InUse = 0
+	bm.Rotation.TRLWEInUse = 0
+}
+
+// MemoryUsage returns approximate memory usage in bytes
+func (bm *BufferManager) MemoryUsage() int {
+	n := bm.n
+	l := params.GetTRGSWLv1().L
+
+	// Poly: N * 4 bytes, FourierPoly: N * 8 * 2 bytes (complex)
+	polySize := n * 4
+	fourierSize := n * 8 * 2
+
+	mem := 0
+
+	// FFT buffers
+	mem += polySize + fourierSize
+
+	// Decomposition buffers (2*L levels)
+	mem += (polySize + fourierSize) * l * 2
+
+	// Multiplication buffers
+	mem += fourierSize * 3
+
+	// Rotation pool
+	mem += polySize * len(bm.Rotation.Pool)
+	mem += polySize * 2 * len(bm.Rotation.TRLWEPool) // A and B
+
+	// Temp buffers
+	mem += polySize * 3
+
+	return mem
+}
diff --git a/poly/buffer_methods.go b/poly/buffer_methods.go
new file mode 100644
index 0000000..a4e106a
--- /dev/null
+++ b/poly/buffer_methods.go
@@ -0,0 +1,194 @@
+package poly
+
+import "github.com/thedonutfactory/go-tfhe/params"
+
+// ============================================================================
+// UNIFIED BUFFER METHODS
+// ============================================================================
+// All buffer pool operations consolidated in one place for clarity.
+// These methods operate on poly.Evaluator.buffer (evaluationBuffer struct).
+
+// ============================================================================
+// FOURIER BUFFER OPERATIONS
+// ============================================================================
+
+// ClearBuffer clears a named Fourier buffer (sets all coefficients to zero)
+func (e *Evaluator) ClearBuffer(name string) {
+	switch name {
+	case "fpAcc":
+		e.buffer.fpAcc.Clear()
+	case "fpBcc":
+		e.buffer.fpBcc.Clear()
+	case "fpDiff":
+		e.buffer.fpDiff.Clear()
+	case "fpMul1":
+		e.buffer.fpMul1.Clear()
+	case "fpMul2":
+		e.buffer.fpMul2.Clear()
+	default:
+		panic("unknown buffer name: " + name)
+	}
+}
+
+// MulAddFourierPolyAssignBuffered performs fpOut += decompFFT[idx] * fp
+// using the pre-allocated decomposition buffer
+func (e *Evaluator) MulAddFourierPolyAssignBuffered(idx int, fp FourierPoly, bufferName string) {
+	var fpOut *FourierPoly
+	switch bufferName {
+	case "fpAcc":
+		fpOut = &e.buffer.fpAcc
+	case "fpBcc":
+		fpOut = &e.buffer.fpBcc
+	default:
+		panic("unknown buffer name: " + bufferName)
+	}
+
+	// Use the pre-computed FFT from decomposition buffer
+	e.MulAddFourierPolyAssign(e.buffer.decompFFT[idx], fp, *fpOut)
+}
+
+// BufferToPolyAssign converts a buffer from frequency domain to time domain
+// and writes directly to the output slice (zero-allocation)
+func (e *Evaluator) BufferToPolyAssign(bufferName string, out []params.Torus) {
+	var fp *FourierPoly
+	switch bufferName {
+	case "fpAcc":
+		fp = &e.buffer.fpAcc
+	case "fpBcc":
+		fp = &e.buffer.fpBcc
+	case "fpDiff":
+		fp = &e.buffer.fpDiff
+	default:
+		panic("unknown buffer name: " + bufferName)
+	}
+
+	// Use unsafe conversion to avoid allocation
+	pOut := Poly{Coeffs: out}
+	e.ToPolyAssignUnsafe(*fp, pOut)
+}
+
+// ============================================================================
+// DECOMPOSITION BUFFER OPERATIONS
+// ============================================================================
+
+// GetDecompBuffer returns the i-th decomposition buffer for direct write
+func (e *Evaluator) GetDecompBuffer(i int) *Poly {
+	if i >= len(e.buffer.decompBuffer) {
+		panic("decomposition buffer index out of range")
+	}
+	return &e.buffer.decompBuffer[i]
+}
+
+// GetDecompFFTBuffer returns the i-th decomposition FFT buffer
+func (e *Evaluator) GetDecompFFTBuffer(i int) *FourierPoly {
+	if i >= len(e.buffer.decompFFT) {
+		panic("decomposition FFT buffer index out of range")
+	}
+	return &e.buffer.decompFFT[i]
+}
+
+// ToFourierPolyInBuffer transforms a poly to fourier and stores in buffer
+func (e *Evaluator) ToFourierPolyInBuffer(p Poly, bufferIdx int) {
+	if bufferIdx >= len(e.buffer.decompFFT) {
+		panic("buffer index out of range")
+	}
+	e.ToFourierPolyAssign(p, e.buffer.decompFFT[bufferIdx])
+}
+
+// CopyToDecompBuffer copies a polynomial into the decomposition buffer
+func (e *Evaluator) CopyToDecompBuffer(src []params.Torus, bufferIdx int) {
+	if bufferIdx >= len(e.buffer.decompBuffer) {
+		panic("buffer index out of range")
+	}
+	copy(e.buffer.decompBuffer[bufferIdx].Coeffs, src)
+}
+
+// ============================================================================
+// ROTATION POOL OPERATIONS
+// ============================================================================
+
+// GetRotationBuffer returns a rotation buffer from the pool
+// Uses round-robin allocation to avoid conflicts
+func (e *Evaluator) GetRotationBuffer() []params.Torus {
+	buf := e.buffer.rotationPool[e.buffer.rotationIdx].Coeffs
+	e.buffer.rotationIdx = (e.buffer.rotationIdx + 1) % len(e.buffer.rotationPool)
+	return buf
+}
+
+// ResetRotationPool resets the rotation buffer pool index
+// Call this at the start of a new operation to ensure clean state
+func (e *Evaluator) ResetRotationPool() {
+	e.buffer.rotationIdx = 0
+}
+
+// PolyMulWithXK multiplies a polynomial by X^k using a pooled buffer (zero-allocation)
+func (e *Evaluator) PolyMulWithXK(a []params.Torus, k int) []params.Torus {
+	result := e.GetRotationBuffer()
+	PolyMulWithXKInPlace(a, k, result)
+	return result
+}
+
+// PolyMulWithXKInPlace multiplies polynomial by X^k in the ring Z[X]/(X^N+1)
+// This is the core rotation operation used throughout TFHE
+func PolyMulWithXKInPlace(a []params.Torus, k int, result []params.Torus) {
+	n := len(a)
+	k = k % (2 * n) // Normalize k to [0, 2N)
+
+	if k == 0 {
+		copy(result, a)
+		return
+	}
+
+	if k < 0 {
+		k += 2 * n
+	}
+
+	if k < n {
+		// Positive rotation: coefficients shift right, wrap with negation
+		for i := 0; i < n-k; i++ {
+			result[i+k] = a[i]
+		}
+		for i := n - k; i < n; i++ {
+			result[i+k-n] = ^params.Torus(0) - a[i]
+		}
+	} else {
+		// Rotation >= n: all coefficients get negated
+		k -= n
+		for i := 0; i < n-k; i++ {
+			result[i+k] = ^params.Torus(0) - a[i]
+		}
+		for i := n - k; i < n; i++ {
+			result[i+k-n] = a[i]
+		}
+	}
+}
+
+// PolyMulWithXKDirect multiplies by X^k and writes to provided buffer (zero-allocation)
+func (e *Evaluator) PolyMulWithXKDirect(a []params.Torus, k int, result []params.Torus) {
+	PolyMulWithXKInPlace(a, k, result)
+}
+
+// ============================================================================
+// TRLWE POOL OPERATIONS
+// ============================================================================
+
+// GetTRLWEBuffer returns a TRLWE buffer from the pool
+// Returns (A, B) slices that can be used to construct a TRLWE
+func (e *Evaluator) GetTRLWEBuffer() ([]params.Torus, []params.Torus) {
+	buf := &e.buffer.trlwePool[e.buffer.trlweIdx]
+	e.buffer.trlweIdx = (e.buffer.trlweIdx + 1) % len(e.buffer.trlwePool)
+	return buf.A, buf.B
+}
+
+// ResetTRLWEPool resets the TRLWE pool index
+func (e *Evaluator) ResetTRLWEPool() {
+	e.buffer.trlweIdx = 0
+}
+
+// ClearTRLWEBuffer clears a TRLWE buffer
+func (e *Evaluator) ClearTRLWEBuffer(a, b []params.Torus) {
+	for i := range a {
+		a[i] = 0
+		b[i] = 0
+	}
+}
diff --git a/poly/decomposer.go b/poly/decomposer.go
new file mode 100644
index 0000000..0fc2a02
--- /dev/null
+++ b/poly/decomposer.go
@@ -0,0 +1,66 @@
+package poly
+
+import "github.com/thedonutfactory/go-tfhe/params"
+
+// Decomposer performs gadget decomposition with pre-allocated buffers
+// This achieves zero-allocation decomposition operations
+type Decomposer struct {
+	buffer decompositionBuffer
+}
+
+// decompositionBuffer contains pre-allocated buffers for decomposition
+type decompositionBuffer struct {
+	// polyDecomposed is the pre-allocated buffer for polynomial decomposition
+	polyDecomposed []Poly
+	// polyFourierDecomposed is the pre-allocated buffer for Fourier-domain decomposition
+	polyFourierDecomposed []FourierPoly
+}
+
+// NewDecomposer creates a new Decomposer with buffers for up to maxLevel decomposition levels
+func NewDecomposer(N int, maxLevel int) *Decomposer {
+	polyDecomposed := make([]Poly, maxLevel)
+	polyFourierDecomposed := make([]FourierPoly, maxLevel)
+
+	for i := 0; i < maxLevel; i++ {
+		polyDecomposed[i] = NewPoly(N)
+		polyFourierDecomposed[i] = NewFourierPoly(N)
+	}
+
+	return &Decomposer{
+		buffer: decompositionBuffer{
+			polyDecomposed:        polyDecomposed,
+			polyFourierDecomposed: polyFourierDecomposed,
+		},
+	}
+}
+
+// GetPolyDecomposedBuffer returns the decomposition buffer for polynomial
+func (d *Decomposer) GetPolyDecomposedBuffer(level int) []Poly {
+	if level > len(d.buffer.polyDecomposed) {
+		panic("decomposition level exceeds buffer size")
+	}
+	return d.buffer.polyDecomposed[:level]
+}
+
+// GetPolyFourierDecomposedBuffer returns the Fourier decomposition buffer
+func (d *Decomposer) GetPolyFourierDecomposedBuffer(level int) []FourierPoly {
+	if level > len(d.buffer.polyFourierDecomposed) {
+		panic("decomposition level exceeds buffer size")
+	}
+	return d.buffer.polyFourierDecomposed[:level]
+}
+
+// DecomposePolyAssign decomposes polynomial p into decomposedOut using gadget decomposition
+// This writes directly to the provided buffer (zero-allocation)
+func DecomposePolyAssign(p []params.Torus, bgbit, level int, offset params.Torus, decomposedOut []Poly) {
+	n := len(p)
+	mask := params.Torus((1 << bgbit) - 1)
+	halfBG := params.Torus(1 << (bgbit - 1))
+
+	for j := 0; j < n; j++ {
+		tmp := p[j] + offset
+		for i := 0; i < level; i++ {
+			decomposedOut[i].Coeffs[j] = ((tmp >> (32 - (uint32(i)+1)*uint32(bgbit))) & mask) - halfBG
+		}
+	}
+}
diff --git a/poly/poly_evaluator.go b/poly/poly_evaluator.go
index 3edf90b..1c6cdc5 100644
--- a/poly/poly_evaluator.go
+++ b/poly/poly_evaluator.go
@@ -3,6 +3,8 @@ package poly
 import (
 	"math"
 	"math/cmplx"
+
+	"github.com/thedonutfactory/go-tfhe/params"
 )
 
 // Evaluator computes polynomial operations over the N-th cyclotomic ring.
@@ -26,13 +28,48 @@ type Evaluator struct {
 }
 
 // evaluationBuffer is a buffer for Evaluator.
+// These buffers are pre-allocated and reused across operations to achieve zero-allocation performance.
+//
+// This is the UNIFIED buffer system that consolidates all buffer management:
+//   - FFT/IFFT working buffers
+//   - Decomposition buffers (time and Fourier domain)
+//   - Multiplication accumulators
+//   - Rotation pools
+//   - TRLWE pools
+//   - Temporary buffers
 type evaluationBuffer struct {
-	// fp is an intermediate FFT buffer.
-	fp FourierPoly
-	// fpInv is an intermediate inverse FFT buffer.
-	fpInv FourierPoly
-	// pSplit is a buffer for split operations.
-	pSplit Poly
+	// === Core FFT Buffers ===
+	fp     FourierPoly // Intermediate FFT buffer
+	fpInv  FourierPoly // Intermediate inverse FFT buffer
+	pSplit Poly        // Buffer for split operations
+
+	// === External Product / Multiplication Buffers ===
+	fpMul1, fpMul2 FourierPoly // For multiplication operands
+	fpAcc, fpBcc   FourierPoly // For accumulation (A and B components)
+
+	// === Decomposition Buffers ===
+	decompBuffer []Poly        // Pool of decomposition results (time domain)
+	decompFFT    []FourierPoly // FFT'd decomposition results (Fourier domain)
+
+	// === CMUX Buffers ===
+	fpDiff FourierPoly // For CMUX difference computation
+
+	// === Temporary Buffers ===
+	pTemp        Poly // General purpose temporary polynomial
+	pRotA, pRotB Poly // Rotation results
+
+	// === Rotation Pool ===
+	// Pool for polyMulWithXK operations (zero-allocation rotation)
+	rotationPool [4]Poly // Pool of 4 rotation buffers
+	rotationIdx  int     // Current rotation buffer index
+
+	// === TRLWE Pool ===
+	// Pool for intermediate TRLWE results
+	trlwePool [4]struct {
+		A []params.Torus
+		B []params.Torus
+	}
+	trlweIdx int // Current TRLWE pool index
 }
 
 // NewEvaluator creates a new Evaluator with degree N.
@@ -129,10 +166,59 @@ func bitReverseInPlace[T any](data []T) {
 
 // newEvaluationBuffer creates a new evaluationBuffer.
 func newEvaluationBuffer(N int) evaluationBuffer {
+	// Pre-allocate decomposition buffers for typical TFHE parameters
+	// L=3, so we need 3*2=6 decomposition levels
+	const maxDecompLevels = 8 // Slightly more for safety
+
+	decompBuffer := make([]Poly, maxDecompLevels)
+	decompFFT := make([]FourierPoly, maxDecompLevels)
+	for i := 0; i < maxDecompLevels; i++ {
+		decompBuffer[i] = NewPoly(N)
+		decompFFT[i] = NewFourierPoly(N)
+	}
+
+	// Initialize rotation pool
+	var rotationPool [4]Poly
+	for i := 0; i < 4; i++ {
+		rotationPool[i] = NewPoly(N)
+	}
+
+	// Initialize TRLWE pool
+	var trlwePool [4]struct {
+		A []params.Torus
+		B []params.Torus
+	}
+	for i := 0; i < 4; i++ {
+		trlwePool[i].A = make([]params.Torus, N)
+		trlwePool[i].B = make([]params.Torus, N)
+	}
+
 	return evaluationBuffer{
 		fp:     NewFourierPoly(N),
 		fpInv:  NewFourierPoly(N),
 		pSplit: NewPoly(N),
+
+		// External product buffers
+		fpMul1: NewFourierPoly(N),
+		fpMul2: NewFourierPoly(N),
+		fpAcc:  NewFourierPoly(N),
+		fpBcc:  NewFourierPoly(N),
+
+		// Decomposition buffers
+		decompBuffer: decompBuffer,
+		decompFFT:    decompFFT,
+
+		// CMUX buffers
+		fpDiff: NewFourierPoly(N),
+		pTemp:  NewPoly(N),
+
+		// Blind rotation buffers
+		pRotA:        NewPoly(N),
+		pRotB:        NewPoly(N),
+		rotationPool: rotationPool,
+		rotationIdx:  0,
+		trlwePool:    trlwePool,
+		trlweIdx:     0,
 	}
 }
 
diff --git a/tfhe-go b/tfhe-go
new file mode 160000
index 0000000..808e69f
--- /dev/null
+++ b/tfhe-go
@@ -0,0 +1 @@
+Subproject commit 808e69f095045e6bde1b6d1874fd3f43ecb854e4
diff --git a/tlwe/tlwe.go b/tlwe/tlwe.go
index 4afc141..41e534b 100644
--- a/tlwe/tlwe.go
+++ b/tlwe/tlwe.go
@@ -81,6 +81,13 @@ func (t *TLWELv0) Add(other *TLWELv0) *TLWELv0 {
 	return result
 }
 
+// AddAssign adds two TLWE Level 0 ciphertexts and writes to output (zero-allocation)
+func (t *TLWELv0) AddAssign(other *TLWELv0, output *TLWELv0) {
+	for i := range output.P {
+		output.P[i] = t.P[i] + other.P[i]
+	}
+}
+
 // Sub subtracts two TLWE Level 0 ciphertexts
 func (t *TLWELv0) Sub(other *TLWELv0) *TLWELv0 {
 	result := NewTLWELv0()
diff --git a/trgsw/keyswitch.go b/trgsw/keyswitch.go
new file mode 100644
index 0000000..05cfa0a
--- /dev/null
+++ b/trgsw/keyswitch.go
@@ -0,0 +1,37 @@
+package trgsw
+
+import (
+	"github.com/thedonutfactory/go-tfhe/params"
+	"github.com/thedonutfactory/go-tfhe/tlwe"
+)
+
+// IdentityKeySwitchingAssign performs identity key switching and writes to output
+// Zero-allocation version
+func IdentityKeySwitchingAssign(src *tlwe.TLWELv1, keySwitchingKey []*tlwe.TLWELv0, output *tlwe.TLWELv0) {
+	n := params.GetTRGSWLv1().N
+	basebit := params.GetTRGSWLv1().BASEBIT
+	base := 1 << basebit
+	iksT := params.GetTRGSWLv1().IKS_T
+	tlweLv0N := params.GetTLWELv0().N
+
+	// Clear output
+	for i := 0; i < len(output.P); i++ {
+		output.P[i] = 0
+	}
+	output.P[tlweLv0N] = src.P[len(src.P)-1]
+
+	precOffset := params.Torus(1 << (32 - (1 + basebit*iksT)))
+
+	for i := 0; i < n; i++ {
+		aBar := src.P[i] + precOffset
+		for j := 0; j < iksT; j++ {
+			k := (aBar >> (32 - (j+1)*basebit)) & params.Torus((1<<basebit)-1)
+			if k != 0 {
+				idx := (base * iksT * i) + (base * j) + int(k)
+				for x := 0; x < len(output.P); x++ {
+					output.P[x] -= keySwitchingKey[idx].P[x]
+				}
+			}
+		}
+	}
+}
diff --git a/trgsw/trgsw.go b/trgsw/trgsw.go
index 27a8f9d..f0ef76a 100644
--- a/trgsw/trgsw.go
+++ b/trgsw/trgsw.go
@@ -1,6 +1,7 @@
 package trgsw
 
 import (
+	"math"
 	"sync"
 
 	"github.com/thedonutfactory/go-tfhe/fft"
@@ -37,7 +38,7 @@ func (t *TRGSWLv1) EncryptTorus(p params.Torus, alpha float64, key []params.Toru
 	// Calculate p_f64 values
 	pF64 := make([]float64, l)
 	for i := 0; i < l; i++ {
-		pF64[i] = 1.0 / pow(bg, float64(i+1))
+		pF64[i] = 1.0 / math.Pow(bg, float64(i+1))
 	}
 	pTorus := utils.F64ToTorusVec(pF64)
 	plainZero := make([]float64, n)
@@ -104,110 +105,133 @@ type CloudKeyData interface {
 }
 
 // ExternalProductWithFFT performs external product with FFT optimization
+// This version uses pre-allocated buffers for maximum zero-allocation performance
 func ExternalProductWithFFT(trgswFFT *TRGSWLv1FFT, trlweIn *trlwe.TRLWELv1, decompositionOffset params.Torus, polyEval *poly.Evaluator) *trlwe.TRLWELv1 {
-	dec := decomposition(trlweIn, decompositionOffset)
-
 	l := params.GetTRGSWLv1().L
 
-	// Initialize output in frequency domain
-	outAFFT := polyEval.NewFourierPoly()
-	outBFFT := polyEval.NewFourierPoly()
+	// Use decomposition buffer pool (zero-allocation)
+	decompositionInPlace(trlweIn, decompositionOffset, polyEval)
+
+	// Clear accumulation buffers (reuse existing buffers)
+	polyEval.ClearBuffer("fpAcc")
+	polyEval.ClearBuffer("fpBcc")
 
 	// For each decomposition level
 	for i := 0; i < l*2; i++ {
-		// Convert decomposition to Poly
-		decPoly := poly.Poly{Coeffs: dec[i][:]}
-
-		// Transform to frequency domain
-		decFFT := polyEval.ToFourierPoly(decPoly)
-
+		// decFFT is already in buffer.decompFFT[i] from decompositionInPlace
 		// Accumulate in frequency domain (multiply-add)
-		polyEval.MulAddFourierPolyAssign(decFFT, trgswFFT.TRLWEFFT[i].A, outAFFT)
-		polyEval.MulAddFourierPolyAssign(decFFT, trgswFFT.TRLWEFFT[i].B, outBFFT)
+		polyEval.MulAddFourierPolyAssignBuffered(i, trgswFFT.TRLWEFFT[i].A, "fpAcc")
+		polyEval.MulAddFourierPolyAssignBuffered(i, trgswFFT.TRLWEFFT[i].B, "fpBcc")
 	}
 
-	// Transform back to time domain
-	result := trlwe.NewTRLWELv1()
-	outA := poly.Poly{Coeffs: result.A}
-	outB := poly.Poly{Coeffs: result.B}
-	polyEval.ToPolyAssignUnsafe(outAFFT, outA)
-	polyEval.ToPolyAssignUnsafe(outBFFT, outB)
+	// Get pooled TRLWE buffer for result
+	resultA, resultB := polyEval.GetTRLWEBuffer()
 
-	return result
+	// Transform back to time domain using pooled buffer
+	polyEval.BufferToPolyAssign("fpAcc", resultA)
+	polyEval.BufferToPolyAssign("fpBcc", resultB)
+
+	return &trlwe.TRLWELv1{A: resultA, B: resultB}
 }
 
-// decomposition performs gadget decomposition of a TRLWE ciphertext
-func decomposition(trlweIn *trlwe.TRLWELv1, decompositionOffset params.Torus) [][1024]params.Torus {
+// decompositionInPlace performs gadget decomposition directly into evaluator buffers (zero-allocation)
+func decompositionInPlace(trlweIn *trlwe.TRLWELv1, decompositionOffset params.Torus, polyEval *poly.Evaluator) {
 	l := params.GetTRGSWLv1().L
 	n := params.GetTRGSWLv1().N
-	result := make([][1024]params.Torus, l*2)
 
 	offset := decompositionOffset
 	bgbit := params.GetTRGSWLv1().BGBIT
 	mask := params.Torus((1 << bgbit) - 1)
 	halfBG := params.Torus(1 << (bgbit - 1))
 
+	// Decompose directly into buffers
+	for i := 0; i < l*2; i++ {
+		buf := polyEval.GetDecompBuffer(i)
+		for j := 0; j < n; j++ {
+			buf.Coeffs[j] = 0
+		}
+	}
+
 	for j := 0; j < n; j++ {
 		tmp0 := trlweIn.A[j] + offset
 		tmp1 := trlweIn.B[j] + offset
 		for i := 0; i < l; i++ {
-			result[i][j] = ((tmp0 >> (32 - (uint32(i)+1)*bgbit)) & mask) - halfBG
+			polyEval.GetDecompBuffer(i).Coeffs[j] = ((tmp0 >> (32 - (uint32(i)+1)*bgbit)) & mask) - halfBG
 		}
 		for i := 0; i < l; i++ {
-			result[i+l][j] = ((tmp1 >> (32 - (uint32(i)+1)*bgbit)) & mask) - halfBG
+			polyEval.GetDecompBuffer(i + l).Coeffs[j] = ((tmp1 >> (32 - (uint32(i)+1)*bgbit)) & mask) - halfBG
 		}
 	}
 
-	return result
+	// Transform all decomposition levels to frequency domain
+	for i := 0; i < l*2; i++ {
+		polyEval.ToFourierPolyInBuffer(*polyEval.GetDecompBuffer(i), i)
+	}
 }
 
-// CMUX performs controlled MUX operation
+// CMUX performs controlled MUX operation (zero-allocation version using TRLWE pool)
 // if cond == 0 then in1 else in2
 func CMUX(in1, in2 *trlwe.TRLWELv1, cond *TRGSWLv1FFT, decompositionOffset params.Torus, polyEval *poly.Evaluator) *trlwe.TRLWELv1 {
 	n := params.GetTRGSWLv1().N
-	tmp := trlwe.NewTRLWELv1()
 
+	// Get TRLWE buffer from pool for difference computation
+	tmpA, tmpB := polyEval.GetTRLWEBuffer()
 	for i := 0; i < n; i++ {
-		tmp.A[i] = in2.A[i] - in1.A[i]
-		tmp.B[i] = in2.B[i] - in1.B[i]
+		tmpA[i] = in2.A[i] - in1.A[i]
+		tmpB[i] = in2.B[i] - in1.B[i]
 	}
+	tmp := &trlwe.TRLWELv1{A: tmpA, B: tmpB}
 
+	// External product (uses internal buffers for zero-alloc in hot path)
 	tmp2 := ExternalProductWithFFT(cond, tmp, decompositionOffset, polyEval)
-	result := trlwe.NewTRLWELv1()
 
+	// Add in1 to result (reuse tmp2)
 	for i := 0; i < n; i++ {
-		result.A[i] = tmp2.A[i] + in1.A[i]
-		result.B[i] = tmp2.B[i] + in1.B[i]
+		tmp2.A[i] += in1.A[i]
+		tmp2.B[i] += in1.B[i]
 	}
 
-	return result
+	return tmp2
 }
 
-// BlindRotate performs blind rotation for bootstrapping
+// BlindRotate performs blind rotation for bootstrapping (optimized with buffer pool)
 func BlindRotate(src *tlwe.TLWELv0, blindRotateTestvec *trlwe.TRLWELv1, bootstrappingKey []*TRGSWLv1FFT, decompositionOffset params.Torus, polyEval *poly.Evaluator) *trlwe.TRLWELv1 {
 	n := params.GetTRGSWLv1().N
 	nBit := params.GetTRGSWLv1().NBIT
 
+	// Reset rotation pool for this operation
+	polyEval.ResetRotationPool()
+
 	bTilda := 2*n - ((int(src.B()) + (1 << (31 - nBit - 1))) >> (32 - nBit - 1))
-	result := &trlwe.TRLWELv1{
-		A: polyMulWithXK(blindRotateTestvec.A, bTilda),
-		B: polyMulWithXK(blindRotateTestvec.B, bTilda),
-	}
+
+	// Initial rotation using buffer pool
+	resultA := polyEval.PolyMulWithXK(blindRotateTestvec.A, bTilda)
+	resultB := polyEval.PolyMulWithXK(blindRotateTestvec.B, bTilda)
+	result := &trlwe.TRLWELv1{A: resultA, B: resultB}
 
 	tlweLv0N := params.GetTLWELv0().N
 	for i := 0; i < tlweLv0N; i++ {
 		aTilda := int((src.P[i] + (1 << (31 - nBit - 1))) >> (32 - nBit - 1))
-		res2 := &trlwe.TRLWELv1{
-			A: polyMulWithXK(result.A, aTilda),
-			B: polyMulWithXK(result.B, aTilda),
-		}
+
+		// Use buffer pool for rotation
+		res2A := polyEval.PolyMulWithXK(result.A, aTilda)
+		res2B := polyEval.PolyMulWithXK(result.B, aTilda)
+		res2 := &trlwe.TRLWELv1{A: res2A, B: res2B}
+
 		result = CMUX(result, res2, bootstrappingKey[i], decompositionOffset, polyEval)
 	}
 
 	return result
 }
 
-// BatchBlindRotate performs multiple blind rotations in parallel
+// evaluatorPool is a pool of evaluators for parallel operations
+var evaluatorPool = sync.Pool{
+	New: func() interface{} {
+		return poly.NewEvaluator(params.GetTRGSWLv1().N)
+	},
+}
+
+// BatchBlindRotate performs multiple blind rotations in parallel (zero-allocation)
 func BatchBlindRotate(srcs []*tlwe.TLWELv0, blindRotateTestvec *trlwe.TRLWELv1, bootstrappingKey []*TRGSWLv1FFT, decompositionOffset params.Torus) []*trlwe.TRLWELv1 {
 	results := make([]*trlwe.TRLWELv1, len(srcs))
 	var wg sync.WaitGroup
@@ -216,7 +240,10 @@ func BatchBlindRotate(srcs []*tlwe.TLWELv0, blindRotateTestvec *trlwe.TRLWELv1,
 		wg.Add(1)
 		go func(idx int, s *tlwe.TLWELv0) {
 			defer wg.Done()
-			polyEval := poly.NewEvaluator(params.GetTRGSWLv1().N)
+			// Get evaluator from pool (reuse instead of allocate)
+			polyEval := evaluatorPool.Get().(*poly.Evaluator)
+			defer evaluatorPool.Put(polyEval)
+
 			results[idx] = BlindRotate(s, blindRotateTestvec, bootstrappingKey, decompositionOffset, polyEval)
 		}(i, src)
 	}
@@ -225,12 +252,18 @@ func BatchBlindRotate(srcs []*tlwe.TLWELv0, blindRotateTestvec *trlwe.TRLWELv1,
 	return results
 }
 
-// polyMulWithXK multiplies a polynomial by X^k in the ring Z[X]/(X^N+1)
-func polyMulWithXK(a []params.Torus, k int) []params.Torus {
-	n := params.GetTRGSWLv1().N
-	result := make([]params.Torus, n)
+// polyMulWithXKInPlace multiplies a polynomial by X^k in-place (zero-allocation)
+func polyMulWithXKInPlace(a []params.Torus, k int, result []params.Torus) {
+	n := len(a)
+	k = k % (2 * n) // Normalize k to [0, 2N)
+
+	if k == 0 {
+		copy(result, a)
+		return
+	}
 
 	if k < n {
+		// Positive rotation: coefficients shift right, wrap with negation
 		for i := 0; i < n-k; i++ {
 			result[i+k] = a[i]
 		}
@@ -238,15 +271,15 @@ func polyMulWithXK(a []params.Torus, k int) []params.Torus {
 			result[i+k-n] = ^params.Torus(0) - a[i]
 		}
 	} else {
-		for i := 0; i < 2*n-k; i++ {
-			result[i+k-n] = ^params.Torus(0) - a[i]
+		// Rotation >= n: all coefficients get negated
+		k -= n
+		for i := 0; i < n-k; i++ {
+			result[i+k] = ^params.Torus(0) - a[i]
 		}
-		for i := 2*n - k; i < n; i++ {
-			result[i-(2*n-k)] = a[i]
+		for i := n - k; i < n; i++ {
+			result[i+k-n] = a[i]
 		}
 	}
-
-	return result
 }
 
 // IdentityKeySwitching performs identity key switching
@@ -277,12 +310,3 @@ func IdentityKeySwitching(src *tlwe.TLWELv1, keySwitchingKey []*tlwe.TLWELv0) *t
 
 	return result
 }
-
-// Helper function for power
-func pow(base, exp float64) float64 {
-	result := 1.0
-	for i := 0; i < int(exp); i++ {
-		result *= base
-	}
-	return result
-}
diff --git a/trlwe/trlwe.go b/trlwe/trlwe.go
index 60f100c..46df620 100644
--- a/trlwe/trlwe.go
+++ b/trlwe/trlwe.go
@@ -108,9 +108,10 @@ func NewTRLWELv1FFT(trlwe *TRLWELv1, plan *fft.FFTPlan) *TRLWELv1FFT {
 
 // NewTRLWELv1FFTDummy creates a dummy TRLWE Level 1 FFT ciphertext
 func NewTRLWELv1FFTDummy() *TRLWELv1FFT {
+	// FourierPoly needs 2*N for interleaved real/imaginary layout
 	return &TRLWELv1FFT{
-		A: make([]float64, params.GetTRLWELv1().N),
-		B: make([]float64, params.GetTRLWELv1().N),
+		A: make([]float64, 2*params.GetTRLWELv1().N),
+		B: make([]float64, 2*params.GetTRLWELv1().N),
 	}
 }
 
diff --git a/trlwe/trlwe_ops.go b/trlwe/trlwe_ops.go
new file mode 100644
index 0000000..131d350
--- /dev/null
+++ b/trlwe/trlwe_ops.go
@@ -0,0 +1,21 @@
+package trlwe
+
+import (
+	"github.com/thedonutfactory/go-tfhe/params"
+	"github.com/thedonutfactory/go-tfhe/tlwe"
+)
+
+// SampleExtractIndexAssign extracts a TLWE sample from TRLWE at index k and writes to output
+// Zero-allocation version
+func SampleExtractIndexAssign(trlwe *TRLWELv1, k int, output *tlwe.TLWELv1) {
+	n := params.GetTRLWELv1().N
+
+	for i := 0; i < n; i++ {
+		if i <= k {
+			output.P[i] = trlwe.A[k-i]
+		} else {
+			output.P[i] = ^params.Torus(0) - trlwe.A[n+k-i]
+		}
+	}
+	output.SetB(trlwe.B[k])
+}