diff --git a/go.mod b/go.mod index c1c3675f839dd..df2aaab901a66 100644 --- a/go.mod +++ b/go.mod @@ -92,6 +92,7 @@ require ( github.com/tidwall/pretty v1.2.1 github.com/tmc/langchaingo v0.1.13 github.com/unum-cloud/usearch/golang v0.0.0-20251010193336-541e882da5a9 + github.com/viterin/partial v1.1.0 go.starlark.net v0.0.0-20250701195324-d457b4515e0e go.uber.org/automaxprocs v1.5.3 go.uber.org/ratelimit v0.2.0 @@ -242,6 +243,9 @@ replace ( github.com/lni/dragonboat/v4 v4.0.0-20220815145555-6f622e8bcbef => github.com/matrixorigin/dragonboat/v4 v4.0.0-20251214113216-2ddf81ef2a85 github.com/lni/goutils v1.3.1-0.20220604063047-388d67b4dbc4 => github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4 github.com/lni/vfs v0.2.1-0.20220616104132-8852fd867376 => github.com/matrixorigin/vfs v0.2.1-0.20220616104132-8852fd867376 + + github.com/rapidsai/cuvs/go v0.0.0-20251126145430-91c51b1cc43d => github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6 + github.com/unum-cloud/usearch/golang v0.0.0-20251010193336-541e882da5a9 => github.com/cpegeric/usearch/golang v0.0.0-20251212130039-afde3fa5e527 ) replace github.com/shoenig/go-m1cpu => github.com/shoenig/go-m1cpu v0.1.7 diff --git a/go.sum b/go.sum index 663a3404a54b4..7431d06b9ab2a 100644 --- a/go.sum +++ b/go.sum @@ -193,8 +193,12 @@ github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8Nz github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6 h1:hn6US40835XeZRilkHLIUpWTF2RYBRXCpBLn1PPOSjg= +github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6/go.mod h1:Ju9l9IcIHZOPLO1tjN9dEYSgEPFowDPF9pM70W9nNGs= github.com/cpegeric/pdftotext-go v0.0.0-20241112123704-49cb86a3790e 
h1:tQSCiEjYPRU+AuuVR+zd+xYVOsEqX1clPhmIAM6FCHU= github.com/cpegeric/pdftotext-go v0.0.0-20241112123704-49cb86a3790e/go.mod h1:zt7uTOYu0EEeKatGaTi9JiP0I9ePHpDvjAwpfPXh/N0= +github.com/cpegeric/usearch/golang v0.0.0-20251212130039-afde3fa5e527 h1:A8qDoMLqBZRv5Fi7RC+KA9hMngZAIK4cSuL1ME3Jy7w= +github.com/cpegeric/usearch/golang v0.0.0-20251212130039-afde3fa5e527/go.mod h1:3SN8SakyyBWzb14DNZn4t5yX8dOa7ae45KpqDioi4RA= github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= @@ -710,8 +714,6 @@ github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdO github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= github.com/prometheus/procfs v0.11.1 h1:xRC8Iq1yyca5ypa9n1EZnWZkt7dwcoRPQwX/5gwaUuI= github.com/prometheus/procfs v0.11.1/go.mod h1:eesXgaPo1q7lBpVMoMy0ZOFTth9hBn4W/y0/p/ScXhY= -github.com/rapidsai/cuvs/go v0.0.0-20251126145430-91c51b1cc43d h1:oni8aAPpyR2wAj6lmMbVIdIku5fV839lJ8Dx3o0fw44= -github.com/rapidsai/cuvs/go v0.0.0-20251126145430-91c51b1cc43d/go.mod h1:qQPopaJ6Z5DXM+HqtP8TzatknrfiCE7vBf/p1+lVFr8= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= @@ -848,8 +850,6 @@ github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGr github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= -github.com/unum-cloud/usearch/golang v0.0.0-20251010193336-541e882da5a9 
h1:JrHCee+uqpF2zXooiKu7ymvKgnzlUIXtTlZ7vi21Tr0= -github.com/unum-cloud/usearch/golang v0.0.0-20251010193336-541e882da5a9/go.mod h1:NxBpQibuBBeA/V8RGbrNzVAv4OyWWL5yNao7mVz656k= github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasthttp v1.6.0/go.mod h1:FstJa9V+Pj9vQ7OJie2qMHdwemEDaDiSdBnvPM1Su9w= @@ -862,6 +862,8 @@ github.com/valyala/histogram v1.2.0/go.mod h1:Hb4kBwb4UxsaNbbbh+RRz8ZR6pdodR57tz github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio= github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8= github.com/vishvananda/netns v0.0.4/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= +github.com/viterin/partial v1.1.0 h1:iH1l1xqBlapXsYzADS1dcbizg3iQUKTU1rbwkHv/80E= +github.com/viterin/partial v1.1.0/go.mod h1:oKGAo7/wylWkJTLrWX8n+f4aDPtQMQ6VG4dd2qur5QA= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb h1:zGWFAtiMcyryUHoUjUJX0/lt1H2+i2Ka2n+D3DImSNo= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= diff --git a/pkg/common/concurrent/executor.go b/pkg/common/concurrent/executor.go new file mode 100644 index 0000000000000..6fa438d4161da --- /dev/null +++ b/pkg/common/concurrent/executor.go @@ -0,0 +1,65 @@ +// Copyright 2021 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package concurrent + +import ( + "context" + "runtime" + + "golang.org/x/sync/errgroup" +) + +type ThreadPoolExecutor struct { + nthreads int +} + +func NewThreadPoolExecutor(nthreads int) ThreadPoolExecutor { + if nthreads == 0 { + nthreads = runtime.NumCPU() + } + return ThreadPoolExecutor{nthreads: nthreads} +} + +func (e ThreadPoolExecutor) Execute( + ctx context.Context, + nitems int, + fn func(ctx context.Context, thread_id int, start, end int) error) (err error) { + + g, ctx := errgroup.WithContext(ctx) + chunksz := (nitems + e.nthreads - 1) / e.nthreads + for i := 0; i < e.nthreads; i++ { + + start := i * chunksz + if start >= nitems { + break + } + + end := start + chunksz + if end > nitems { + end = nitems + } + + thread_id := i + g.Go(func() error { + if err2 := fn(ctx, thread_id, start, end); err2 != nil { + return err2 + } + + return nil + }) + } + + return g.Wait() +} diff --git a/pkg/common/concurrent/executor_test.go b/pkg/common/concurrent/executor_test.go new file mode 100644 index 0000000000000..fd748bc953437 --- /dev/null +++ b/pkg/common/concurrent/executor_test.go @@ -0,0 +1,59 @@ +// Copyright 2021 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package concurrent + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestExecutor(t *testing.T) { + + ctx := context.Background() + nthreads := 3 + vec := make([]int, 1024) + answer := 0 + for i := range vec { + vec[i] = i + answer += i + } + + e := NewThreadPoolExecutor(nthreads) + + r := make([]int, nthreads) + + err := e.Execute(ctx, len(vec), func(ctx context.Context, thread_id int, start, end int) error { + subSlice := vec[start:end] + for j := range subSlice { + if j%100 == 0 && ctx.Err() != nil { + return ctx.Err() + } + + r[thread_id] += subSlice[j] + } + return nil + }) + + require.NoError(t, err) + + sum := 0 + for _, v := range r { + sum += v + } + + require.Equal(t, sum, answer) +} diff --git a/pkg/container/types/types.go b/pkg/container/types/types.go index 275cc93cfa25a..d1379969157eb 100644 --- a/pkg/container/types/types.go +++ b/pkg/container/types/types.go @@ -565,6 +565,16 @@ func (t Type) DescString() string { return t.Oid.String() } +func (t Type) GetArrayElementSize() int { + switch t.Oid { + case T_array_float32: + return 4 + case T_array_float64: + return 8 + } + panic(moerr.NewInternalErrorNoCtx(fmt.Sprintf("unknown array type %d", t))) +} + func (t Type) Eq(b Type) bool { switch t.Oid { // XXX need to find out why these types have different size/width diff --git a/pkg/sql/colexec/productl2/product_l2.go b/pkg/sql/colexec/productl2/product_l2.go index ad40bdf674c4f..c9038cf2ae3d1 100644 --- a/pkg/sql/colexec/productl2/product_l2.go +++ b/pkg/sql/colexec/productl2/product_l2.go @@ 
-18,14 +18,17 @@ import ( "bytes" "runtime" "strings" - "sync" "time" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/batch" "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/container/vector" + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/brute_force" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" "github.com/matrixorigin/matrixone/pkg/vm" "github.com/matrixorigin/matrixone/pkg/vm/message" "github.com/matrixorigin/matrixone/pkg/vm/process" @@ -65,6 +68,7 @@ func (productl2 *Productl2) Call(proc *process.Process) (vm.CallResult, error) { ctr := &ap.ctr result := vm.NewCallResult() var err error + for { switch ctr.state { case Build: @@ -120,6 +124,47 @@ func (productl2 *Productl2) Call(proc *process.Process) (vm.CallResult, error) { return result, nil } } + +} + +func NewNullVector[T types.RealNumbers](dim int32) []T { + // null vector with magnitude 1 + nullvec := make([]T, dim) + nullvec[0] = 1 + return nullvec +} + +func getIndex[T types.RealNumbers](ap *Productl2, proc *process.Process, analyzer process.Analyzer) (cache.VectorIndexSearchIf, error) { + ctr := &ap.ctr + buildCount := ctr.bat.RowCount() + centroidColPos := ap.OnExpr.GetF().GetArgs()[0].GetCol().GetColPos() + + dim := ctr.bat.Vecs[centroidColPos].GetType().Width + elemSize := uint(ctr.bat.Vecs[centroidColPos].GetType().GetArrayElementSize()) + centers := make([][]T, buildCount) + nullvec := NewNullVector[T](dim) + + for i := 0; i < buildCount; i++ { + if ctr.bat.Vecs[centroidColPos].IsNull(uint64(i)) { + centers[i] = nullvec + continue + } + + c := types.BytesToArray[T](ctr.bat.Vecs[centroidColPos].GetBytesAt(i)) + centers[i] = c + } + + algo, err := brute_force.NewBruteForceIndex[T](centers, uint(dim), 
ctr.metrictype, elemSize) + if err != nil { + return nil, err + } + + err = algo.Load(sqlexec.NewSqlProcess(proc)) + if err != nil { + return nil, err + } + + return algo, nil } func (productl2 *Productl2) build(proc *process.Process, analyzer process.Analyzer) error { @@ -142,6 +187,21 @@ func (productl2 *Productl2) build(proc *process.Process, analyzer process.Analyz } } mp.Free() + + centroidColPos := productl2.OnExpr.GetF().GetArgs()[0].GetCol().GetColPos() + switch ctr.bat.Vecs[centroidColPos].GetType().Oid { + case types.T_array_float32: + ctr.brute_force, err = getIndex[float32](productl2, proc, analyzer) + if err != nil { + return err + } + case types.T_array_float64: + ctr.brute_force, err = getIndex[float64](productl2, proc, analyzer) + if err != nil { + return err + } + } + return nil } @@ -160,56 +220,33 @@ func (productl2 *Productl2) build(proc *process.Process, analyzer process.Analyz // } //) -func newMat[T types.RealNumbers](ctr *container, ap *Productl2) ([][]T, [][]T) { - buildCount := ctr.bat.RowCount() +func newMat[T types.RealNumbers](ctr *container, ap *Productl2) ([][]T, error) { probeCount := ctr.inBat.RowCount() - centroidColPos := ap.OnExpr.GetF().GetArgs()[0].GetCol().GetColPos() tblColPos := ap.OnExpr.GetF().GetArgs()[1].GetCol().GetColPos() - centroidmat := make([][]T, buildCount) - for i := 0; i < buildCount; i++ { - switch ctr.bat.Vecs[centroidColPos].GetType().Oid { - case types.T_array_float32: - if ctr.bat.Vecs[centroidColPos].IsNull(uint64(i)) { - centroidmat[i] = nil - continue - } - centroidmat[i] = types.BytesToArray[T](ctr.bat.Vecs[centroidColPos].GetBytesAt(i)) - case types.T_array_float64: - if ctr.bat.Vecs[centroidColPos].IsNull(uint64(i)) { - centroidmat[i] = nil - continue - } - centroidmat[i] = types.BytesToArray[T](ctr.bat.Vecs[centroidColPos].GetBytesAt(i)) - } - } + + // dimension can only get from centroid column. probe column input values can be null and dimension is 0. 
+ centroidColPos := ap.OnExpr.GetF().GetArgs()[0].GetCol().GetColPos() + dim := ctr.bat.Vecs[centroidColPos].GetType().Width + nullvec := NewNullVector[T](dim) // embedding mat - embedmat := make([][]T, probeCount) + probes := make([][]T, probeCount) for j := 0; j < probeCount; j++ { - switch ctr.bat.Vecs[centroidColPos].GetType().Oid { - case types.T_array_float32: - if ctr.inBat.Vecs[tblColPos].IsNull(uint64(j)) { - embedmat[j] = nil - continue - } - embedmat[j] = types.BytesToArray[T](ctr.inBat.Vecs[tblColPos].GetBytesAt(j)) - case types.T_array_float64: - if ctr.inBat.Vecs[tblColPos].IsNull(uint64(j)) { - embedmat[j] = nil - continue - } - embedmat[j] = types.BytesToArray[T](ctr.inBat.Vecs[tblColPos].GetBytesAt(j)) + if ctr.inBat.Vecs[tblColPos].IsNull(uint64(j)) { + probes[j] = nullvec + continue } - + v := types.BytesToArray[T](ctr.inBat.Vecs[tblColPos].GetBytesAt(j)) + probes[j] = v } - return centroidmat, embedmat + return probes, nil } func (ctr *container) probe(ap *Productl2, proc *process.Process, result *vm.CallResult) error { - centroidColPos := ap.OnExpr.GetF().GetArgs()[0].GetCol().GetColPos() - switch ctr.bat.Vecs[centroidColPos].GetType().Oid { + tblColPos := ap.OnExpr.GetF().GetArgs()[1].GetCol().GetColPos() + switch ctr.inBat.Vecs[tblColPos].GetType().Oid { case types.T_array_float32: return probeRun[float32](ctr, ap, proc, result) case types.T_array_float64: @@ -218,9 +255,16 @@ func (ctr *container) probe(ap *Productl2, proc *process.Process, result *vm.Cal return nil } +func (ctr *container) release() { + if ctr.brute_force != nil { + ctr.brute_force.Destroy() + ctr.brute_force = nil + } +} + func probeRun[T types.RealNumbers](ctr *container, ap *Productl2, proc *process.Process, result *vm.CallResult) error { - buildCount := ctr.bat.RowCount() probeCount := ctr.inBat.RowCount() + tblColPos := ap.OnExpr.GetF().GetArgs()[1].GetCol().GetColPos() ncpu := runtime.NumCPU() if probeCount < ncpu { @@ -234,85 +278,40 @@ func probeRun[T 
types.RealNumbers](ctr *container, ap *Productl2, proc *process. } } - leastClusterIndex := make([]int, probeCount) - leastDistance := make([]T, probeCount) - - for i := 0; i < probeCount; i++ { - leastClusterIndex[i] = 0 - leastDistance[i] = metric.MaxFloat[T]() - } - - centroidmat, embedmat := newMat[T](ctr, ap) - distfn, err := metric.ResolveDistanceFn[T](ctr.metrictype) + probes, err := newMat[T](ctr, ap) if err != nil { - return moerr.NewInternalError(proc.Ctx, "ProductL2: failed to get distance function") + return err } - errs := make(chan error, ncpu) - var mutex sync.Mutex - var wg sync.WaitGroup + rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: uint(ncpu)} - for n := 0; n < ncpu; n++ { + anykeys, distances, err := ctr.brute_force.Search(sqlexec.NewSqlProcess(proc), probes, rt) + if err != nil { + return err + } + _ = distances - wg.Add(1) - go func(tid int) { - defer wg.Done() - for j := 0; j < probeCount; j++ { + leastClusterIndex := anykeys.([]int64) - if j%ncpu != tid { - continue - } + //os.Stderr.WriteString(fmt.Sprintf("keys %v\n", keys)) + //os.Stderr.WriteString(fmt.Sprintf("distances %v\n", distances)) - // for each row in probe table, - // find the nearest cluster center from the build table. 
- for i := 0; i < buildCount; i++ { + for j := 0; j < probeCount; j++ { - if embedmat[j] == nil || centroidmat[i] == nil { - leastDistance[j] = 0 - leastClusterIndex[j] = i - } else { - dist, err := distfn(centroidmat[i], embedmat[j]) - if err != nil { - errs <- err - return - } - if dist < leastDistance[j] { - leastDistance[j] = dist - leastClusterIndex[j] = i - } - } + if ctr.inBat.Vecs[tblColPos].IsNull(uint64(j)) { + leastClusterIndex[j] = 0 + } + for k, rp := range ap.Result { + if rp.Rel == 0 { + if err := ctr.rbat.Vecs[k].UnionOne(ctr.inBat.Vecs[rp.Pos], int64(j), proc.Mp()); err != nil { + return err } - - err := func() error { - mutex.Lock() - defer mutex.Unlock() - for k, rp := range ap.Result { - if rp.Rel == 0 { - if err := ctr.rbat.Vecs[k].UnionOne(ctr.inBat.Vecs[rp.Pos], int64(j), proc.Mp()); err != nil { - return err - } - } else { - if err := ctr.rbat.Vecs[k].UnionOne(ctr.bat.Vecs[rp.Pos], int64(leastClusterIndex[j]), proc.Mp()); err != nil { - return err - } - } - } - - return nil - }() - - if err != nil { - errs <- err - return + } else { + if err := ctr.rbat.Vecs[k].UnionOne(ctr.bat.Vecs[rp.Pos], int64(leastClusterIndex[j]), proc.Mp()); err != nil { + return err } } - }(n) - } - - wg.Wait() - - if len(errs) > 0 { - return <-errs + } } // ctr.rbat.AddRowCount(count * count2) diff --git a/pkg/sql/colexec/productl2/types.go b/pkg/sql/colexec/productl2/types.go index 621fb5ce7b558..6effcf0a7d824 100644 --- a/pkg/sql/colexec/productl2/types.go +++ b/pkg/sql/colexec/productl2/types.go @@ -20,7 +20,8 @@ import ( "github.com/matrixorigin/matrixone/pkg/container/batch" "github.com/matrixorigin/matrixone/pkg/pb/plan" "github.com/matrixorigin/matrixone/pkg/sql/colexec" - vmetric "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" "github.com/matrixorigin/matrixone/pkg/vm" "github.com/matrixorigin/matrixone/pkg/vm/process" ) @@ 
-34,11 +35,12 @@ const ( ) type container struct { - state int - bat *batch.Batch // build batch - rbat *batch.Batch - inBat *batch.Batch // probe batch - metrictype vmetric.MetricType + state int + bat *batch.Batch // build batch + rbat *batch.Batch + inBat *batch.Batch // probe batch + metrictype metric.MetricType + brute_force cache.VectorIndexSearchIf // brute_force.BruteForceIndex } type Productl2 struct { @@ -89,11 +91,13 @@ func (productl2 *Productl2) Reset(proc *process.Process, pipelineFailed bool, er if productl2.ctr.rbat != nil { productl2.ctr.rbat.CleanOnlyData() } + productl2.ctr.release() productl2.ctr.inBat = nil productl2.ctr.state = Build } func (productl2 *Productl2) Free(proc *process.Process, pipelineFailed bool, err error) { + productl2.ctr.release() productl2.ctr.cleanBatch(proc.Mp()) } diff --git a/pkg/sql/colexec/table_function/ivf_create.go b/pkg/sql/colexec/table_function/ivf_create.go index 4795163ccaa53..68bad062bd5dd 100644 --- a/pkg/sql/colexec/table_function/ivf_create.go +++ b/pkg/sql/colexec/table_function/ivf_create.go @@ -97,7 +97,7 @@ func clustering[T types.RealNumbers](u *ivfCreateState, tf *TableFunction, proc } defer clusterer.Close() - anycenters, err := clusterer.Cluster() + anycenters, err := clusterer.Cluster(proc.Ctx) if err != nil { return err } diff --git a/pkg/sql/plan/function/func_binary_test.go b/pkg/sql/plan/function/func_binary_test.go index a58e0cb217362..1685dd49291a4 100644 --- a/pkg/sql/plan/function/func_binary_test.go +++ b/pkg/sql/plan/function/func_binary_test.go @@ -5083,7 +5083,7 @@ func initCosineDistanceArrayTestCase() []tcTemp { NewFunctionTestInput(types.T_array_float32.ToType(), [][]float32{{10, 20, 30}, {5, 6, 7}}, []bool{false, false}), }, expect: NewFunctionTestResult(types.T_float64.ToType(), false, - []float64{0, 0.00035416888764172594}, + []float64{0, 0.0003542540071066469}, []bool{false, false}), }, { diff --git a/pkg/vectorindex/brute_force/brute_force.go 
b/pkg/vectorindex/brute_force/brute_force.go new file mode 100644 index 0000000000000..9ea24fd84a199 --- /dev/null +++ b/pkg/vectorindex/brute_force/brute_force.go @@ -0,0 +1,293 @@ +// Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package brute_force + +import ( + "context" + "fmt" + "runtime" + "slices" + + "github.com/matrixorigin/matrixone/pkg/common/concurrent" + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/common/util" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" + usearch "github.com/unum-cloud/usearch/golang" + "github.com/viterin/partial" +) + +type UsearchBruteForceIndex[T types.RealNumbers] struct { + Dataset []T // flattend vector + Metric usearch.Metric + Dimension uint + Count uint + Quantization usearch.Quantization + ElementSize uint +} + +type GoBruteForceIndex[T types.RealNumbers] struct { + Dataset [][]T // flattend vector + Metric metric.MetricType + Dimension uint + Count uint +} + +var _ cache.VectorIndexSearchIf = &UsearchBruteForceIndex[float32]{} +var _ cache.VectorIndexSearchIf = &GoBruteForceIndex[float32]{} + +func GetUsearchQuantizationFromType(v any) (usearch.Quantization, error) { + switch v.(type) { 
+ case float32: + return usearch.F32, nil + case float64: + return usearch.F64, nil + default: + return 0, moerr.NewInternalErrorNoCtx(fmt.Sprintf("usearch not support type %T", v)) + } +} + +func NewCpuBruteForceIndex[T types.RealNumbers](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + + switch m { + case metric.Metric_L1Distance: + return NewGoBruteForceIndex(dataset, dimension, m, elemsz) + default: + return NewUsearchBruteForceIndex(dataset, dimension, m, elemsz) + } +} + +func NewGoBruteForceIndex[T types.RealNumbers](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + + idx := &GoBruteForceIndex[T]{} + idx.Metric = m + idx.Dimension = dimension + idx.Count = uint(len(dataset)) + idx.Dataset = dataset + return idx, nil +} + +func NewUsearchBruteForceIndex[T types.RealNumbers](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + var err error + + idx := &UsearchBruteForceIndex[T]{} + idx.Metric = metric.MetricTypeToUsearchMetric[m] + idx.Quantization, err = GetUsearchQuantizationFromType(T(0)) + if err != nil { + return nil, err + } + idx.Dimension = dimension + idx.Count = uint(len(dataset)) + idx.ElementSize = elemsz + + idx.Dataset = make([]T, idx.Count*idx.Dimension) + for i := 0; i < len(dataset); i++ { + offset := i * int(dimension) + copy(idx.Dataset[offset:], dataset[i]) + } + + return idx, nil +} + +func (idx *UsearchBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) error { + return nil +} + +func (idx *UsearchBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (keys any, distances []float64, err error) { + queries, ok := _queries.([][]T) + if !ok { + return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") + } + + var flatten []T + if len(queries) == 1 { + flatten = queries[0] + } else { + flatten = make([]T, 
len(queries)*int(idx.Dimension)) + for i := 0; i < len(queries); i++ { + offset := i * int(idx.Dimension) + copy(flatten[offset:], queries[i]) + } + } + //fmt.Printf("flattened %v\n", flatten) + + // limit must be less than idx.Count + limit := rt.Limit + if limit > idx.Count { + limit = idx.Count + } + + keys_ui64, distances_f32, err := usearch.ExactSearchUnsafe( + util.UnsafePointer(&(idx.Dataset[0])), + util.UnsafePointer(&(flatten[0])), + uint(idx.Count), + uint(len(queries)), + idx.Dimension*idx.ElementSize, + idx.Dimension*idx.ElementSize, + idx.Dimension, + idx.Metric, + idx.Quantization, + limit, + rt.NThreads) + + if err != nil { + return + } + + distances = make([]float64, len(distances_f32)) + for i, dist := range distances_f32 { + distances[i] = float64(dist) + } + + keys_i64 := make([]int64, len(keys_ui64)) + for i, key := range keys_ui64 { + keys_i64[i] = int64(key) + } + keys = keys_i64 + + runtime.KeepAlive(flatten) + runtime.KeepAlive(idx.Dataset) + return +} + +func (idx *UsearchBruteForceIndex[T]) UpdateConfig(sif cache.VectorIndexSearchIf) error { + return nil +} + +func (idx *UsearchBruteForceIndex[T]) Destroy() { +} + +func (idx *GoBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) error { + return nil +} + +func (idx *GoBruteForceIndex[T]) UpdateConfig(sif cache.VectorIndexSearchIf) error { + return nil +} + +func (idx *GoBruteForceIndex[T]) Destroy() { +} + +func (idx *GoBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (keys any, distances []float64, err error) { + queries, ok := _queries.([][]T) + if !ok { + return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") + } + + distfn, err := metric.ResolveDistanceFn[T](idx.Metric) + if err != nil { + return nil, nil, err + } + + nthreads := rt.NThreads + + // datasize * nqueries + nqueries := len(queries) + ndataset := len(idx.Dataset) + + // create distance matric + results := make([][]vectorindex.SearchResult, nqueries) + for i := 
range results { + results[i] = make([]vectorindex.SearchResult, ndataset) + } + + exec := concurrent.NewThreadPoolExecutor(int(nthreads)) + err = exec.Execute( + proc.GetContext(), + nqueries, + func(ctx context.Context, thread_id int, start, end int) (err2 error) { + subqueries := queries[start:end] + subresults := results[start:end] + for k, q := range subqueries { + if k%100 == 0 && ctx.Err() != nil { + return ctx.Err() + } + + for j := range idx.Dataset { + dist, err2 := distfn(q, idx.Dataset[j]) + if err2 != nil { + return err2 + } + subresults[k][j].Id = int64(j) + subresults[k][j].Distance = float64(dist) + } + } + return + }) + + if err != nil { + return nil, nil, err + } + + cmpfn := func(a, b vectorindex.SearchResult) int { + if a.Distance < b.Distance { + return -1 + } else if a.Distance == b.Distance { + return 0 + } + return 1 + } + + // get min + keys64 := make([]int64, nqueries*int(rt.Limit)) + distances = make([]float64, nqueries*int(rt.Limit)) + err = exec.Execute( + proc.GetContext(), + nqueries, + func(ctx context.Context, thread_id int, start, end int) (err2 error) { + subresults := results[start:end] + for j := range subresults { + if j%100 == 0 && ctx.Err() != nil { + return ctx.Err() + } + + if rt.Limit == 1 { + // min + first := slices.MinFunc(subresults[j], cmpfn) + subresults[j][0] = first + + } else { + // partial sort + partial.SortFunc(subresults[j], int(rt.Limit), cmpfn) + + } + } + return + }) + if err != nil { + return nil, nil, err + } + + for i := 0; i < nqueries; i++ { + for j := 0; j < int(rt.Limit); j++ { + keys64[i*int(rt.Limit)+j] = results[i][j].Id + distances[i*int(rt.Limit)+j] = results[i][j].Distance + } + } + + return keys64, distances, nil +} diff --git a/pkg/vectorindex/brute_force/brute_force_test.go b/pkg/vectorindex/brute_force/brute_force_test.go new file mode 100644 index 0000000000000..21cf130271463 --- /dev/null +++ b/pkg/vectorindex/brute_force/brute_force_test.go @@ -0,0 +1,153 @@ +//go:build !gpu + +// 
Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package brute_force + +import ( + "fmt" + "math/rand/v2" + "testing" + + "github.com/matrixorigin/matrixone/pkg/common/mpool" + "github.com/matrixorigin/matrixone/pkg/testutil" + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" + "github.com/stretchr/testify/require" +) + +func TestBruteForce(t *testing.T) { + + m := mpool.MustNewZero() + proc := testutil.NewProcessWithMPool(t, "", m) + sqlproc := sqlexec.NewSqlProcess(proc) + + dataset := [][]float32{{1, 2, 3}, {3, 4, 5}} + query := [][]float32{{1, 2, 3}, {3, 4, 5}} + dimension := uint(3) + ncpu := uint(1) + limit := uint(2) + elemsz := uint(4) // float32 + + idx, err := NewUsearchBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + require.NoError(t, err) + + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + + keys, distances, err := idx.Search(sqlproc, query, rt) + require.NoError(t, err) + fmt.Printf("keys %v, dist %v\n", keys, distances) + +} + +func TestGoBruteForce(t *testing.T) { + + m := mpool.MustNewZero() + proc := testutil.NewProcessWithMPool(t, "", m) + sqlproc := sqlexec.NewSqlProcess(proc) + + dataset := [][]float32{{1, 2, 3}, {3, 4, 5}} + query := [][]float32{{1, 2, 3}, {3, 4, 5}} + 
dimension := uint(3) + ncpu := uint(1) + limit := uint(2) + elemsz := uint(4) // float32 + + idx, err := NewGoBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + require.NoError(t, err) + + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + + keys, distances, err := idx.Search(sqlproc, query, rt) + require.NoError(t, err) + fmt.Printf("keys %v, dist %v\n", keys, distances) + +} + +func runBruteForceConcurrent(t *testing.T, is_usearch bool) { + + m := mpool.MustNewZero() + proc := testutil.NewProcessWithMPool(t, "", m) + sqlproc := sqlexec.NewSqlProcess(proc) + dimension := uint(128) + ncpu := uint(4) + limit := uint(3) + elemsz := uint(4) // float32 + + dsize := 10000 + dataset := make([][]float32, dsize) + for i := range dataset { + dataset[i] = make([]float32, dimension) + for j := range dataset[i] { + dataset[i][j] = rand.Float32() + } + } + + query := dataset + + var idx cache.VectorIndexSearchIf + var err error + if is_usearch { + idx, err = NewUsearchBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + } else { + idx, err = NewGoBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + } + require.NoError(t, err) + + // limit 3 + { + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + + anykeys, distances, err := idx.Search(sqlproc, query, rt) + require.NoError(t, err) + + keys := anykeys.([]int64) + // fmt.Printf("keys %v, dist %v\n", keys, distances) + require.Equal(t, int(rt.Limit)*len(query), len(keys)) + for i := range query { + offset := i * int(rt.Limit) + require.Equal(t, int64(i), keys[offset]) + require.Equal(t, float64(0), distances[offset]) + } + } + + // limit 1 + { + rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: ncpu} + + anykeys, distances, err := idx.Search(sqlproc, query, rt) + require.NoError(t, err) + + keys := anykeys.([]int64) + // fmt.Printf("keys %v, dist %v\n", keys, distances) + require.Equal(t, 
int(rt.Limit)*len(query), len(keys)) + for i := range query { + offset := i * int(rt.Limit) + require.Equal(t, int64(i), keys[offset]) + require.Equal(t, float64(0), distances[offset]) + } + } + +} + +func TestGoBruteForceConcurrent(t *testing.T) { + runBruteForceConcurrent(t, false) +} + +func TestUsearchBruteForceConcurrent(t *testing.T) { + runBruteForceConcurrent(t, true) +} diff --git a/pkg/vectorindex/brute_force/cpu.go b/pkg/vectorindex/brute_force/cpu.go new file mode 100644 index 0000000000000..b60f8e5b68a4b --- /dev/null +++ b/pkg/vectorindex/brute_force/cpu.go @@ -0,0 +1,31 @@ +//go:build !gpu + +// Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package brute_force + +import ( + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" +) + +func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + + return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) +} diff --git a/pkg/vectorindex/brute_force/gpu.go b/pkg/vectorindex/brute_force/gpu.go new file mode 100644 index 0000000000000..029c32ef152a1 --- /dev/null +++ b/pkg/vectorindex/brute_force/gpu.go @@ -0,0 +1,200 @@ +//go:build gpu + +// Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package brute_force + +import ( + // "fmt" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" + cuvs "github.com/rapidsai/cuvs/go" + "github.com/rapidsai/cuvs/go/brute_force" +) + +type GpuBruteForceIndex[T cuvs.TensorNumberType] struct { + Resource *cuvs.Resource // shared resource for read-only index + Dataset *cuvs.Tensor[T] + Index *brute_force.BruteForceIndex + Metric cuvs.Distance + Dimension uint + Count uint + ElementSize uint +} + +var _ cache.VectorIndexSearchIf = &GpuBruteForceIndex[float32]{} + +// cuvs library has bug. comment out the GPU version until cuvs fix the bug +func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + + switch dset := any(dataset).(type) { + case [][]float64: + return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) + case [][]float32: + return NewCpuBruteForceIndex[float32](dset, dimension, m, elemsz) + //return NewGpuBruteForceIndex[float32](dset, dimension, m, elemsz) + default: + return nil, moerr.NewInternalErrorNoCtx("type not supported for BruteForceIndex") + } + +} + +func NewGpuBruteForceIndex[T cuvs.TensorNumberType](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + + idx := &GpuBruteForceIndex[T]{} + resource, _ := cuvs.NewResource(nil) + idx.Resource = &resource + tensor, err := cuvs.NewTensor(dataset) + if err != nil { + return nil, err + } + idx.Dataset = &tensor + idx.Metric = metric.MetricTypeToCuvsMetric[m] + idx.Dimension = dimension + idx.Count = uint(len(dataset)) + + idx.ElementSize = elemsz + return idx, nil + +} + +func (idx 
*GpuBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) (err error) { + if _, err = idx.Dataset.ToDevice(idx.Resource); err != nil { + return err + } + + idx.Index, err = brute_force.CreateIndex() + if err != nil { + return + } + + err = brute_force.BuildIndex[T](*idx.Resource, idx.Dataset, idx.Metric, 0, idx.Index) + if err != nil { + return + } + + if err = idx.Resource.Sync(); err != nil { + return + } + + return +} + +func (idx *GpuBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (retkeys any, retdistances []float64, err error) { + queriesvec, ok := _queries.([][]T) + if !ok { + return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") + } + + // local resource for concurrent search + resource, err := cuvs.NewResource(nil) + if err != nil { + return nil, nil, err + } + defer resource.Close() + + queries, err := cuvs.NewTensor(queriesvec) + if err != nil { + return nil, nil, err + } + defer queries.Close() + + neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, []int64{int64(len(queriesvec)), int64(rt.Limit)}) + if err != nil { + return nil, nil, err + } + defer neighbors.Close() + + distances, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(len(queriesvec)), int64(rt.Limit)}) + if err != nil { + return nil, nil, err + } + defer distances.Close() + + if _, err = queries.ToDevice(&resource); err != nil { + return nil, nil, err + } + + err = brute_force.SearchIndex(resource, *idx.Index, &queries, &neighbors, &distances) + if err != nil { + return nil, nil, err + } + + if _, err = neighbors.ToHost(&resource); err != nil { + return nil, nil, err + } + + if _, err = distances.ToHost(&resource); err != nil { + return nil, nil, err + } + + if err = resource.Sync(); err != nil { + return nil, nil, err + } + + neighborsSlice, err := neighbors.Slice() + if err != nil { + return nil, nil, err + } + + distancesSlice, err := distances.Slice() + if err != nil { + return nil, nil, err + } + + 
//fmt.Printf("flattened %v\n", flatten) + retdistances = make([]float64, len(distancesSlice)*int(rt.Limit)) + for i := range distancesSlice { + for j, dist := range distancesSlice[i] { + retdistances[i*int(rt.Limit)+j] = float64(dist) + } + } + + keys := make([]int64, len(neighborsSlice)*int(rt.Limit)) + for i := range neighborsSlice { + for j, key := range neighborsSlice[i] { + keys[i*int(rt.Limit)+j] = int64(key) + } + } + retkeys = keys + return +} + +func (idx *GpuBruteForceIndex[T]) UpdateConfig(sif cache.VectorIndexSearchIf) error { + return nil +} + +func (idx *GpuBruteForceIndex[T]) Destroy() { + if idx.Dataset != nil { + idx.Dataset.Close() + } + if idx.Resource != nil { + idx.Resource.Close() + } + if idx.Index != nil { + idx.Index.Close() + } +} diff --git a/pkg/vectorindex/brute_force/gpu_test.go b/pkg/vectorindex/brute_force/gpu_test.go new file mode 100644 index 0000000000000..d9b024f5444cd --- /dev/null +++ b/pkg/vectorindex/brute_force/gpu_test.go @@ -0,0 +1,139 @@ +//go:build gpu + +// Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package brute_force + +import ( + //"fmt" + "math/rand/v2" + "sync" + "testing" + + "github.com/matrixorigin/matrixone/pkg/common/mpool" + "github.com/matrixorigin/matrixone/pkg/testutil" + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" + "github.com/stretchr/testify/require" +) + +func TestGpuBruteForce(t *testing.T) { + + dataset := [][]float32{{1, 2, 3}, {3, 4, 5}} + query := [][]float32{{1, 2, 3}, {3, 4, 5}} + dimension := uint(3) + ncpu := uint(1) + limit := uint(1) + elemsz := uint(4) // float32 + + idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + require.NoError(t, err) + defer idx.Destroy() + + err = idx.Load(nil) + require.NoError(t, err) + + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + + var wg sync.WaitGroup + + for n := 0; n < 4; n++ { + + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 1000; i++ { + keys, distances, err := idx.Search(nil, query, rt) + require.NoError(t, err) + + keys_i64, ok := keys.([]int64) + require.Equal(t, ok, true) + + for j, key := range keys_i64 { + require.Equal(t, key, int64(j)) + require.Equal(t, distances[j], float64(0)) + } + // fmt.Printf("keys %v, dist %v\n", keys, distances) + } + }() + } + + wg.Wait() + +} + +func TestGpuBruteForceConcurrent(t *testing.T) { + + m := mpool.MustNewZero() + proc := testutil.NewProcessWithMPool(t, "", m) + sqlproc := sqlexec.NewSqlProcess(proc) + dimension := uint(128) + ncpu := uint(4) + limit := uint(3) + elemsz := uint(4) // float32 + + dsize := 10000 + dataset := make([][]float32, dsize) + for i := range dataset { + dataset[i] = make([]float32, dimension) + for j := range dataset[i] { + dataset[i][j] = rand.Float32() + } + } + + query := dataset + + idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + require.NoError(t, err) + 
defer idx.Destroy() + + err = idx.Load(nil) + require.NoError(t, err) + + // limit 3 + { + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + + anykeys, distances, err := idx.Search(sqlproc, query, rt) + require.NoError(t, err) + + keys := anykeys.([]int64) + // fmt.Printf("keys %v, dist %v\n", keys, distances) + require.Equal(t, int(rt.Limit)*len(query), len(keys)) + for i := range query { + offset := i * int(rt.Limit) + require.Equal(t, int64(i), keys[offset]) + require.Equal(t, float64(0), distances[offset]) + } + } + + // limit 1 + { + rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: ncpu} + + anykeys, distances, err := idx.Search(sqlproc, query, rt) + require.NoError(t, err) + + keys := anykeys.([]int64) + // fmt.Printf("keys %v, dist %v\n", keys, distances) + require.Equal(t, int(rt.Limit)*len(query), len(keys)) + for i := range query { + offset := i * int(rt.Limit) + require.Equal(t, int64(i), keys[offset]) + require.Equal(t, float64(0), distances[offset]) + } + } +} diff --git a/pkg/vectorindex/hnsw/build_test.go b/pkg/vectorindex/hnsw/build_test.go index ef820eee55775..dfd379e7856f8 100644 --- a/pkg/vectorindex/hnsw/build_test.go +++ b/pkg/vectorindex/hnsw/build_test.go @@ -140,7 +140,7 @@ func TestBuildMulti(t *testing.T) { defer wg2.Done() for i := 0; i < nitem; i++ { key := int64(tid*nitem + i) - anykeys, distances, err := search.Search(nil, sample[key], vectorindex.RuntimeConfig{Limit: 10}) + anykeys, distances, err := search.Search(sqlproc, sample[key], vectorindex.RuntimeConfig{Limit: 10}) require.Nil(t, err) keys, ok := anykeys.([]int64) require.True(t, ok) @@ -323,7 +323,7 @@ func runBuildSingleThread[T types.RealNumbers](t *testing.T) { defer wg2.Done() for i := 0; i < nitem; i++ { key := int64(tid*nitem + i) - anykeys, distances, err := search.Search(nil, sample[key], vectorindex.RuntimeConfig{Limit: 10}) + anykeys, distances, err := search.Search(sqlproc, sample[key], vectorindex.RuntimeConfig{Limit: 10}) require.Nil(t, err) 
keys, ok := anykeys.([]int64) require.True(t, ok) diff --git a/pkg/vectorindex/hnsw/search.go b/pkg/vectorindex/hnsw/search.go index 62f450b410f66..e838dd2f334d0 100644 --- a/pkg/vectorindex/hnsw/search.go +++ b/pkg/vectorindex/hnsw/search.go @@ -15,11 +15,12 @@ package hnsw import ( - "errors" + "context" "fmt" "sync" "sync/atomic" + "github.com/matrixorigin/matrixone/pkg/common/concurrent" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/container/vector" @@ -87,39 +88,35 @@ func (s *HnswSearch[T]) Search(sqlproc *sqlexec.SqlProcess, anyquery any, rt vec // search size := len(s.Indexes) * int(limit) heap := vectorindex.NewSearchResultSafeHeap(size) - var wg sync.WaitGroup - - var errs error nthread := int(vectorindex.GetConcurrency(0)) if nthread > len(s.Indexes) { nthread = len(s.Indexes) } - for i := 0; i < nthread; i++ { - wg.Add(1) - go func(tid int) { - defer wg.Done() - for j, idx := range s.Indexes { - if j%nthread == tid { - keys, distances, err := idx.Search(query, limit) - if err != nil { - errs = errors.Join(errs, err) - return - } - - for k := range keys { - heap.Push(&vectorindex.SearchResult{Id: int64(keys[k]), Distance: float64(distances[k])}) - } + exec := concurrent.NewThreadPoolExecutor(nthread) + err = exec.Execute(sqlproc.GetContext(), + len(s.Indexes), + func(ctx context.Context, thread_id int, start, end int) (err2 error) { + subindex := s.Indexes[start:end] + for j := range subindex { + if ctx.Err() != nil { + return ctx.Err() } - } - }(i) - } - wg.Wait() + keys, distances, err2 := subindex[j].Search(query, limit) + if err2 != nil { + return err2 + } - if errs != nil { - return nil, nil, errs + for k := range keys { + heap.Push(&vectorindex.SearchResult{Id: int64(keys[k]), Distance: float64(distances[k])}) + } + } + return + }) + if err != nil { + return nil, nil, err } reskeys := make([]int64, 0, limit) diff --git 
a/pkg/vectorindex/ivfflat/kmeans/device/gpu.go b/pkg/vectorindex/ivfflat/kmeans/device/gpu.go index 59bbca5cc18e6..ed7eecfd58cf9 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/gpu.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/gpu.go @@ -17,6 +17,11 @@ package device import ( + //"os" + + "context" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/elkans" @@ -26,57 +31,64 @@ import ( ) type GpuClusterer[T cuvs.TensorNumberType] struct { - resource *cuvs.Resource - index *ivf_flat.IvfFlatIndex indexParams *ivf_flat.IndexParams - dataset *cuvs.Tensor[T] - centroids *cuvs.Tensor[T] + nlist int + dim int + vectors [][]T } -func (c *GpuClusterer[T]) InitCentroids() error { +func (c *GpuClusterer[T]) InitCentroids(ctx context.Context) error { return nil } -func (c *GpuClusterer[T]) Cluster() (any, error) { +func (c *GpuClusterer[T]) Cluster(ctx context.Context) (any, error) { - if _, err := c.dataset.ToDevice(c.resource); err != nil { + resource, err := cuvs.NewResource(nil) + if err != nil { return nil, err } + defer resource.Close() - if err := ivf_flat.BuildIndex(*c.resource, c.indexParams, c.dataset, c.index); err != nil { + dataset, err := cuvs.NewTensor(c.vectors) + if err != nil { return nil, err } + defer dataset.Close() - if err := c.resource.Sync(); err != nil { + index, err := ivf_flat.CreateIndex(c.indexParams, &dataset) + if err != nil { return nil, err } + defer index.Close() - nlist, err := ivf_flat.GetNLists(c.index) - if err != nil { + if _, err := dataset.ToDevice(&resource); err != nil { return nil, err } - dim, err := ivf_flat.GetDim(c.index) + centers, err := cuvs.NewTensorOnDevice[T](&resource, []int64{int64(c.nlist), int64(c.dim)}) if err != nil { return nil, err } + defer centers.Close() - centers, err := cuvs.NewTensorOnDevice[T](c.resource, 
[]int64{int64(nlist), int64(dim)}) - if err != nil { + if err := ivf_flat.BuildIndex(resource, c.indexParams, &dataset, index); err != nil { + return nil, err + } + + if err := resource.Sync(); err != nil { return nil, err } - c.centroids = ¢ers - if _, err := centers.ToDevice(c.resource); err != nil { + if err := ivf_flat.GetCenters(index, ¢ers); err != nil { return nil, err } - if err := ivf_flat.GetCenters(c.index, ¢ers); err != nil { + if _, err := centers.ToHost(&resource); err != nil { return nil, err } - if _, err := centers.ToHost(c.resource); err != nil { + if err := resource.Sync(); err != nil { return nil, err } @@ -93,23 +105,9 @@ func (c *GpuClusterer[T]) SSE() (float64, error) { } func (c *GpuClusterer[T]) Close() error { - if c.indexParams != nil { c.indexParams.Close() } - if c.dataset != nil { - c.dataset.Close() - - } - if c.resource != nil { - c.resource.Close() - } - if c.index != nil { - c.index.Close() - } - if c.centroids != nil { - c.centroids.Close() - } return nil } @@ -140,11 +138,12 @@ func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, case [][]float32: c := &GpuClusterer[float32]{} - resources, err := cuvs.NewResource(nil) - if err != nil { - return nil, err + c.nlist = clusterCnt + if len(vectors) == 0 { + return nil, moerr.NewInternalErrorNoCtx("empty dataset") } - c.resource = &resources + c.vectors = vecs + c.dim = len(vecs[0]) indexParams, err := ivf_flat.CreateIndexParams() if err != nil { @@ -155,15 +154,6 @@ func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, indexParams.SetKMeansNIters(uint32(maxIterations)) indexParams.SetKMeansTrainsetFraction(1) // train all sample c.indexParams = indexParams - - dataset, err := cuvs.NewTensor(vecs) - if err != nil { - return nil, err - } - c.dataset = &dataset - - c.index, _ = ivf_flat.CreateIndex(c.indexParams, c.dataset) - return c, nil default: return elkans.NewKMeans(vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType, spherical, nworker) 
diff --git a/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go b/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go index b683cb40d178f..35eb3dfcecef8 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go @@ -17,11 +17,17 @@ package device import ( - "fmt" + //"fmt" "math/rand/v2" + "sync" "testing" + "github.com/matrixorigin/matrixone/pkg/common/mpool" + "github.com/matrixorigin/matrixone/pkg/testutil" + "github.com/matrixorigin/matrixone/pkg/vectorindex" + mobf "github.com/matrixorigin/matrixone/pkg/vectorindex/brute_force" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" "github.com/stretchr/testify/require" ) @@ -44,10 +50,86 @@ func TestGpu(t *testing.T) { centers, err := c.Cluster() require.NoError(t, err) + _, ok := centers.([][]float32) + require.True(t, ok) + + /* + for k, center := range centroids { + fmt.Printf("center[%d] = %v\n", k, center) + } + */ +} + +func TestIVFAndBruteForce(t *testing.T) { + + m := mpool.MustNewZero() + proc := testutil.NewProcessWithMPool(t, "", m) + sqlproc := sqlexec.NewSqlProcess(proc) + dimension := uint(128) + ncpu := uint(1) + limit := uint(1) + elemsz := uint(4) // float32 + + dsize := 100000 + nlist := 128 + vecs := make([][]float32, dsize) + for i := range vecs { + vecs[i] = make([]float32, dimension) + for j := range vecs[i] { + vecs[i][j] = rand.Float32() + } + } + + c, err := NewKMeans[float32](vecs, nlist, 10, 0, metric.Metric_L2Distance, 0, false, 0) + require.NoError(t, err) + + centers, err := c.Cluster() + require.NoError(t, err) + centroids, ok := centers.([][]float32) require.True(t, ok) - for k, center := range centroids { - fmt.Printf("center[%d] = %v\n", k, center) + /* + for k, center := range centroids { + fmt.Printf("center[%d] = %v\n", k, center) + } + */ + + queries := vecs[:8192] + idx, err := mobf.NewBruteForceIndex[float32](centroids, dimension, 
metric.Metric_L2sqDistance, elemsz) + require.NoError(t, err) + defer idx.Destroy() + + err = idx.Load(nil) + require.NoError(t, err) + + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + + var wg sync.WaitGroup + + for n := 0; n < 4; n++ { + + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 1000; i++ { + _, _, err := idx.Search(sqlproc, queries, rt) + require.NoError(t, err) + /* + + keys_i64, ok := keys.([]int64) + require.Equal(t, ok, true) + + for j, key := range keys_i64 { + require.Equal(t, key, int64(j)) + require.Equal(t, distances[j], float64(0)) + } + */ + // fmt.Printf("keys %v, dist %v\n", keys, distances) + } + }() } + + wg.Wait() + } diff --git a/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go b/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go new file mode 100644 index 0000000000000..17d89be59a97a --- /dev/null +++ b/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go @@ -0,0 +1,264 @@ +//go:build gpu + +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package device + +import ( + //"fmt" + "math/rand/v2" + "sync" + "testing" + //"os" + + "github.com/stretchr/testify/require" + + cuvs "github.com/rapidsai/cuvs/go" + "github.com/rapidsai/cuvs/go/brute_force" + "github.com/rapidsai/cuvs/go/ivf_flat" +) + +func getCenters(vecs [][]float32, dim int, clusterCnt int, distanceType cuvs.Distance, maxIterations int) ([][]float32, error) { + + resource, err := cuvs.NewResource(nil) + if err != nil { + return nil, err + } + defer resource.Close() + + indexParams, err := ivf_flat.CreateIndexParams() + if err != nil { + return nil, err + } + defer indexParams.Close() + + indexParams.SetNLists(uint32(clusterCnt)) + indexParams.SetMetric(distanceType) + indexParams.SetKMeansNIters(uint32(maxIterations)) + indexParams.SetKMeansTrainsetFraction(1) // train all sample + + dataset, err := cuvs.NewTensor(vecs) + if err != nil { + return nil, err + } + defer dataset.Close() + + index, _ := ivf_flat.CreateIndex(indexParams, &dataset) + defer index.Close() + + if _, err := dataset.ToDevice(&resource); err != nil { + return nil, err + } + + centers, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(clusterCnt), int64(dim)}) + if err != nil { + return nil, err + } + + if err := ivf_flat.BuildIndex(resource, indexParams, &dataset, index); err != nil { + return nil, err + } + + if err := resource.Sync(); err != nil { + return nil, err + } + + if err := ivf_flat.GetCenters(index, ¢ers); err != nil { + return nil, err + } + + if _, err := centers.ToHost(&resource); err != nil { + return nil, err + } + + if err := resource.Sync(); err != nil { + return nil, err + } + + result, err := centers.Slice() + if err != nil { + return nil, err + } + + return result, nil + +} + +func Search(datasetvec [][]float32, queriesvec [][]float32, limit uint, distanceType cuvs.Distance) (retkeys any, retdistances []float64, err error) { + //os.Stderr.WriteString(fmt.Sprintf("probe set %d\n", len(queriesvec))) + //os.Stderr.WriteString("brute 
force index search start\n") + + resource, err := cuvs.NewResource(nil) + if err != nil { + return + } + defer resource.Close() + + dataset, err := cuvs.NewTensor(datasetvec) + if err != nil { + return + } + defer dataset.Close() + + index, err := brute_force.CreateIndex() + if err != nil { + return + } + defer index.Close() + + queries, err := cuvs.NewTensor(queriesvec) + if err != nil { + return + } + defer queries.Close() + + neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, []int64{int64(len(queriesvec)), int64(limit)}) + if err != nil { + return + } + defer neighbors.Close() + + distances, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(len(queriesvec)), int64(limit)}) + if err != nil { + return + } + defer distances.Close() + + if _, err = dataset.ToDevice(&resource); err != nil { + return + } + + if err = resource.Sync(); err != nil { + return + } + + err = brute_force.BuildIndex(resource, &dataset, distanceType, 2.0, index) + if err != nil { + //os.Stderr.WriteString(fmt.Sprintf("BruteForceIndex: build index failed %v\n", err)) + //os.Stderr.WriteString(fmt.Sprintf("BruteForceIndex: build index failed centers %v\n", datasetvec)) + return + } + + if err = resource.Sync(); err != nil { + return + } + //os.Stderr.WriteString("built brute force index\n") + + if _, err = queries.ToDevice(&resource); err != nil { + return + } + + //os.Stderr.WriteString("brute force index search Runing....\n") + err = brute_force.SearchIndex(resource, *index, &queries, &neighbors, &distances) + if err != nil { + return + } + //os.Stderr.WriteString("brute force index search finished Runing....\n") + + if _, err = neighbors.ToHost(&resource); err != nil { + return + } + //os.Stderr.WriteString("brute force index search neighbour to host done....\n") + + if _, err = distances.ToHost(&resource); err != nil { + return + } + //os.Stderr.WriteString("brute force index search distances to host done....\n") + + if err = resource.Sync(); err != nil { + return + } 
+ + //os.Stderr.WriteString("brute force index search return result....\n") + neighborsSlice, err := neighbors.Slice() + if err != nil { + return + } + + distancesSlice, err := distances.Slice() + if err != nil { + return + } + + //fmt.Printf("flattened %v\n", flatten) + retdistances = make([]float64, len(distancesSlice)*int(limit)) + for i := range distancesSlice { + for j, dist := range distancesSlice[i] { + retdistances[i*int(limit)+j] = float64(dist) + } + } + + keys := make([]int64, len(neighborsSlice)*int(limit)) + for i := range neighborsSlice { + for j, key := range neighborsSlice[i] { + keys[i*int(limit)+j] = int64(key) + } + } + retkeys = keys + //os.Stderr.WriteString("brute force index search RETURN NOW....\n") + return +} + +func TestIvfAndBruteForceForIssue(t *testing.T) { + + dimension := uint(128) + limit := uint(1) + /* + ncpu := uint(1) + elemsz := uint(4) // float32 + */ + + dsize := 100000 + nlist := 128 + vecs := make([][]float32, dsize) + for i := range vecs { + vecs[i] = make([]float32, dimension) + for j := range vecs[i] { + vecs[i][j] = rand.Float32() + } + } + queries := vecs[:8192] + + centers, err := getCenters(vecs, int(dimension), nlist, cuvs.DistanceL2, 10) + require.NoError(t, err) + + var wg sync.WaitGroup + + for n := 0; n < 4; n++ { + + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 1000; i++ { + _, _, err := Search(centers, queries, limit, cuvs.DistanceL2) + require.NoError(t, err) + + /* + keys_i64, ok := keys.([]int64) + require.Equal(t, ok, true) + + for j, key := range keys_i64 { + require.Equal(t, key, int64(j)) + require.Equal(t, distances[j], float64(0)) + } + */ + // fmt.Printf("keys %v, dist %v\n", keys, distances) + } + }() + } + + wg.Wait() + +} diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer.go b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer.go index b749f7d9d6509..da9d514af0b5e 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer.go +++ 
b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer.go @@ -15,12 +15,13 @@ package elkans import ( + "context" "math" "math/rand" "runtime" - "sync" "sync/atomic" + "github.com/matrixorigin/matrixone/pkg/common/concurrent" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/logutil" @@ -148,7 +149,7 @@ func (km *ElkanClusterer[T]) Close() error { } // InitCentroids initializes the centroids using initialization algorithms like random or kmeans++. -func (km *ElkanClusterer[T]) InitCentroids() error { +func (km *ElkanClusterer[T]) InitCentroids(ctx context.Context) error { var initializer Initializer switch km.initType { case kmeans.Random: @@ -158,7 +159,7 @@ func (km *ElkanClusterer[T]) InitCentroids() error { default: initializer = NewRandomInitializer() } - anycentroids, err := initializer.InitCentroids(km.vectorList, km.clusterCnt) + anycentroids, err := initializer.InitCentroids(ctx, km.vectorList, km.clusterCnt) if err != nil { return err } @@ -172,7 +173,7 @@ func (km *ElkanClusterer[T]) InitCentroids() error { } // Cluster returns the final centroids and the error if any. 
-func (km *ElkanClusterer[T]) Cluster() (any, error) { +func (km *ElkanClusterer[T]) Cluster(ctx context.Context) (any, error) { if km.normalize { for i := range km.vectorList { metric.NormalizeL2(km.vectorList[i], km.vectorList[i]) @@ -183,29 +184,29 @@ func (km *ElkanClusterer[T]) Cluster() (any, error) { return km.vectorList, nil } - err := km.InitCentroids() // step 0.1 + err := km.InitCentroids(ctx) // step 0.1 if err != nil { return nil, err } - km.initBounds() // step 0.2 + km.initBounds(ctx) // step 0.2 - return km.elkansCluster() + return km.elkansCluster(ctx) } -func (km *ElkanClusterer[T]) elkansCluster() ([][]T, error) { +func (km *ElkanClusterer[T]) elkansCluster(ctx context.Context) ([][]T, error) { for iter := 0; ; iter++ { - km.computeCentroidDistances() // step 1 + km.computeCentroidDistances(ctx) // step 1 - changes, err := km.assignData() // step 2 and 3 + changes, err := km.assignData(ctx) // step 2 and 3 if err != nil { return nil, err } - newCentroids := km.recalculateCentroids() // step 4 + newCentroids := km.recalculateCentroids(ctx) // step 4 - km.updateBounds(newCentroids) // step 5 and 6 + km.updateBounds(ctx, newCentroids) // step 5 and 6 km.centroids = newCentroids // step 7 @@ -259,7 +260,7 @@ func validateArgs[T types.RealNumbers](vectorList [][]T, clusterCnt, } // initBounds initializes the lower bounds, upper bound and assignment for each vector. -func (km *ElkanClusterer[T]) initBounds() error { +func (km *ElkanClusterer[T]) initBounds(ctx context.Context) (err error) { // step 0.2 // Set the lower bound l(x, c)=0 for each point x and center c. 
// Assign each x to its closest initial center c(x)=min{ d(x, c) }, using Lemma 1 to avoid @@ -271,85 +272,86 @@ func (km *ElkanClusterer[T]) initBounds() error { ncpu = len(km.vectorList) } - errs := make(chan error, ncpu) - var wg sync.WaitGroup + exec := concurrent.NewThreadPoolExecutor(ncpu) + err = exec.Execute( + ctx, + len(km.vectorList), + func(ctx context.Context, thread_id int, start, end int) (err2 error) { + subvec := km.vectorList[start:end] + submetas := km.vectorMetas[start:end] + subassigns := km.assignments[start:end] - for n := 0; n < ncpu; n++ { - wg.Add(1) - go func(tid int) { - defer wg.Done() - for x := range km.vectorList { - if x%ncpu != tid { - continue + for x := range subvec { + + if x%100 == 0 && ctx.Err() != nil { + return ctx.Err() } + minDist := metric.MaxFloat[T]() closestCenter := 0 for c := range km.centroids { - dist, err := km.distFn(km.vectorList[x], km.centroids[c]) - if err != nil { - errs <- err - return + dist, err2 := km.distFn(subvec[x], km.centroids[c]) + if err2 != nil { + return err2 } - km.vectorMetas[x].lower[c] = dist + submetas[x].lower[c] = dist if dist < minDist { minDist = dist closestCenter = c } } - km.vectorMetas[x].upper = minDist - km.assignments[x] = closestCenter + submetas[x].upper = minDist + subassigns[x] = closestCenter } - }(n) - } + return + }) - wg.Wait() - if len(errs) > 0 { - return <-errs - } - return nil + return } // computeCentroidDistances computes the centroid distances and the min centroid distances. // NOTE: here we are save 0.5 of centroid distance to avoid 0.5 multiplication in step 3(iii) and 3.b. -func (km *ElkanClusterer[T]) computeCentroidDistances() error { +func (km *ElkanClusterer[T]) computeCentroidDistances(ctx context.Context) error { // step 1.a // For all centers c and c', compute 0.5 x d(c, c'). 
- var wg sync.WaitGroup ncpu := km.nworker if km.clusterCnt < ncpu { ncpu = km.clusterCnt } - errs := make(chan error, ncpu) - for n := 0; n < ncpu; n++ { - wg.Add(1) - go func(tid int) { - defer wg.Done() - for i := 0; i < km.clusterCnt; i++ { - if i%ncpu != tid { - continue + exec := concurrent.NewThreadPoolExecutor(ncpu) + err := exec.Execute( + ctx, + km.clusterCnt, + func(ctx context.Context, thread_id int, start, end int) error { + subcentroids := km.centroids[start:end] + + for x := range subcentroids { + + if x%100 == 0 && ctx.Err() != nil { + return ctx.Err() } + + i := start + x for j := i + 1; j < km.clusterCnt; j++ { - dist, err := km.distFn(km.centroids[i], km.centroids[j]) - if err != nil { - errs <- err - return + dist, err2 := km.distFn(subcentroids[x], km.centroids[j]) + if err2 != nil { + return err2 } dist *= 0.5 km.halfInterCentroidDistMatrix[i][j] = dist km.halfInterCentroidDistMatrix[j][i] = dist - } } - }(n) - } - wg.Wait() - if len(errs) > 0 { - return <-errs + return nil + }) + + if err != nil { + return err } // step 1.b @@ -370,29 +372,31 @@ func (km *ElkanClusterer[T]) computeCentroidDistances() error { // assignData assigns each vector to the nearest centroid. // This is the place where most of the "distance computation skipping" happens. 
-func (km *ElkanClusterer[T]) assignData() (int, error) { +func (km *ElkanClusterer[T]) assignData(ctx context.Context) (int, error) { var changes atomic.Int64 ncpu := km.nworker if len(km.vectorList) < ncpu { ncpu = len(km.vectorList) } - errs := make(chan error, ncpu) - - var wg sync.WaitGroup - for n := 0; n < ncpu; n++ { - wg.Add(1) - go func(tid int) { - defer wg.Done() + exec := concurrent.NewThreadPoolExecutor(ncpu) + err := exec.Execute( + ctx, + len(km.vectorList), + func(ctx context.Context, thread_id int, start, end int) (err2 error) { + subvec := km.vectorList[start:end] + submetas := km.vectorMetas[start:end] + subassigns := km.assignments[start:end] - for currVector := range km.vectorList { + for currVector := range subvec { - if currVector%ncpu != tid { - continue + if currVector%100 == 0 && ctx.Err() != nil { + return ctx.Err() } + // step 2 // u(x) <= s(c(x)) - if km.vectorMetas[currVector].upper <= km.minHalfInterCentroidDist[km.assignments[currVector]] { + if submetas[currVector].upper <= km.minHalfInterCentroidDist[subassigns[currVector]] { continue } @@ -402,73 +406,68 @@ func (km *ElkanClusterer[T]) assignData() (int, error) { // (i) c != c(x) and // (ii) u(x)>l(x, c) and // (iii) u(x)> 0.5 x d(c(x), c) - if c != km.assignments[currVector] && - km.vectorMetas[currVector].upper > km.vectorMetas[currVector].lower[c] && - km.vectorMetas[currVector].upper > km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] { + if c != subassigns[currVector] && + submetas[currVector].upper > submetas[currVector].lower[c] && + submetas[currVector].upper > km.halfInterCentroidDistMatrix[subassigns[currVector]][c] { //step 3.a - Bounds update // If r(x) then compute d(x, c(x)) and assign r(x)= false. 
var dxcx T - if km.vectorMetas[currVector].recompute { - var err error - km.vectorMetas[currVector].recompute = false - - dxcx, err = km.distFn(km.vectorList[currVector], km.centroids[km.assignments[currVector]]) - if err != nil { - errs <- err - return + if submetas[currVector].recompute { + submetas[currVector].recompute = false + + dxcx, err2 = km.distFn(subvec[currVector], km.centroids[subassigns[currVector]]) + if err2 != nil { + return err2 } - km.vectorMetas[currVector].upper = dxcx - km.vectorMetas[currVector].lower[km.assignments[currVector]] = dxcx + submetas[currVector].upper = dxcx + submetas[currVector].lower[subassigns[currVector]] = dxcx - if km.vectorMetas[currVector].upper <= km.vectorMetas[currVector].lower[c] { + if submetas[currVector].upper <= submetas[currVector].lower[c] { continue // Pruned by triangle inequality on lower bound. } - if km.vectorMetas[currVector].upper <= km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] { + if submetas[currVector].upper <= km.halfInterCentroidDistMatrix[subassigns[currVector]][c] { continue // Pruned by triangle inequality on cluster distances. } } else { - dxcx = km.vectorMetas[currVector].upper // Otherwise, d(x, c(x))=u(x). + dxcx = submetas[currVector].upper // Otherwise, d(x, c(x))=u(x). 
} //step 3.b - Update // If d(x, c(x))>l(x, c) or d(x, c(x))> 0.5 d(c(x), c) then // Compute d(x, c) // If d(x, c) km.vectorMetas[currVector].lower[c] || - dxcx > km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] { + if dxcx > submetas[currVector].lower[c] || + dxcx > km.halfInterCentroidDistMatrix[subassigns[currVector]][c] { - dxc, err := km.distFn(km.vectorList[currVector], km.centroids[c]) // d(x,c) in the paper - if err != nil { - errs <- err - return + dxc, err2 := km.distFn(subvec[currVector], km.centroids[c]) // d(x,c) in the paper + if err2 != nil { + return err2 } - km.vectorMetas[currVector].lower[c] = dxc + submetas[currVector].lower[c] = dxc if dxc < dxcx { - km.vectorMetas[currVector].upper = dxc - km.assignments[currVector] = c + submetas[currVector].upper = dxc + subassigns[currVector] = c changes.Add(1) } } } } } - }(n) - } + return + }) - wg.Wait() - - if len(errs) > 0 { - return 0, <-errs + if err != nil { + return 0, err } return int(changes.Load()), nil } // recalculateCentroids calculates the new mean centroids based on the new assignments. -func (km *ElkanClusterer[T]) recalculateCentroids() [][]T { +func (km *ElkanClusterer[T]) recalculateCentroids(ctx context.Context) [][]T { membersCount := make([]int64, km.clusterCnt) newCentroids := make([][]T, km.clusterCnt) @@ -514,7 +513,7 @@ func (km *ElkanClusterer[T]) recalculateCentroids() [][]T { } // updateBounds updates the lower and upper bounds for each vector. -func (km *ElkanClusterer[T]) updateBounds(newCentroid [][]T) (err error) { +func (km *ElkanClusterer[T]) updateBounds(ctx context.Context, newCentroid [][]T) (err error) { // compute the centroid shift distance matrix once. 
// d(c', m(c')) in the paper diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_bench_test.go b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_bench_test.go index a52e55c862588..465e0a4fcddc5 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_bench_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_bench_test.go @@ -15,6 +15,7 @@ package elkans import ( + "context" "math/rand" "strconv" "testing" @@ -39,6 +40,7 @@ func Benchmark_kmeans(b *testing.B) { // Level: "debug", // Format: "console", //}) + ctx := context.Background() rowCnt := 1000_000 k := 1000 @@ -53,7 +55,7 @@ func Benchmark_kmeans(b *testing.B) { clusterRand, _ := NewKMeans(data, k, 500, 0.01, metric.Metric_L2Distance, kmeans.Random, true, 0) - _, err := clusterRand.Cluster() + _, err := clusterRand.Cluster(ctx) if err != nil { panic(err) } @@ -71,7 +73,7 @@ func Benchmark_kmeans(b *testing.B) { kmeansPlusPlus, _ := NewKMeans(data, k, 500, 0.01, metric.Metric_L2Distance, kmeans.KmeansPlusPlus, true, 0) - _, err := kmeansPlusPlus.Cluster() + _, err := kmeansPlusPlus.Cluster(ctx) if err != nil { panic(err) } diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_test.go b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_test.go index ebbd3d585504e..6642ce7fc7fd9 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_test.go @@ -15,6 +15,7 @@ package elkans import ( + "context" "reflect" "testing" @@ -161,6 +162,8 @@ func Test_ClusterError(t *testing.T) { wantErr: false, }, } + ctx := context.Background() + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { clusterer, _ := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, @@ -169,7 +172,7 @@ func Test_ClusterError(t *testing.T) { elkan, ok := clusterer.(*ElkanClusterer[float64]) require.True(t, ok) elkan.distFn = FakeDistance[float64] - _, err := clusterer.Cluster() + _, err := clusterer.Cluster(ctx) require.NotNil(t, err) }) } 
@@ -229,6 +232,7 @@ func Test_InitBoundsError(t *testing.T) { wantErr: false, }, } + ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { clusterer, _ := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, @@ -236,10 +240,10 @@ func Test_InitBoundsError(t *testing.T) { tt.fields.distType, tt.fields.initType, false, 0) elkan, ok := clusterer.(*ElkanClusterer[float64]) require.True(t, ok) - _, err := clusterer.Cluster() + _, err := clusterer.Cluster(ctx) require.Nil(t, err) elkan.distFn = FakeDistance[float64] - err = elkan.initBounds() + err = elkan.initBounds(ctx) require.NotNil(t, err) }) } @@ -299,6 +303,7 @@ func Test_ComputeCentroidDistancesError(t *testing.T) { wantErr: false, }, } + ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { clusterer, _ := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, @@ -306,10 +311,10 @@ func Test_ComputeCentroidDistancesError(t *testing.T) { tt.fields.distType, tt.fields.initType, false, 0) elkan, ok := clusterer.(*ElkanClusterer[float64]) require.True(t, ok) - _, err := clusterer.Cluster() + _, err := clusterer.Cluster(ctx) require.Nil(t, err) elkan.distFn = FakeDistance[float64] - err = elkan.computeCentroidDistances() + err = elkan.computeCentroidDistances(ctx) require.NotNil(t, err) }) } @@ -369,6 +374,7 @@ func Test_SSEError(t *testing.T) { wantErr: false, }, } + ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { clusterer, _ := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, @@ -376,7 +382,7 @@ func Test_SSEError(t *testing.T) { tt.fields.distType, tt.fields.initType, false, 0) elkan, ok := clusterer.(*ElkanClusterer[float64]) require.True(t, ok) - _, err := clusterer.Cluster() + _, err := clusterer.Cluster(ctx) require.Nil(t, err) elkan.distFn = FakeDistance[float64] _, err = elkan.SSE() @@ -439,12 +445,13 @@ func Test_Cluster(t *testing.T) { wantErr: false, }, } + 
ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { clusterer, _ := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, tt.fields.maxIterations, tt.fields.deltaThreshold, tt.fields.distType, tt.fields.initType, false, 0) - _got, err := clusterer.Cluster() + _got, err := clusterer.Cluster(ctx) if (err != nil) != tt.wantErr { t.Errorf("Cluster() error = %v, wantErr %v", err, tt.wantErr) return @@ -542,6 +549,7 @@ func TestElkanClusterer_initBounds(t *testing.T) { }, }, } + ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { km, err := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, @@ -555,7 +563,7 @@ func TestElkanClusterer_initBounds(t *testing.T) { } if ekm, ok := km.(*ElkanClusterer[float64]); ok { ekm.centroids = tt.state.centroids - ekm.initBounds() + ekm.initBounds(ctx) if !reflect.DeepEqual(ekm.assignments, tt.want.assignment) { t.Errorf("assignments got = %v, want %v", ekm.assignments, tt.want.assignment) } @@ -631,6 +639,7 @@ func TestElkanClusterer_computeCentroidDistances(t *testing.T) { }, }, } + ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { km, err := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, @@ -642,7 +651,7 @@ func TestElkanClusterer_computeCentroidDistances(t *testing.T) { } if ekm, ok := km.(*ElkanClusterer[float64]); ok { ekm.centroids = tt.state.centroids - ekm.computeCentroidDistances() + ekm.computeCentroidDistances(ctx) // NOTE: here we are not considering the vectors in the vectorList. Hence we don't need to worry about // the normalization impact. Here we are only testing the working of computeCentroidDistances() function. 
@@ -715,6 +724,7 @@ func TestElkanClusterer_recalculateCentroids(t *testing.T) { }, }, } + ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { km, err := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, @@ -730,7 +740,7 @@ func TestElkanClusterer_recalculateCentroids(t *testing.T) { // NOTE: here km.Normalize() is skipped as we not calling km.Cluster() in this test. // Here we are only testing the working of recalculateCentroids() function. - got := ekm.recalculateCentroids() + got := ekm.recalculateCentroids(ctx) if !assertx.InEpsilonF64Slices(tt.want.centroids, got) { t.Errorf("centroids got = %v, want %v", got, tt.want.centroids) } @@ -854,6 +864,7 @@ func TestElkanClusterer_updateBounds(t *testing.T) { }, }, } + ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { km, err := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, @@ -869,7 +880,7 @@ func TestElkanClusterer_updateBounds(t *testing.T) { // NOTE: here km.Normalize() is skipped as we not calling km.Cluster() in this test. // Here we are only testing the working of updateBounds() function. - ekm.updateBounds(tt.state.newCentroids) + ekm.updateBounds(ctx, tt.state.newCentroids) for i := 0; i < len(tt.want.vectorMetas); i++ { if !assertx.InEpsilonF64Slice(tt.want.vectorMetas[i].lower, ekm.vectorMetas[i].lower) { @@ -1004,6 +1015,7 @@ func TestElkanClusterer_updateBounds_Error(t *testing.T) { }, }, } + ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { km, err := NewKMeans[float64](tt.fields.vectorList, tt.fields.clusterCnt, @@ -1020,7 +1032,7 @@ func TestElkanClusterer_updateBounds_Error(t *testing.T) { // NOTE: here km.Normalize() is skipped as we not calling km.Cluster() in this test. // Here we are only testing the working of updateBounds() function. 
- err := ekm.updateBounds(tt.state.newCentroids) + err := ekm.updateBounds(ctx, tt.state.newCentroids) require.NotNil(t, err) } else if !ok { t.Errorf("km not of type ElkanClusterer") diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/initializer.go b/pkg/vectorindex/ivfflat/kmeans/elkans/initializer.go index f7410d7e2d862..966452ef123b0 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/initializer.go +++ b/pkg/vectorindex/ivfflat/kmeans/elkans/initializer.go @@ -15,10 +15,12 @@ package elkans import ( + "context" "math/rand" "runtime" "sync" + "github.com/matrixorigin/matrixone/pkg/common/concurrent" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans" @@ -26,7 +28,7 @@ import ( ) type Initializer interface { - InitCentroids(vectors any, k int) (centroids any, err error) + InitCentroids(ctx context.Context, vectors any, k int) (centroids any, err error) } // var _ Initializer = (*Random)(nil) @@ -42,7 +44,7 @@ func NewRandomInitializer() Initializer { } } -func (r *Random) InitCentroids(vectors any, k int) (_centroids any, _err error) { +func (r *Random) InitCentroids(ctx context.Context, vectors any, k int) (_centroids any, _err error) { switch _vecs := vectors.(type) { case [][]float32: @@ -85,7 +87,7 @@ func NewKMeansPlusPlusInitializer[T types.RealNumbers](distFn metric.DistanceFun } } -func (kpp *KMeansPlusPlus[T]) InitCentroids(_vectors any, k int) (_centroids any, _err error) { +func (kpp *KMeansPlusPlus[T]) InitCentroids(ctx context.Context, _vectors any, k int) (_centroids any, _err error) { vectors, ok := _vectors.([][]T) if !ok { @@ -108,50 +110,48 @@ func (kpp *KMeansPlusPlus[T]) InitCentroids(_vectors any, k int) (_centroids any ncpu = numSamples } - errs := make(chan error, ncpu) for nextCentroidIdx := 1; nextCentroidIdx < k; nextCentroidIdx++ { // 2. 
for each data point, compute the min distance to the existing centers var totalDistToExistingCenters T - var wg sync.WaitGroup var mutex sync.Mutex - for n := 0; n < ncpu; n++ { - wg.Add(1) + exec := concurrent.NewThreadPoolExecutor(ncpu) + err := exec.Execute( + ctx, + len(vectors), + func(ctx context.Context, thread_id int, start, end int) (err2 error) { + subvec := vectors[start:end] + subdist := distances[start:end] - go func(tid int) { - defer wg.Done() + for i := range subvec { - for vecIdx := range vectors { - - if vecIdx%ncpu != tid { - continue + if i%100 == 0 && ctx.Err() != nil { + return ctx.Err() } // this is a deviation from standard kmeans.here we are not using minDistance to all the existing centers. // This code was very slow: https://github.com/matrixorigin/matrixone/blob/77ff1452bd56cd93a10e3327632adebdbaf279cb/pkg/sql/plan/function/functionAgg/algos/kmeans/elkans/initializer.go#L81-L86 // but instead we are using the distance to the last center that was chosen. - distance, err := kpp.distFn(vectors[vecIdx], centroids[nextCentroidIdx-1]) - if err != nil { - errs <- err - return + distance, err2 := kpp.distFn(subvec[i], centroids[nextCentroidIdx-1]) + if err2 != nil { + return err2 } distance *= distance mutex.Lock() - if distance < distances[vecIdx] { - distances[vecIdx] = distance + if distance < subdist[i] { + subdist[i] = distance } - totalDistToExistingCenters += distances[vecIdx] + totalDistToExistingCenters += subdist[i] mutex.Unlock() } - }(n) - } - wg.Wait() + return + }) - if len(errs) > 0 { - return nil, <-errs + if err != nil { + return nil, err } // 3. 
choose the next random center, using a weighted probability distribution diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/initializer_test.go b/pkg/vectorindex/ivfflat/kmeans/elkans/initializer_test.go index 32ef8dc1e87eb..51ff1c5549144 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/initializer_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/elkans/initializer_test.go @@ -15,6 +15,7 @@ package elkans import ( + "context" "reflect" "testing" @@ -59,10 +60,12 @@ func TestRandom_InitCentroids(t *testing.T) { }, }, } + ctx := context.Background() + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { r := NewRandomInitializer() - _gotCentroids, err := r.InitCentroids(tt.args.vectors, tt.args.k) + _gotCentroids, err := r.InitCentroids(ctx, tt.args.vectors, tt.args.k) require.Nil(t, err) gotCentroids, ok := _gotCentroids.([][]float64) require.True(t, ok) @@ -110,10 +113,11 @@ func TestKMeansPlusPlus_InitCentroids(t *testing.T) { }, }, } + ctx := context.Background() for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { r := NewKMeansPlusPlusInitializer[float64](metric.L2Distance[float64]) - _gotCentroids, err := r.InitCentroids(tt.args.vectors, tt.args.k) + _gotCentroids, err := r.InitCentroids(ctx, tt.args.vectors, tt.args.k) require.Nil(t, err) gotCentroids := _gotCentroids.([][]float64) if !reflect.DeepEqual(gotCentroids, tt.wantCentroids) { @@ -138,6 +142,7 @@ func Benchmark_InitCentroids(b *testing.B) { rowCnt := 10_000 dims := 1024 k := 10 + ctx := context.Background() data := make([][]float64, rowCnt) populateRandData(rowCnt, dims, data) @@ -148,7 +153,7 @@ func Benchmark_InitCentroids(b *testing.B) { b.Run("RANDOM", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - _, err := random.InitCentroids(data, k) + _, err := random.InitCentroids(ctx, data, k) require.Nil(b, err) } }) @@ -156,7 +161,7 @@ func Benchmark_InitCentroids(b *testing.B) { b.Run("KMEANS++", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - _, err 
:= kmeanspp.InitCentroids(data, k) + _, err := kmeanspp.InitCentroids(ctx, data, k) require.Nil(b, err) } }) diff --git a/pkg/vectorindex/ivfflat/kmeans/types.go b/pkg/vectorindex/ivfflat/kmeans/types.go index 9845784f46c8b..12f35953e6539 100644 --- a/pkg/vectorindex/ivfflat/kmeans/types.go +++ b/pkg/vectorindex/ivfflat/kmeans/types.go @@ -14,11 +14,13 @@ package kmeans +import "context" + const DefaultRandSeed = 1 type Clusterer interface { - InitCentroids() error - Cluster() (any, error) + InitCentroids(context.Context) error + Cluster(context.Context) (any, error) SSE() (float64, error) Close() error } diff --git a/pkg/vectorindex/ivfflat/search.go b/pkg/vectorindex/ivfflat/search.go index 7335b03876998..26b0120d4c5b3 100644 --- a/pkg/vectorindex/ivfflat/search.go +++ b/pkg/vectorindex/ivfflat/search.go @@ -15,7 +15,6 @@ package ivfflat import ( - "container/heap" "fmt" "strconv" @@ -25,6 +24,7 @@ import ( "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/container/vector" "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/brute_force" "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" @@ -32,15 +32,10 @@ import ( var runSql = sqlexec.RunSql -type Centroid[T types.RealNumbers] struct { - Id int64 - Vec []T -} - // Ivf search index struct to hold the usearch index type IvfflatSearchIndex[T types.RealNumbers] struct { Version int64 - Centroids []Centroid[T] + Centroids cache.VectorIndexSearchIf } // This is the Ivf search implementation that implement VectorIndexSearchIf interface @@ -74,7 +69,9 @@ func (idx *IvfflatSearchIndex[T]) LoadIndex(proc *sqlexec.SqlProcess, idxcfg vec return nil } - idx.Centroids = make([]Centroid[T], 0, idxcfg.Ivfflat.Lists) + ncenters := 0 + centroids := make([][]T, idxcfg.Ivfflat.Lists) + elemsz := 
res.Batches[0].Vecs[1].GetType().GetArrayElementSize() for _, bat := range res.Batches { idVec := bat.Vecs[0] faVec := bat.Vecs[1] @@ -82,57 +79,52 @@ func (idx *IvfflatSearchIndex[T]) LoadIndex(proc *sqlexec.SqlProcess, idxcfg vec hasNull := faVec.HasNull() for i, id := range ids { if hasNull && faVec.IsNull(uint64(i)) { - //os.Stderr.WriteString("Centroid is NULL\n") continue } val := faVec.GetStringAt(i) vec := types.BytesToArray[T](util.UnsafeStringToBytes(val)) - idx.Centroids = append(idx.Centroids, Centroid[T]{Id: id, Vec: vec}) + centroids[id] = vec + ncenters += 1 } } + if ncenters == 0 { + return nil + } + + if uint(ncenters) != idxcfg.Ivfflat.Lists { + return moerr.NewInternalErrorNoCtx("number of centroids in db != Nlist") + } + + bfidx, err := brute_force.NewBruteForceIndex[T](centroids, idxcfg.Ivfflat.Dimensions, metric.MetricType(idxcfg.Ivfflat.Metric), uint(elemsz)) + if err != nil { + return err + } + err = bfidx.Load(proc) + if err != nil { + return err + } + + idx.Centroids = bfidx //os.Stderr.WriteString(fmt.Sprintf("%d centroids loaded... 
lists = %d, centroid %v\n", len(idx.Centroids), idxcfg.Ivfflat.Lists, idx.Centroids)) return nil } func (idx *IvfflatSearchIndex[T]) findCentroids(sqlproc *sqlexec.SqlProcess, query []T, distfn metric.DistanceFunction[T], _ vectorindex.IndexConfig, probe uint, _ int64) ([]int64, error) { - if len(idx.Centroids) == 0 { + if idx.Centroids == nil { // empty index has id = 1 return []int64{1}, nil } - hp := make(vectorindex.SearchResultMaxHeap, 0, int(probe)) - for _, c := range idx.Centroids { - dist, err := distfn(query, c.Vec) - if err != nil { - return nil, err - } - dist64 := float64(dist) - - if len(hp) >= int(probe) { - if hp[0].GetDistance() > dist64 { - hp[0] = &vectorindex.SearchResult{Id: c.Id, Distance: dist64} - heap.Fix(&hp, 0) - } - } else { - heap.Push(&hp, &vectorindex.SearchResult{Id: c.Id, Distance: dist64}) - } - } - - n := hp.Len() - res := make([]int64, 0, n) - for range n { - srif := heap.Pop(&hp) - sr, ok := srif.(*vectorindex.SearchResult) - if !ok { - return nil, moerr.NewInternalError(sqlproc.GetContext(), "findCentroids: heap return key is not int64") - } - res = append(res, sr.Id) + queries := [][]T{query} + rt := vectorindex.RuntimeConfig{Limit: probe, NThreads: 1} + keys, _, err := idx.Centroids.Search(sqlproc, queries, rt) + if err != nil { + return nil, err } - //os.Stderr.WriteString(fmt.Sprintf("probe %d... 
return centroid ids %v\n", probe, res)) - return res, nil + return keys.([]int64), nil } // Call usearch.Search @@ -220,7 +212,10 @@ func (idx *IvfflatSearchIndex[T]) Search( } func (idx *IvfflatSearchIndex[T]) Destroy() { - idx.Centroids = nil + if idx.Centroids != nil { + idx.Centroids.Destroy() + idx.Centroids = nil + } } func NewIvfflatSearch[T types.RealNumbers]( diff --git a/pkg/vectorindex/metric/distance_func.go b/pkg/vectorindex/metric/distance_func.go index 496e5e3c650e2..78192460010c2 100644 --- a/pkg/vectorindex/metric/distance_func.go +++ b/pkg/vectorindex/metric/distance_func.go @@ -19,47 +19,29 @@ import ( "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" - "gonum.org/v1/gonum/blas/blas32" - "gonum.org/v1/gonum/blas/blas64" ) +/* func L2Distance[T types.RealNumbers](v1, v2 []T) (T, error) { - switch any(v1).(type) { - case []float32: - _v1 := any(v1).([]float32) - _v2 := any(v2).([]float32) - - diff := blas32.Vector{ - N: len(_v1), - Inc: 1, - Data: make([]float32, len(_v1)), - } - - for i := range _v1 { - diff.Data[i] = _v1[i] - _v2[i] - } - return T(blas32.Nrm2(diff)), nil - - case []float64: - _v1 := any(v1).([]float64) - _v2 := any(v2).([]float64) + dist, err := L2DistanceSq(v1, v2) + if err != nil { + return dist, err + } - diff := blas64.Vector{ - N: len(_v1), - Inc: 1, - Data: make([]float64, len(_v1)), - } + return T(math.Sqrt(float64(dist))), nil +} +*/ - for i := range _v1 { - diff.Data[i] = _v1[i] - _v2[i] - } - return T(blas64.Nrm2(diff)), nil - default: - return 0, moerr.NewInternalErrorNoCtx("L2Distance type not supported") +func L2Distance[T types.RealNumbers](v1, v2 []T) (T, error) { + dist, err := L2DistanceSq(v1, v2) + if err != nil { + return dist, err } + return T(math.Sqrt(float64(dist))), nil } +/* func L2DistanceSq[T types.RealNumbers](v1, v2 []T) (T, error) { var sumOfSquares T for i := range v1 { @@ -69,89 +51,220 @@ func L2DistanceSq[T types.RealNumbers](v1, v2 []T) 
(T, error) { return sumOfSquares, nil } +*/ + +// L2SquareDistanceUnrolled calculates the L2 square distance using loop unrolling. +// This optimization can improve performance for large vectors by reducing loop +// overhead and allowing for better instruction-level parallelism. +func L2DistanceSq[T types.RealNumbers](p, q []T) (T, error) { + var sum T + n := len(p) + i := 0 + + // BCE Hint + p = p[:n] + q = q[:n] + + // Process the bulk of the data in chunks of 8. + for i <= n-8 { + d0 := p[i+0] - q[i+0] + d1 := p[i+1] - q[i+1] + d2 := p[i+2] - q[i+2] + d3 := p[i+3] - q[i+3] + d4 := p[i+4] - q[i+4] + d5 := p[i+5] - q[i+5] + d6 := p[i+6] - q[i+6] + d7 := p[i+7] - q[i+7] + + sum += d0*d0 + d1*d1 + d2*d2 + d3*d3 + d4*d4 + d5*d5 + d6*d6 + d7*d7 + i += 8 + } -func L1Distance[T types.RealNumbers](v1, v2 []T) (T, error) { - switch any(v1).(type) { - case []float32: - _v1 := any(v1).([]float32) - _v2 := any(v2).([]float32) - - diff := blas32.Vector{ - N: len(_v1), - Inc: 1, - Data: make([]float32, len(_v1)), - } - - for i := range _v1 { - diff.Data[i] = _v1[i] - _v2[i] - } + // Handle the remaining elements if the vector size is not a multiple of 8. + for i < n { + diff := p[i] - q[i] + sum += diff * diff + i++ + } - return T(blas32.Asum(diff)), nil + return sum, nil +} - case []float64: - _v1 := any(v1).([]float64) - _v2 := any(v2).([]float64) +// L1Distance calculates the L1 (Manhattan) distance between two vectors. +/* +func L1Distance[T types.RealNumbers](v1, v2 []T) (T, error) { + var sum T + for i := range v1 { + sum += math.Abs(v1[i] - v2[i]) + } + return sum, nil - diff := blas64.Vector{ - N: len(_v1), - Inc: 1, - Data: make([]float64, len(_v1)), +} +*/ + +// L1DistanceUnrolled calculates the L1 distance using loop unrolling for optimization. +// It processes 8 elements per iteration to reduce loop overhead and improve performance +// on large vectors. It also uses an inline 'abs' for potential speed gains. 
+func L1Distance[T types.RealNumbers](p, q []T) (T, error) { + var sum T + n := len(p) + i := 0 + + // BCE Hint + p = p[:n] + q = q[:n] + + // Helper function for inline absolute value. + // A good compiler might inline this automatically. + abs := func(x T) T { + if x < 0 { + return -x } + return x + } - for i := range _v1 { - diff.Data[i] = _v1[i] - _v2[i] - } - return T(blas64.Asum(diff)), nil - default: - return 0, moerr.NewInternalErrorNoCtx("L1Distance type not supported") + // Process the bulk of the data in chunks of 8. + for i <= n-8 { + sum += abs(p[i+0] - q[i+0]) + sum += abs(p[i+1] - q[i+1]) + sum += abs(p[i+2] - q[i+2]) + sum += abs(p[i+3] - q[i+3]) + sum += abs(p[i+4] - q[i+4]) + sum += abs(p[i+5] - q[i+5]) + sum += abs(p[i+6] - q[i+6]) + sum += abs(p[i+7] - q[i+7]) + i += 8 + } + + // Handle the remaining 0 to 7 elements. + for i < n { + sum += abs(p[i] - q[i]) + i++ } + + return sum, nil } -func InnerProduct[T types.RealNumbers](v1, v2 []T) (T, error) { - switch any(v1).(type) { - case []float32: - _v1 := blas32.Vector{N: len(v1), Inc: 1, Data: any(v1).([]float32)} - _v2 := blas32.Vector{N: len(v2), Inc: 1, Data: any(v2).([]float32)} +// InnerProduct calculates the inner product (dot product) of two vectors. +// This is a clear, readable, and idiomatic Go implementation. +/* +func InnerProduct[T types.RealNumbers](p, q []T) (T, error) { + var sum T + for i := range p { + sum += p[i] * q[i] + } - return T(-blas32.Dot(_v1, _v2)), nil + return -sum, nil +} +*/ + +// InnerProductUnrolled calculates the inner product using loop unrolling. +// This can significantly improve performance for large vectors by reducing +// loop overhead and enabling better CPU instruction scheduling. +func InnerProduct[T types.RealNumbers](p, q []T) (T, error) { + var sum T + n := len(p) + i := 0 + + // BCE Hint + p = p[:n] + q = q[:n] + + // Process the bulk of the data in chunks of 8. 
+ for i <= n-8 { + sum += p[i+0]*q[i+0] + + p[i+1]*q[i+1] + + p[i+2]*q[i+2] + + p[i+3]*q[i+3] + + p[i+4]*q[i+4] + + p[i+5]*q[i+5] + + p[i+6]*q[i+6] + + p[i+7]*q[i+7] + i += 8 + } - case []float64: - _v1 := blas64.Vector{N: len(v1), Inc: 1, Data: any(v1).([]float64)} - _v2 := blas64.Vector{N: len(v2), Inc: 1, Data: any(v2).([]float64)} - return T(-blas64.Dot(_v1, _v2)), nil - default: - return 0, moerr.NewInternalErrorNoCtx("InnerProduct type not supported") + // Handle the remaining 0 to 7 elements. + for i < n { + sum += p[i] * q[i] + i++ } + + return -sum, nil } +// CosineDistance calculates the cosine distance between two vectors using generics. +// +// Formula: +// Cosine Distance = 1 - Cosine Similarity +// Cosine Similarity = (v1 · v2) / (||v1|| * ||v2||) +// +// This implementation uses loop unrolling to optimize the calculation of the +// dot product (v1 · v2) and the squared L2 norms (||v1||², ||v2||²) in a single pass. +// This improves performance by reducing loop overhead and maximizing CPU cache efficiency. 
func CosineDistance[T types.RealNumbers](v1, v2 []T) (T, error) { - switch any(v1).(type) { - case []float32: - _v1 := blas32.Vector{N: len(v1), Inc: 1, Data: any(v1).([]float32)} - _v2 := blas32.Vector{N: len(v2), Inc: 1, Data: any(v2).([]float32)} - - mag1 := blas32.Nrm2(_v1) - mag2 := blas32.Nrm2(_v2) - if mag1 == 0 || mag2 == 0 { - return 0, moerr.NewInternalErrorNoCtx("cannot compute cosine similarity with zero vector") - } - score := blas32.Dot(_v1, _v2) / (mag1 * mag2) - return T(1 - score), nil - - case []float64: - _v1 := blas64.Vector{N: len(v1), Inc: 1, Data: any(v1).([]float64)} - _v2 := blas64.Vector{N: len(v2), Inc: 1, Data: any(v2).([]float64)} - mag1 := blas64.Nrm2(_v1) - mag2 := blas64.Nrm2(_v2) - if mag1 == 0 || mag2 == 0 { - return 0, moerr.NewInternalErrorNoCtx("cannot compute cosine similarity with zero vector") - } - score := blas64.Dot(_v1, _v2) / (mag1 * mag2) - return T(1 - score), nil - default: - return 0, moerr.NewInternalErrorNoCtx("CosineDistance type not supported") + if len(v1) == 0 { + // The distance is undefined for empty vectors. Returning 0 and no error is a common convention. + return 0, nil + } + + var ( + dotProduct T + normV1Sq T + normV2Sq T + ) + + n := len(v1) + i := 0 + + // BCE Hint + v1 = v1[:n] + v2 = v2[:n] + + // Process the bulk of the data in chunks of 4. + // Unrolling by 4 provides a good balance between performance gain and code readability. + // We calculate all three components in one loop to improve data locality. + for i <= n-4 { + dotProduct += v1[i+0]*v2[i+0] + v1[i+1]*v2[i+1] + v1[i+2]*v2[i+2] + v1[i+3]*v2[i+3] + normV1Sq += v1[i+0]*v1[i+0] + v1[i+1]*v1[i+1] + v1[i+2]*v1[i+2] + v1[i+3]*v1[i+3] + normV2Sq += v2[i+0]*v2[i+0] + v2[i+1]*v2[i+1] + v2[i+2]*v2[i+2] + v2[i+3]*v2[i+3] + i += 4 + } + + // Handle the remaining 0 to 3 elements. 
+ for i < n { + dotProduct += v1[i] * v2[i] + normV1Sq += v1[i] * v1[i] + normV2Sq += v2[i] * v2[i] + i++ } + // The denominator is the product of the L2 norms (Euclidean lengths). + // We must cast to float64 to use the standard library's math.Sqrt. + denominator := math.Sqrt(float64(normV1Sq)) * math.Sqrt(float64(normV2Sq)) + + // Handle the edge case of a zero-magnitude vector. If the denominator is zero, + // the cosine similarity is undefined. A distance of 1.0 is a common convention, + // implying the vectors are maximally dissimilar (orthogonal). + if denominator == 0 { + // This can happen if one or both vectors are all zeros. + return 1.0, nil + } + + // Calculate cosine similarity. + similarity := float64(dotProduct) / denominator + + // handle precision issues. Clamp the cosine similarity to the range [-1, 1]. + if similarity > 1.0 { + similarity = 1.0 + } else if similarity < -1.0 { + similarity = -1.0 + } + + // Cosine distance is 1 minus the similarity. + // The result is cast back to the original type T. + distance := 1.0 - similarity + + return T(distance), nil } // SphericalDistance is used for InnerProduct and CosineDistance in Spherical Kmeans. @@ -159,24 +272,36 @@ func CosineDistance[T types.RealNumbers](v1, v2 []T) (T, error) { // angular distance between the two points, scaled by pi. // Refs: // https://en.wikipedia.org/wiki/Great-circle_distance#Vector_version -func SphericalDistance[T types.RealNumbers](v1, v2 []T) (T, error) { +func SphericalDistance[T types.RealNumbers](p, q []T) (T, error) { // Compute the dot product of the two vectors. // The dot product of two vectors is a measure of their similarity, // and it can be used to calculate the angle between them.
- dp := float64(0) - - switch any(v1).(type) { - case []float32: - _v1 := blas32.Vector{N: len(v1), Inc: 1, Data: any(v1).([]float32)} - _v2 := blas32.Vector{N: len(v2), Inc: 1, Data: any(v2).([]float32)} - dp = float64(blas32.Dot(_v1, _v2)) - - case []float64: - _v1 := blas64.Vector{N: len(v1), Inc: 1, Data: any(v1).([]float64)} - _v2 := blas64.Vector{N: len(v2), Inc: 1, Data: any(v2).([]float64)} - dp = blas64.Dot(_v1, _v2) - default: - return 0, moerr.NewInternalErrorNoCtx("SphericalDistance type not supported") + dp := T(0) + + n := len(p) + i := 0 + + // BCE Hint + p = p[:n] + q = q[:n] + + // Process the bulk of the data in chunks of 8. + for i <= n-8 { + dp += p[i+0]*q[i+0] + + p[i+1]*q[i+1] + + p[i+2]*q[i+2] + + p[i+3]*q[i+3] + + p[i+4]*q[i+4] + + p[i+5]*q[i+5] + + p[i+6]*q[i+6] + + p[i+7]*q[i+7] + i += 8 + } + + // Handle the remaining 0 to 7 elements. + for i < n { + dp += p[i] * q[i] + i++ } // Prevent NaN with acos with loss of precision. @@ -186,7 +311,7 @@ func SphericalDistance[T types.RealNumbers](v1, v2 []T) (T, error) { dp = -1.0 } - theta := math.Acos(dp) + theta := math.Acos(float64(dp)) //To scale the result to the range [0, 1], we divide by Pi. 
return T(theta / math.Pi), nil diff --git a/pkg/vectorindex/metric/distance_func_test.go b/pkg/vectorindex/metric/distance_func_test.go index e7b1c6d76078d..189b2991a43db 100644 --- a/pkg/vectorindex/metric/distance_func_test.go +++ b/pkg/vectorindex/metric/distance_func_test.go @@ -143,13 +143,15 @@ func Test_ZeroVector(t *testing.T) { v1 := []float64{0, 0, 0} v2 := []float64{0, 0, 0} - _, err := CosineDistance[float64](v1, v2) - require.NotNil(t, err) + dist64, err := CosineDistance[float64](v1, v2) + require.NoError(t, err) + require.Equal(t, dist64, float64(1)) v1f32 := []float32{0, 0, 0} v2f32 := []float32{0, 0, 0} - _, err = CosineDistance[float32](v1f32, v2f32) - require.NotNil(t, err) + dist32, err := CosineDistance[float32](v1f32, v2f32) + require.NoError(t, err) + require.Equal(t, dist32, float32(1)) } @@ -193,7 +195,7 @@ func Test_L2Distance(t *testing.T) { v1: []float64{4, 1}, v2: []float64{1, 4}, }, - want: 4.242640687119286, + want: 4.242640687119285, }, { name: "Test 3.c", diff --git a/pkg/vectorindex/metric/gpu.go b/pkg/vectorindex/metric/gpu.go new file mode 100644 index 0000000000000..a061c563ad1cd --- /dev/null +++ b/pkg/vectorindex/metric/gpu.go @@ -0,0 +1,32 @@ +//go:build gpu + +// Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package metric + +import ( + cuvs "github.com/rapidsai/cuvs/go" +) + +var ( + MetricTypeToCuvsMetric = map[MetricType]cuvs.Distance{ + Metric_L2sqDistance: cuvs.DistanceSQEuclidean, + Metric_L2Distance: cuvs.DistanceSQEuclidean, + Metric_InnerProduct: cuvs.DistanceInnerProduct, + Metric_CosineDistance: cuvs.DistanceCosine, + Metric_L1Distance: cuvs.DistanceL1, + } +) + diff --git a/pkg/vectorindex/metric/types.go b/pkg/vectorindex/metric/types.go index 7e1576a24498d..86df29c3254cb 100644 --- a/pkg/vectorindex/metric/types.go +++ b/pkg/vectorindex/metric/types.go @@ -83,6 +83,13 @@ var ( */ } + MetricTypeToUsearchMetric = map[MetricType]usearch.Metric{ + Metric_L2Distance: usearch.L2sq, + Metric_L2sqDistance: usearch.L2sq, + Metric_InnerProduct: usearch.InnerProduct, + Metric_CosineDistance: usearch.Cosine, + } + MetricTypeToDistFuncName = map[MetricType]string{ Metric_L2Distance: DistFn_L2Distance, Metric_L2sqDistance: DistFn_L2sqDistance, diff --git a/pkg/vectorindex/types.go b/pkg/vectorindex/types.go index 05df94e64feb0..e2d60d92aef10 100644 --- a/pkg/vectorindex/types.go +++ b/pkg/vectorindex/types.go @@ -113,6 +113,7 @@ type RuntimeConfig struct { Probe uint OrigFuncName string BackgroundQueries []*plan.Query + NThreads uint // Brute Force Index } type VectorIndexCdc[T types.RealNumbers] struct { diff --git a/pkg/vectorize/moarray/external.go b/pkg/vectorize/moarray/external.go index bd50cc078e57e..4103dfba05d1e 100644 --- a/pkg/vectorize/moarray/external.go +++ b/pkg/vectorize/moarray/external.go @@ -192,12 +192,8 @@ func CosineDistance[T types.RealNumbers](v1, v2 []T) (float64, error) { return 0, moerr.NewArrayInvalidOpNoCtx(len(v1), len(v2)) } - cosine, err := CosineSimilarity[T](v1, v2) - if err != nil { - return 0, err - } - - return float64(1 - cosine), nil + ret, err := metric.CosineDistance[T](v1, v2) + return float64(ret), err } func CosineSimilarity[T types.RealNumbers](v1, v2 []T) (float64, error) { diff --git 
a/test/distributed/cases/pessimistic_transaction/vector/vector_hnsw.result b/test/distributed/cases/pessimistic_transaction/vector/vector_hnsw.result index ecbf27d7cc0e8..a5aa6e38f3c3d 100644 --- a/test/distributed/cases/pessimistic_transaction/vector/vector_hnsw.result +++ b/test/distributed/cases/pessimistic_transaction/vector/vector_hnsw.result @@ -305,11 +305,11 @@ a b c orderbyfn 9776 [10, 3, 8, 5, 48, 26, 5, 16, 17, 0, 0, 2, 132, 53, 1, 16, 112, 6, 0, 0, 7, 2, 1, 48, 48, 15, 18, 31, 3, 0, 0, 9, 6, 10, 19, 27, 50, 46, 17, 9, 18, 1, 4, 48, 132, 23, 3, 5, 132, 9, 4, 3, 11, 0, 2, 46, 84, 12, 10, 10, 1, 0, 12, 76, 26, 22, 16, 26, 35, 15, 3, 16, 15, 1, 51, 132, 125, 8, 1, 2, 132, 51, 67, 91, 8, 0, 0, 30, 126, 39, 32, 38, 4, 0, 1, 12, 24, 2, 2, 2, 4, 7, 2, 19, 93, 19, 70, 92, 2, 3, 1, 21, 36, 58, 132, 94, 0, 0, 0, 0, 21, 25, 57, 48, 1, 0, 0, 1] 3 0.26629316806793213 select *, l2_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") as orderbyfn from vector_cos_01 order by cosine_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") ASC LIMIT 2; a b c orderbyfn -9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 
32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 127.4205551147461 -9776 [10, 3, 8, 5, 48, 26, 5, 16, 17, 0, 0, 2, 132, 53, 1, 16, 112, 6, 0, 0, 7, 2, 1, 48, 48, 15, 18, 31, 3, 0, 0, 9, 6, 10, 19, 27, 50, 46, 17, 9, 18, 1, 4, 48, 132, 23, 3, 5, 132, 9, 4, 3, 11, 0, 2, 46, 84, 12, 10, 10, 1, 0, 12, 76, 26, 22, 16, 26, 35, 15, 3, 16, 15, 1, 51, 132, 125, 8, 1, 2, 132, 51, 67, 91, 8, 0, 0, 30, 126, 39, 32, 38, 4, 0, 1, 12, 24, 2, 2, 2, 4, 7, 2, 19, 93, 19, 70, 92, 2, 3, 1, 21, 36, 58, 132, 94, 0, 0, 0, 0, 21, 25, 57, 48, 1, 0, 0, 1] 3 364.642333984375 +9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 127.42056274414062 +9776 [10, 3, 8, 5, 48, 26, 5, 16, 17, 0, 0, 2, 132, 53, 1, 16, 112, 6, 0, 0, 7, 2, 1, 48, 48, 15, 18, 31, 3, 0, 0, 9, 6, 10, 19, 27, 50, 46, 17, 9, 18, 1, 4, 48, 132, 23, 3, 5, 132, 9, 4, 3, 11, 0, 2, 46, 84, 12, 10, 10, 1, 0, 12, 76, 26, 22, 16, 26, 35, 15, 3, 16, 15, 1, 51, 132, 125, 8, 1, 2, 132, 51, 67, 91, 8, 0, 0, 30, 126, 39, 32, 38, 4, 0, 1, 12, 24, 2, 2, 2, 4, 7, 2, 19, 93, 19, 70, 92, 2, 3, 1, 21, 36, 58, 132, 94, 0, 0, 0, 0, 21, 25, 57, 48, 1, 0, 0, 1] 3 364.6423034667969 select *, cosine_distance(b, "[2, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 
9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") as orderbyfn from vector_cos_01 order by cosine_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") ASC LIMIT 2; a b c orderbyfn -9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 0.031903373234243526 -9776 [10, 3, 8, 5, 48, 26, 5, 16, 17, 0, 0, 2, 132, 53, 1, 16, 112, 6, 0, 0, 7, 2, 1, 48, 48, 15, 18, 31, 3, 0, 0, 9, 6, 10, 19, 27, 50, 46, 17, 9, 18, 1, 4, 48, 132, 23, 3, 5, 132, 9, 4, 3, 11, 0, 2, 46, 84, 12, 10, 10, 1, 0, 12, 76, 26, 22, 16, 26, 35, 15, 3, 16, 15, 1, 51, 132, 125, 8, 1, 2, 132, 51, 67, 91, 8, 0, 0, 30, 126, 39, 32, 38, 4, 0, 1, 12, 24, 2, 2, 2, 4, 7, 2, 19, 93, 19, 70, 92, 2, 3, 1, 21, 36, 58, 132, 94, 0, 0, 0, 0, 21, 25, 57, 48, 1, 0, 0, 1] 3 0.26625744260882 +9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 
41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 0.031903598457574844 +9776 [10, 3, 8, 5, 48, 26, 5, 16, 17, 0, 0, 2, 132, 53, 1, 16, 112, 6, 0, 0, 7, 2, 1, 48, 48, 15, 18, 31, 3, 0, 0, 9, 6, 10, 19, 27, 50, 46, 17, 9, 18, 1, 4, 48, 132, 23, 3, 5, 132, 9, 4, 3, 11, 0, 2, 46, 84, 12, 10, 10, 1, 0, 12, 76, 26, 22, 16, 26, 35, 15, 3, 16, 15, 1, 51, 132, 125, 8, 1, 2, 132, 51, 67, 91, 8, 0, 0, 30, 126, 39, 32, 38, 4, 0, 1, 12, 24, 2, 2, 2, 4, 7, 2, 19, 93, 19, 70, 92, 2, 3, 1, 21, 36, 58, 132, 94, 0, 0, 0, 0, 21, 25, 57, 48, 1, 0, 0, 1] 3 0.2662576138973236 drop table vector_cos_01; SET experimental_hnsw_index = 0; diff --git a/test/distributed/cases/vector/vector_index.result b/test/distributed/cases/vector/vector_index.result index f9abf16a9c969..5fedce5b6b3b9 100644 --- a/test/distributed/cases/vector/vector_index.result +++ b/test/distributed/cases/vector/vector_index.result @@ -296,7 +296,7 @@ a b c orderbyfn 9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 0.03196156024932861 select *, l2_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 
27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") as orderbyfn from vector_cos_01 order by cosine_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") ASC LIMIT 2; a b c orderbyfn -9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 127.4205551147461 +9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 127.42056274414062 select *, cosine_distance(b, "[2, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 
118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") as orderbyfn from vector_cos_01 order by cosine_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") ASC LIMIT 2; a b c orderbyfn 9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 0.031903373234243526 diff --git a/test/distributed/cases/vector/vector_ivf_mode.result b/test/distributed/cases/vector/vector_ivf_mode.result index 46e06670dfd52..b5d47c996739d 100644 --- a/test/distributed/cases/vector/vector_ivf_mode.result +++ b/test/distributed/cases/vector/vector_ivf_mode.result @@ -28,11 +28,11 @@ id2 greeting message 1.1550325221482527 id3 vector search test 1.1828778308075465 WITH q AS (SELECT id, text, l2_distance(vec, '[0.1,-0.2,0.3,0.4,-0.1,0.2,0.0,0.5]') AS dist FROM mini_vector_data) SELECT * FROM q ORDER BY dist LIMIT 5 by rank with option 'mode=force'; id text dist -id8 semantic item 0.7760798335075378 +id8 semantic item 0.7760798931121826 id1 hello world 
1.0551303625106812 id10 additional entry 1.071820855140686 -id2 greeting message 1.1550323963165283 -id3 vector search test 1.1828778982162476 +id2 greeting message 1.1550325155258179 +id3 vector search test 1.182877779006958 CREATE TABLE mini_embed_data (id VARCHAR(64) NOT NULL, embedding VECF32(8) DEFAULT NULL, content TEXT DEFAULT NULL, description VARCHAR(255) DEFAULT NULL, file_id VARCHAR(64) DEFAULT NULL, score FLOAT DEFAULT NULL, disabled TINYINT DEFAULT 0, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (id), KEY idx_file_id (file_id)); CREATE INDEX idx_vec_embedding USING ivfflat ON mini_embed_data (embedding) LISTS = 16 OP_TYPE 'vector_cosine_ops'; INSERT INTO mini_embed_data (id, embedding, content, description, file_id, score, disabled) VALUES('id01','[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]','ai is artificial intelligence','what is ai?','file01',0.0,0); @@ -57,8 +57,8 @@ id05 an ai assistant by openai what is chatgpt? [0.12, 0.32, 0.52, 0.72 SELECT mini_embed_data.id, mini_embed_data.content, mini_embed_data.description, mini_embed_data.embedding, mini_embed_data.file_id, mini_embed_data.disabled FROM mini_embed_data WHERE mini_embed_data.file_id IN ('file01','file02') AND mini_embed_data.embedding IS NOT NULL AND (mini_embed_data.disabled IS NULL OR mini_embed_data.disabled = false) ORDER BY cosine_distance(mini_embed_data.embedding,"[0.12,0.55,0.33,0.88,0.22,0.44,0.66,0.11]") DESC LIMIT 10 by rank with option 'mode=force'; id content description embedding file_id disabled id02 sql is structured query language what is sql? [0.01, 0.03, 0.05, 0.07, 0.09, 0.11, 0.13, 0.15] file01 0 -id04 mysql is a relational database what is mysql? [0.11, 0.22, 0.33, 0.44, 0.55, 0.66, 0.77, 0.88] file02 0 id01 ai is artificial intelligence what is ai? [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] file01 0 +id04 mysql is a relational database what is mysql? 
[0.11, 0.22, 0.33, 0.44, 0.55, 0.66, 0.77, 0.88] file02 0 id03 it stores high dimensional vectors what is vector db? [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2] file02 0 id05 an ai assistant by openai what is chatgpt? [0.12, 0.32, 0.52, 0.72, 0.11, 0.31, 0.51, 0.71] file01 0 WITH q AS (SELECT id, text, l2_distance(vec, '[0.1,-0.2,0.3,0.4,-0.1,0.2,0.0,0.5]') AS dist FROM mini_vector_data WHERE id LIKE 'id%') SELECT * FROM q ORDER BY dist LIMIT 3 by rank with option 'mode=pre'; @@ -83,7 +83,7 @@ id1 hello world 1.0551303640156267 id10 additional entry 1.0718208877650799 WITH q AS (SELECT id, text, l2_distance(vec, '[0.1,-0.2,0.3,0.4,-0.1,0.2,0.0,0.5]') AS dist FROM mini_vector_data WHERE id LIKE 'id%') SELECT * FROM q ORDER BY dist LIMIT 3 by rank with option 'mode=force'; id text dist -id8 semantic item 0.7760798335075378 +id8 semantic item 0.7760798931121826 id1 hello world 1.0551303625106812 id10 additional entry 1.071820855140686 SELECT mini_embed_data.id, mini_embed_data.content FROM mini_embed_data WHERE mini_embed_data.file_id IN ('file01','file02') AND mini_embed_data.embedding IS NOT NULL AND mini_embed_data.disabled = 0 AND mini_embed_data.score = 0.0 ORDER BY cosine_distance(mini_embed_data.embedding,"[0.12,0.55,0.33,0.88,0.22,0.44,0.66,0.11]") DESC LIMIT 3 by rank with option 'mode=pre'; @@ -113,9 +113,9 @@ id1 hello world 1.0551303640156267 id10 additional entry 1.0718208877650799 WITH q AS (SELECT id, text, cosine_distance(vec, '[0.1,-0.2,0.3,0.4,-0.1,0.2,0.0,0.5]') AS dist FROM mini_vector_data) SELECT * FROM q ORDER BY dist DESC LIMIT 3 by rank with option 'mode=pre'; id text dist -id4 example data 1.1711897501680237 -id7 random note 0.9494411010292958 -id6 short text 0.8904764598385896 +id4 example data 1.171189785003662 +id7 random note 0.9494410753250122 +id6 short text 0.8904764652252197 WITH q AS (SELECT id, text, l2_distance(vec, '[0.1,-0.2,0.3,0.4,-0.1,0.2,0.0,0.5]') AS dist FROM mini_vector_data WHERE id LIKE 'id%') SELECT * FROM q ORDER BY dist 
LIMIT 1 by rank with option 'mode=pre'; id text dist id8 semantic item 0.7760798852372132 @@ -359,20 +359,21 @@ id8 semantic item 0.7760798852372132 id1 hello world 1.0551303640156267 id2 greeting message 1.1550325221482527 id7 random note 1.1910499415172193 -(SELECT id, text AS content +(SELECT id, text AS content, l2_distance(vec, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') AS dist FROM mini_vector_data ORDER BY id, l2_distance(vec, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') LIMIT 2 by rank with option 'mode=pre') UNION -(SELECT id, content +(SELECT id, content, cosine_distance(embedding, '[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2]') AS dist FROM mini_embed_data -ORDER BY cosine_distance(embedding, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') +ORDER BY cosine_distance(embedding, '[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2]') LIMIT 2 by rank with option 'mode=pre') -LIMIT 3; -id content -id1 hello world -id10 additional entry -id04 mysql is a relational database +LIMIT 4; +id content dist +id1 hello world 1.5163443088531494 +id10 additional entry 1.4459599256515503 +id03 it stores high dimensional vectors 0.0 +id05 an ai assistant by openai 0.2728450298309326 (SELECT id, category, l2_distance(vec, '[0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]') AS dist FROM vec_with_multi_idx WHERE category = 'A' AND status = 1 @@ -437,19 +438,19 @@ id10 additional entry 1.0718208877650799 (SELECT id, content FROM mini_embed_data WHERE file_id = 'file01' -ORDER BY cosine_distance(embedding, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') +ORDER BY cosine_distance(embedding, '[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2]') LIMIT 1 by rank with option 'mode=pre') UNION ALL (SELECT id, content FROM mini_embed_data WHERE file_id = 'file02' -ORDER BY cosine_distance(embedding, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') +ORDER BY cosine_distance(embedding, '[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2]') LIMIT 2 by rank with option 'mode=pre') LIMIT 3; id content -id04 mysql is a relational database id03 it stores high dimensional vectors -id01 ai is artificial intelligence 
+id04 mysql is a relational database +id05 an ai assistant by openai (SELECT id, text, l2_distance(vec, '[0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5]') AS dist FROM mini_vector_data WHERE id LIKE 'id1%' diff --git a/test/distributed/cases/vector/vector_ivf_mode.sql b/test/distributed/cases/vector/vector_ivf_mode.sql index a7af074467d5e..3c13489907e98 100644 --- a/test/distributed/cases/vector/vector_ivf_mode.sql +++ b/test/distributed/cases/vector/vector_ivf_mode.sql @@ -283,16 +283,16 @@ UNION ORDER BY dist LIMIT 4; -- Test Case: UNION with mode=pre on different tables (mini_vector_data and mini_embed_data) -(SELECT id, text AS content +(SELECT id, text AS content, l2_distance(vec, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') AS dist FROM mini_vector_data ORDER BY id, l2_distance(vec, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') LIMIT 2 by rank with option 'mode=pre') UNION -(SELECT id, content +(SELECT id, content, cosine_distance(embedding, '[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2]') AS dist FROM mini_embed_data - ORDER BY cosine_distance(embedding, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') + ORDER BY cosine_distance(embedding, '[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2]') LIMIT 2 by rank with option 'mode=pre') -LIMIT 3; +LIMIT 4; -- Test Case: UNION with mode=pre and complex WHERE conditions (SELECT id, category, l2_distance(vec, '[0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]') AS dist @@ -352,13 +352,13 @@ ORDER BY dist LIMIT 3; (SELECT id, content FROM mini_embed_data WHERE file_id = 'file01' - ORDER BY cosine_distance(embedding, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') + ORDER BY cosine_distance(embedding, '[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2]') LIMIT 1 by rank with option 'mode=pre') UNION ALL (SELECT id, content FROM mini_embed_data WHERE file_id = 'file02' - ORDER BY cosine_distance(embedding, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]') + ORDER BY cosine_distance(embedding, '[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2]') LIMIT 2 by rank with option 'mode=pre') LIMIT 3; diff --git a/thirdparties/Makefile b/thirdparties/Makefile index 
b25a4f48b9bd5..786083a2ef305 100644 --- a/thirdparties/Makefile +++ b/thirdparties/Makefile @@ -15,8 +15,8 @@ PWD=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) UNAME_S=$(shell uname -s | tr A-Z a-z) UNAME_M=$(shell uname -m) -USEARCH_DIR=USearch-2.21.1 -USEARCH_TAR=usearch-2.21.1.tar.gz +USEARCH_DIR=usearch-2.21.1-hotfix-20251212 +USEARCH_TAR=usearch-2.21.1-hotfix-20251212.tar.gz STRINGZILLA_DIR=StringZilla-4.2.1 STRINGZILLA_TAR=$(STRINGZILLA_DIR).tar.gz SIMSIMD_DIR=SimSIMD-6.5.3 @@ -59,13 +59,13 @@ ifeq ($(UNAME_S),linux) ifeq ($(MUSL),) USEARCH_CMAKE_FLAG += -DUSEARCH_USE_OPENMP=1 endif - AVX512FP16 := $(shell lscpu | grep avx512fp16) + AVX512FP16 := $(shell lscpu | grep -e avx512fp16 -e avx512_bf16) endif ifeq ($(UNAME_M),x86_64) ifneq ($(AVX512FP16),) ifeq ($(MUSL),) - USEARCH_CMAKE_FLAG += -DUSEARCH_USE_SIMSIMD=1 + USEARCH_CMAKE_FLAG += -DUSEARCH_USE_SIMSIMD=1 -DUSEARCH_DEFINED_CLANG=1 endif endif endif diff --git a/thirdparties/usearch-2.21.1-hotfix-20251212.tar.gz b/thirdparties/usearch-2.21.1-hotfix-20251212.tar.gz new file mode 100644 index 0000000000000..38ac7d13c9eeb Binary files /dev/null and b/thirdparties/usearch-2.21.1-hotfix-20251212.tar.gz differ diff --git a/thirdparties/usearch-2.21.1.tar.gz b/thirdparties/usearch-2.21.1.tar.gz deleted file mode 100644 index 3e70aaf663688..0000000000000 Binary files a/thirdparties/usearch-2.21.1.tar.gz and /dev/null differ