93 changes: 93 additions & 0 deletions SEARCH_SCORING.md
@@ -0,0 +1,93 @@
# Search Scoring in Lil-RAG

## Overview

Lil-RAG uses **cosine distance** between query embeddings and chunk embeddings to score search results. This document explains how the search scoring system works.

## Implementation Details

### Core Algorithm

1. **Embedding Calculation**: The query text is converted into an embedding vector by the configured embedding model.
2. **Distance Calculation**: For each indexed chunk, the system computes the cosine distance between the query embedding and the chunk's embedding using SQLite's `vec_distance_cosine()` function.
3. **Score Conversion**: The distance is converted to a score: `score = 1.0 - distance`.

### SQL Query

The search uses this SQL query to find and score results:

```sql
SELECT
    c.document_id,
    c.chunk_text_compressed,
    c.chunk_index,
    c.page_number,
    c.chunk_type,
    d.original_text_compressed,
    d.file_path,
    d.source_path,
    vec_distance_cosine(e.embedding, ?) AS distance
FROM chunks c
JOIN documents d ON c.document_id = d.id
JOIN embeddings e ON c.chunk_id = e.chunk_id
ORDER BY distance
LIMIT ?
```

Key points:
- **`e.embedding`** contains the chunk's embedding vector
- **`?`** is the query embedding vector
- **`vec_distance_cosine()`** calculates cosine distance between the two vectors
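How the `?` placeholder for the embedding is bound depends on the driver: sqlite-vec accepts vectors as compact BLOBs of little-endian `float32` values. A hedged sketch of that encoding — the sqlite-vec Go bindings ship their own serialization helper, so this stand-in is illustrative only:

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// serializeFloat32 packs a vector into the little-endian float32 BLOB
// format that sqlite-vec accepts as a vector parameter; it stands in
// for the helper provided by the sqlite-vec Go bindings.
func serializeFloat32(v []float32) []byte {
	buf := new(bytes.Buffer)
	// binary.Write on a bytes.Buffer cannot fail for fixed-size data.
	_ = binary.Write(buf, binary.LittleEndian, v)
	return buf.Bytes()
}

func main() {
	queryEmbedding := []float32{1, 0, 0}
	blob := serializeFloat32(queryEmbedding)
	fmt.Println(len(blob)) // 3 float32 values × 4 bytes each → 12

	// The blob would then be passed for the embedding placeholder:
	//   rows, err := db.QueryContext(ctx, query, blob, limit)
}
```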

### Score Interpretation

| Distance | Score | Meaning |
|----------|-------|---------|
| 0.0 | 1.0 | Identical embeddings (perfect match) |
| 1.0 | 0.0 | Orthogonal embeddings (no similarity) |
| 2.0 | -1.0 | Opposite embeddings (maximum dissimilarity) |

- **Higher scores** indicate **better matches**
- **Scores close to 1.0** indicate very similar content
- **Scores around 0.0** indicate neutral/unrelated content
- **Negative scores** indicate semantically opposite content
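The table is just the linear conversion `score = 1.0 - distance` evaluated at the extremes of the cosine-distance range:

```go
package main

import "fmt"

func main() {
	// The three reference points from the table above.
	for _, distance := range []float64{0.0, 1.0, 2.0} {
		fmt.Printf("distance=%.1f → score=%+.1f\n", distance, 1.0-distance)
	}
	// → distance=0.0 → score=+1.0
	// → distance=1.0 → score=+0.0
	// → distance=2.0 → score=-1.0
}
```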

### Chunk-Level vs Document-Level

The search system operates at the **chunk level**, not the document level:

- Each document is split into chunks during indexing
- Each chunk gets its own embedding
- Search compares the query against **individual chunk embeddings**
- This enables more precise matching of specific parts of documents

## Example

```go
// Query: "machine learning algorithms"
// Query embedding: [0.8, 0.2, 0.1, ...]

// Document chunks:
// Chunk 1: "Machine learning is a subset of AI" → embedding: [0.9, 0.1, 0.05, ...]
// Chunk 2: "The weather is sunny today" → embedding: [0.1, 0.8, 0.7, ...]

// Results (illustrative values):
// Chunk 1: distance=0.05, score=0.95 (very similar)
// Chunk 2: distance=1.2, score=-0.2 (dissimilar)
```

## Validation

The search scoring behavior is validated in the test `TestSQLiteStorage_SearchScoring_ChunkEmbeddingDistance`, which verifies:

- Identical embeddings get score ≈ 1.0
- Close embeddings get high scores (0.9+)
- Orthogonal embeddings get score ≈ 0.0
- Opposite embeddings get negative scores
- Results are sorted by score in descending order

## Performance

- The system uses SQLite's vector extension for efficient cosine distance calculations
- Results are pre-sorted by the database, minimizing post-processing
- The `LIMIT` clause ensures only the top results are processed
3 changes: 0 additions & 3 deletions go.mod
@@ -7,7 +7,6 @@ toolchain go1.24.6
require (
github.com/asg017/sqlite-vec-go-bindings v0.1.6
github.com/dslipak/pdf v0.0.2
github.com/gen2brain/go-fitz v1.24.15
github.com/google/uuid v1.6.0
github.com/mattn/go-sqlite3 v1.14.32
github.com/nguyenthenguyen/docx v0.0.0-20230621112118-9c8e795a11db
@@ -22,9 +21,7 @@ require (
require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/ebitengine/purego v0.8.4 // indirect
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect
github.com/jupiterrider/ffi v0.5.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
6 changes: 0 additions & 6 deletions go.sum
@@ -9,18 +9,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dslipak/pdf v0.0.2 h1:djAvcM5neg9Ush+zR6QXB+VMJzR6TdnX766HPIg1JmI=
github.com/dslipak/pdf v0.0.2/go.mod h1:2L3SnkI9cQwnAS9gfPz2iUoLC0rUZwbucpbKi5R1mUo=
github.com/ebitengine/purego v0.8.4 h1:CF7LEKg5FFOsASUj0+QwaXf8Ht6TlFxg09+S9wz0omw=
github.com/ebitengine/purego v0.8.4/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/gen2brain/go-fitz v1.24.15 h1:sJNB1MOWkqnzzENPHggFpgxTwW0+S5WF/rM5wUBpJWo=
github.com/gen2brain/go-fitz v1.24.15/go.mod h1:SftkiVbTHqF141DuiLwBBM65zP7ig6AVDQpf2WlHamo=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrRoT6yV5+wkrOpcszoIsO4+4ds248=
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk=
github.com/jupiterrider/ffi v0.5.0 h1:j2nSgpabbV1JOwgP4Kn449sJUHq3cVLAZVBoOYn44V8=
github.com/jupiterrider/ffi v0.5.0/go.mod h1:x7xdNKo8h0AmLuXfswDUBxUsd2OqUP4ekC8sCnsmbvo=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
8 changes: 8 additions & 0 deletions pkg/lilrag/storage.go
@@ -483,6 +483,9 @@ func (s *SQLiteStorage) SearchWithOptions(ctx context.Context, embedding []float
}

// Search through chunks and return best matches
// This query uses vec_distance_cosine() to calculate the cosine distance between
// the query embedding and each chunk's embedding (e.embedding), ensuring that
// search scoring is based on the semantic similarity between query and chunk content.
query := `
SELECT
c.document_id,
@@ -534,6 +537,11 @@ func (s *SQLiteStorage) SearchWithOptions(ctx context.Context, embedding []float
return nil, fmt.Errorf("failed to scan row: %w", err)
}

// Calculate score from cosine distance: score = 1.0 - distance
// Cosine distance ranges from 0 (identical) to 2 (opposite), so:
// - Distance 0 (identical vectors) → Score 1.0 (highest)
// - Distance 1 (orthogonal vectors) → Score 0.0 (neutral)
// - Distance 2 (opposite vectors) → Score -1.0 (lowest)
score := 1.0 - distance

// Handle deduplication based on mode
133 changes: 133 additions & 0 deletions pkg/lilrag/storage_test.go
@@ -1095,3 +1095,136 @@ func TestSQLiteStorage_DocumentReindexing(t *testing.T) {
t.Error("Final content not found, metadata re-indexing failed")
}
}

// TestSQLiteStorage_SearchScoring_ChunkEmbeddingDistance validates that search scoring
// correctly uses the distance between query embedding and chunk embeddings.
// This test specifically verifies the core search scoring requirement.
func TestSQLiteStorage_SearchScoring_ChunkEmbeddingDistance(t *testing.T) {
storage, tempDir := setupTestStorage(t)
defer os.RemoveAll(tempDir)

err := storage.Initialize()
if err != nil {
if strings.Contains(err.Error(), "sqlite-vec extension not available") {
t.Skip("Skipping test: sqlite-vec extension not available")
}
t.Fatalf("Failed to initialize storage: %v", err)
}
defer storage.Close()

ctx := context.Background()

// Index documents with known embeddings for precise distance calculations
documents := []struct {
id string
text string
embedding []float32
}{
{
id: "exact_match",
text: "Exact match document",
embedding: []float32{1.0, 0.0, 0.0}, // Unit vector
},
{
id: "close_match",
text: "Close match document",
embedding: []float32{0.9, 0.1, 0.0}, // Close to exact_match
},
{
id: "orthogonal",
text: "Orthogonal document",
embedding: []float32{0.0, 1.0, 0.0}, // Orthogonal to exact_match
},
{
id: "opposite",
text: "Opposite document",
embedding: []float32{-1.0, 0.0, 0.0}, // Opposite to exact_match
},
}

for _, doc := range documents {
err := storage.Index(ctx, doc.id, doc.text, doc.embedding)
if err != nil {
t.Fatalf("Failed to index document %s: %v", doc.id, err)
}
}

// Test with query embedding identical to exact_match
queryEmbedding := []float32{1.0, 0.0, 0.0}

results, err := storage.SearchWithOptions(ctx, queryEmbedding, 10, false)
if err != nil {
t.Fatalf("Search failed: %v", err)
}

if len(results) < 4 {
t.Fatalf("Expected at least 4 results, got %d", len(results))
}

// Verify that exact_match gets the highest score (distance ≈ 0, score ≈ 1.0)
if results[0].ID != "exact_match" {
t.Errorf("Expected 'exact_match' to be top result, got '%s'", results[0].ID)
}

// Score should be very close to 1.0 for identical embeddings
if results[0].Score < 0.999 {
t.Errorf("Expected score ≈ 1.0 for identical embeddings, got %.6f", results[0].Score)
}

// Verify that close_match gets second highest score
if results[1].ID != "close_match" {
t.Errorf("Expected 'close_match' to be second result, got '%s'", results[1].ID)
}

// Score should be high but less than exact match
if results[1].Score >= results[0].Score {
t.Errorf("Expected close_match score (%.6f) < exact_match score (%.6f)",
results[1].Score, results[0].Score)
}
if results[1].Score < 0.9 {
t.Errorf("Expected close_match score > 0.9, got %.6f", results[1].Score)
}

// Verify that orthogonal gets lower score (cosine similarity ≈ 0, score ≈ 0)
orthogonalResult := findResultByID(results, "orthogonal")
if orthogonalResult == nil {
t.Fatal("Could not find orthogonal result")
}
if orthogonalResult.Score > 0.1 {
t.Errorf("Expected orthogonal score ≈ 0, got %.6f", orthogonalResult.Score)
}

// Verify that opposite gets the lowest score (cosine similarity ≈ -1, distance ≈ 2, score ≈ -1)
// However, cosine distance is typically clamped to [0, 2], so score could be negative
oppositeResult := findResultByID(results, "opposite")
if oppositeResult == nil {
t.Fatal("Could not find opposite result")
}
if oppositeResult.Score >= orthogonalResult.Score {
t.Errorf("Expected opposite score (%.6f) < orthogonal score (%.6f)",
oppositeResult.Score, orthogonalResult.Score)
}

// Verify results are sorted by score in descending order
for i := 1; i < len(results); i++ {
if results[i-1].Score < results[i].Score {
t.Errorf("Results not sorted by score: result %d score (%.6f) < result %d score (%.6f)",
i-1, results[i-1].Score, i, results[i].Score)
}
}

t.Logf("Search scoring validation passed:")
for i, result := range results {
t.Logf(" %d. %s: score=%.6f", i+1, result.ID, result.Score)
}
}

// Helper function to find a result by ID
func findResultByID(results []SearchResult, id string) *SearchResult {
for i := range results {
if results[i].ID == id {
return &results[i]
}
}
return nil
}