From 853449e988a66d7b6ec7090b25240f182057a89a Mon Sep 17 00:00:00 2001 From: Tom Dicarlo <24437642+tomdicarlo@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:56:33 -0500 Subject: [PATCH 1/5] Working locally against OpenAI In a working state. Still needs tidying. Continued trimming files --- packages/sandcastle/package.json | 2 + packages/sandcastle/scripts/buildGallery.js | 95 +++++- .../sandcastle/src/Gallery/EmbeddingSearch.ts | 306 ++++++++++++++++++ .../src/Gallery/GalleryItemSearchInput.tsx | 36 +-- .../src/Gallery/GalleryItemStore.ts | 161 ++++----- 5 files changed, 500 insertions(+), 100 deletions(-) create mode 100644 packages/sandcastle/src/Gallery/EmbeddingSearch.ts diff --git a/packages/sandcastle/package.json b/packages/sandcastle/package.json index f7a9c47add7..33b8cce5353 100644 --- a/packages/sandcastle/package.json +++ b/packages/sandcastle/package.json @@ -11,6 +11,7 @@ }, "dependencies": { "@ariakit/react": "^0.4.17", + "@huggingface/transformers": "^3.3.1", "@monaco-editor/react": "^4.7.0", "@stratakit/bricks": "^0.3.4", "@stratakit/foundations": "^0.2.4", @@ -18,6 +19,7 @@ "@stratakit/structures": "^0.3.2", "allotment": "^1.20.4", "classnames": "^2.5.1", + "gulp": "^5.0.1", "monaco-editor": "^0.52.2", "pako": "^2.1.0", "prettier": "^3.5.3", diff --git a/packages/sandcastle/scripts/buildGallery.js b/packages/sandcastle/scripts/buildGallery.js index eaa1dceeafb..528ebc3da52 100644 --- a/packages/sandcastle/scripts/buildGallery.js +++ b/packages/sandcastle/scripts/buildGallery.js @@ -9,6 +9,7 @@ import { rimraf } from "rimraf"; import { parse } from "yaml"; import { globby } from "globby"; import * as pagefind from "pagefind"; +import { AutoModel, AutoTokenizer } from "@huggingface/transformers"; import createGalleryRecord from "./createGalleryRecord.js"; @@ -20,6 +21,78 @@ const defaultThumbnailPath = "images/placeholder-thumbnail.jpg"; const requiredMetadataKeys = ["title", "description"]; const galleryItemConfig = /sandcastle\.(yml|yaml)/; 
+// Embedding configuration +const MODEL_ID = "avsolatorio/GIST-small-Embedding-v0"; +const DOCUMENT_PREFIX = "title: none | text: "; + +/** + * Initialize the embedding model (singleton pattern) + */ +let embeddingModel = null; +let embeddingTokenizer = null; + +async function initEmbeddingModel() { + if (embeddingModel && embeddingTokenizer) { + return { model: embeddingModel, tokenizer: embeddingTokenizer }; + } + + console.log("Loading embedding model..."); + console.log("This may take a while on first run as the model is downloaded."); + + embeddingTokenizer = await AutoTokenizer.from_pretrained(MODEL_ID); + embeddingModel = await AutoModel.from_pretrained(MODEL_ID, { + dtype: "fp32", // Options: "fp32" | "q8" | "q4" + }); + + console.log("Embedding model loaded successfully!"); + return { model: embeddingModel, tokenizer: embeddingTokenizer }; +} + +/** + * Convert a gallery item to text for embedding + */ +function itemToText(title, description, labels) { + const text = `Title: ${title} +Description: ${description} +Labels: ${labels.join(", ")}`; + + return DOCUMENT_PREFIX + text; +} + +/** + * Generate embeddings for all items, processing them in batches + */ +async function generateEmbeddings(items) { + const { model, tokenizer } = await initEmbeddingModel(); + + console.log(`\nGenerating embeddings for ${items.length} gallery items...`); + + const embeddings = []; + const batchSize = 10; + + for (let i = 0; i < items.length; i += batchSize) { + const batch = items.slice(i, i + batchSize); + const batchEnd = Math.min(i + batchSize, items.length); + + console.log( + `Processing embeddings ${i + 1} to ${batchEnd} of ${items.length}...`, + ); + + const texts = batch.map((item) => + itemToText(item.title, item.description, item.labels), + ); + + const inputs = await tokenizer(texts, { padding: true, truncation: true }); + const { sentence_embedding } = await model(inputs); + + const batchEmbeddings = sentence_embedding.tolist(); + embeddings.push(...batchEmbeddings); + } + 
console.log(`Generated ${embeddings.length} embeddings`); + return embeddings; +} + async function createPagefindIndex() { try { const { index } = await pagefind.createIndex({ @@ -89,6 +162,7 @@ export async function buildGalleryList(options = {}) { * @property {number} lineCount * @property {string} description * @property {string[]} labels + * @property {number[]} [embedding] - Vector embedding for semantic search */ /** @@ -117,8 +191,12 @@ export async function buildGalleryList(options = {}) { return condition; }; + // globby requires forward slashes in glob patterns, even on Windows. + // path.join() uses backslashes on Windows, so convert them to forward slashes. const galleryFiles = await globby( - galleryFilesPattern.map((pattern) => join(rootDirectory, pattern, "**/*")), + galleryFilesPattern.map((pattern) => + join(rootDirectory, pattern, "**/*").replace(/\\/g, "/") + ), ); const yamlFiles = galleryFiles.filter((path) => basename(path).match(galleryItemConfig), @@ -265,6 +343,21 @@ export async function buildGalleryList(options = {}) { // regardless of if titles match the directory names output.entries.sort((a, b) => a.title.localeCompare(b.title)); + // Generate embeddings for all entries + try { + const embeddings = await generateEmbeddings(output.entries); + // Add embeddings to each entry + output.entries.forEach((entry, index) => { + entry.embedding = embeddings[index]; + }); + console.log( + `\n✓ Successfully added embeddings to ${output.entries.length} gallery items`, + ); + } catch (error) { + console.error("Failed to generate embeddings:", error); + console.log("Continuing without embeddings..."); + } + const outputDirectory = join(rootDirectory, publicDirectory, "gallery"); await rimraf(outputDirectory); await mkdir(outputDirectory, { recursive: true }); diff --git a/packages/sandcastle/src/Gallery/EmbeddingSearch.ts b/packages/sandcastle/src/Gallery/EmbeddingSearch.ts new file mode 100644 index 00000000000..f742c3e5e2b --- /dev/null +++ 
b/packages/sandcastle/src/Gallery/EmbeddingSearch.ts @@ -0,0 +1,306 @@ +/** + * Embedding Search + * + * Provides semantic search for Sandcastle gallery items using embeddings. + * + * This service: + * 1. Loads pre-generated embeddings from the gallery list + * 2. Generates embeddings for search queries using the GIST-small-Embedding-v0 model in the browser + * 3. Computes cosine similarity against indexed gallery items + * 4. Returns ranked search results + * + * Benefits: + * - Runs entirely in the browser (no backend needed) + * - Uses local embedding model (privacy-friendly) + * - Fast search after initial model load + */ + +import { AutoModel, AutoTokenizer } from '@huggingface/transformers'; + +export interface VectorSearchResult { + rank: number; + id: string; + legacy_id: string; + title: string; + description: string; + labels: string[]; + distance: number; + score: number; + url: string; +} + +interface GalleryListItem { + url: string; + id: string; + title: string; + thumbnail: string; + lineCount: number; + description: string; + labels: string[]; + embedding?: number[]; +} + +interface GalleryList { + entries: GalleryListItem[]; + legacyIds: Record; +} + +class EmbeddingSearch { + private galleryList: GalleryList | null = null; + private tokenizer: any = null; + private model: any = null; + private modelId: string = 'avsolatorio/GIST-small-Embedding-v0'; + private queryPrefix: string = 'task: search result | query: '; + private loadingPromise: Promise | null = null; + + /** + * Load the gallery list with embeddings + */ + private async loadGalleryList(): Promise { + if (this.galleryList) return; + + try { + const response = await fetch('/gallery/list.json'); + if (!response.ok) { + throw new Error(`Failed to load gallery list: ${response.statusText}`); + } + this.galleryList = await response.json(); + + // Verify that embeddings are present + const itemsWithEmbeddings = this.galleryList!.entries.filter(item => item.embedding); + if (itemsWithEmbeddings.length === 
0) { + throw new Error('No embeddings found in gallery list. Run the build script to generate them.'); + } + + console.log(`Loaded ${itemsWithEmbeddings.length} gallery items with embeddings`); + } catch (error) { + console.error('Failed to load gallery list:', error); + throw error; + } + } + + /** + * Initialize the embedding model + */ + private async loadModel(): Promise { + if (this.model && this.tokenizer) return; + + try { + console.log(`Loading embedding model: ${this.modelId} (this may take a moment on first load)...`); + this.tokenizer = await AutoTokenizer.from_pretrained(this.modelId); + this.model = await AutoModel.from_pretrained(this.modelId); + console.log(`Embedding model loaded successfully: ${this.modelId}`); + } catch (error) { + console.error('Failed to load embedding model:', error); + throw error; + } + } + + /** + * Initialize both the gallery list and the model + */ + async initialize(): Promise { + // If already fully initialized, return immediately + if (this.galleryList && this.model && this.tokenizer) { + return; + } + + // If currently loading, wait for that to finish + if (this.loadingPromise) { + return this.loadingPromise; + } + + // Start loading + this.loadingPromise = (async () => { + await Promise.all([ + this.loadGalleryList(), + this.loadModel(), + ]); + // Clear the promise after successful initialization + this.loadingPromise = null; + })(); + + return this.loadingPromise; + } + + /** + * Calculate cosine similarity between two vectors + */ + private cosineSimilarity(vecA: number[], vecB: number[]): number { + if (vecA.length !== vecB.length) { + throw new Error('Vectors must have the same length'); + } + + let dotProduct = 0; + let normA = 0; + let normB = 0; + + for (let i = 0; i < vecA.length; i++) { + dotProduct += vecA[i] * vecB[i]; + normA += vecA[i] * vecA[i]; + normB += vecB[i] * vecB[i]; + } + + return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); + } + + /** + * Perform a vector similarity search + * @param 
query - The search query text + * @param limit - Maximum number of results to return + * @returns Array of search results ranked by relevance + */ + async search(query: string, limit: number = 10): Promise { + if (!query || query.trim().length === 0) { + return []; + } + + const totalStartTime = performance.now(); + let initTime = 0; + + try { + // Ensure everything is loaded (will be near-instant after first load) + const wasAlreadyInitialized = this.galleryList && this.model && this.tokenizer; + if (!wasAlreadyInitialized) { + const initStartTime = performance.now(); + await this.initialize(); + initTime = performance.now() - initStartTime; + } + + if (!this.galleryList || !this.model || !this.tokenizer) { + throw new Error('Vector search not properly initialized'); + } + + // Add query prefix and generate embedding + const prefixedQuery = this.queryPrefix + query.trim(); + const tokenizeStartTime = performance.now(); + const inputs = await this.tokenizer([prefixedQuery], { padding: true, truncation: true }); + const tokenizeTime = performance.now() - tokenizeStartTime; + + const modelStartTime = performance.now(); + const { sentence_embedding } = await this.model(inputs); + const queryEmbedding = sentence_embedding.tolist()[0]; + const modelTime = performance.now() - modelStartTime; + + // Calculate similarities with all documents that have embeddings + const similarityStartTime = performance.now(); + const itemsWithEmbeddings = this.galleryList.entries.filter(item => item.embedding); + const results = itemsWithEmbeddings.map((item) => { + const score = this.cosineSimilarity(queryEmbedding, item.embedding!); + return { + ...item, + score, + distance: 1 - score, // Convert similarity to distance + }; + }); + const similarityTime = performance.now() - similarityStartTime; + + // Sort by score (descending) and take top K + const sortStartTime = performance.now(); + results.sort((a, b) => b.score - a.score); + const topResults = results.slice(0, limit); + const 
sortTime = performance.now() - sortStartTime; + + // Format results to match expected interface + const formatStartTime = performance.now(); + const formattedResults = topResults.map((result, index) => ({ + rank: index + 1, + id: result.id, + legacy_id: result.id, // Use id as legacy_id for compatibility + title: result.title, + description: result.description, + labels: result.labels, + distance: result.distance, + score: result.score, + url: result.url, + })); + const formatTime = performance.now() - formatStartTime; + + const totalTime = performance.now() - totalStartTime; + + console.log('[EmbeddingSearch] Timing breakdown:', { + model: this.modelId, + query, + total: `${totalTime.toFixed(2)}ms`, + initialization: `${initTime.toFixed(2)}ms`, + tokenization: `${tokenizeTime.toFixed(2)}ms`, + modelInference: `${modelTime.toFixed(2)}ms`, + similarityCalc: `${similarityTime.toFixed(2)}ms`, + sorting: `${sortTime.toFixed(2)}ms`, + formatting: `${formatTime.toFixed(2)}ms`, + itemsSearched: itemsWithEmbeddings.length, + resultsReturned: formattedResults.length, + }); + + return formattedResults; + } catch (error) { + console.error('[EmbeddingSearch] Search failed:', error); + // Return empty results on error so the app can continue + return []; + } + } + + /** + * Check if the search service is ready + */ + isReady(): boolean { + return this.galleryList !== null && this.model !== null && this.tokenizer !== null; + } + + /** + * Clear the cached model files to force a fresh download on next load + * Useful for testing cold-start performance + */ + async clearModelCache(): Promise { + try { + // Clear the transformers.js cache + if ('caches' in window) { + const cacheNames = await caches.keys(); + const transformerCaches = cacheNames.filter(name => + name.includes('transformers') || name.includes('huggingface') + ); + + for (const cacheName of transformerCaches) { + await caches.delete(cacheName); + console.log(`Cleared cache: ${cacheName}`); + } + } + + // Reset the 
model and tokenizer instances + this.model = null; + this.tokenizer = null; + this.loadingPromise = null; + + console.log('[EmbeddingSearch] Model cache cleared. Next search will download fresh model.'); + } catch (error) { + console.error('[EmbeddingSearch] Failed to clear model cache:', error); + throw error; + } + } +} + +// Export singleton instance +const embeddingSearch = new EmbeddingSearch(); + +// Pre-load the model and gallery list at application startup +// This happens in the background so the UI can load while the model downloads +if (typeof window !== 'undefined') { + // Start loading immediately when the module is imported + embeddingSearch.initialize().then(() => { + console.log('[EmbeddingSearch] Model and embeddings pre-loaded and ready for search!'); + }).catch((error) => { + console.error('[EmbeddingSearch] Failed to pre-load model:', error); + }); +} + +/** + * Performs vector search using the local embedding model + * @param query - The search query string + * @param limit - Maximum number of results to return (default: 10) + * @returns Promise resolving to array of search results + */ +export async function vectorSearch(query: string, limit: number = 10): Promise { + return embeddingSearch.search(query, limit); +} + +export { embeddingSearch }; diff --git a/packages/sandcastle/src/Gallery/GalleryItemSearchInput.tsx b/packages/sandcastle/src/Gallery/GalleryItemSearchInput.tsx index befb4e6e972..64939c5325f 100644 --- a/packages/sandcastle/src/Gallery/GalleryItemSearchInput.tsx +++ b/packages/sandcastle/src/Gallery/GalleryItemSearchInput.tsx @@ -8,35 +8,30 @@ export function GalleryItemSearchInput() { const store = useGalleryItemContext(); const inputRef = useRef(null); const { setSearchTerm, items } = store ?? 
{}; - const [hasValue, setHasValue] = useState(false); + const [inputValue, setInputValue] = useState(""); const clearSearch = useCallback(() => { - const input = inputRef.current; - if (input) { - input.value = ""; - setHasValue(false); - input.focus(); - } - + setInputValue(""); + if (setSearchTerm) { setSearchTerm(null); } + + // Focus input after clearing + setTimeout(() => inputRef.current?.focus(), 0); }, [setSearchTerm]); const updateSearch = useCallback( (e: { target: { value: string | null } }) => { - let term = e.target.value; - setHasValue(!!term && term !== ""); + let term = e.target.value || ""; + + // Update local state immediately for responsive input + setInputValue(term); + + // Update search term in store if (setSearchTerm) { - if (term) { - term = term.trim(); - } - - if (!term || term === "") { - term = null; - } - - setSearchTerm(term); + const trimmed = term.trim(); + setSearchTerm(trimmed || null); } }, [setSearchTerm], @@ -50,12 +45,13 @@ export function GalleryItemSearchInput() {