diff --git a/cmd/lil-rag/main.go b/cmd/lil-rag/main.go index 0a99a9b..0e916a2 100644 --- a/cmd/lil-rag/main.go +++ b/cmd/lil-rag/main.go @@ -648,7 +648,6 @@ func handleChat(ctx context.Context, rag *lilrag.LilRag, profileConfig *config.P i++ } } - break } } @@ -826,17 +825,23 @@ func handleDelete(ctx context.Context, rag *lilrag.LilRag, args []string) error func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error { if len(args) > 0 && (args[0] == "--help" || args[0] == "-h") { - fmt.Println("Usage: lil-rag reindex [--force]") + fmt.Println("Usage: lil-rag reindex [--force] [--chunking=STRATEGY]") fmt.Println("") - fmt.Println("Reprocess all documents with the current recursive chunking configuration.") + fmt.Println("Reprocess all documents with the specified chunking configuration.") fmt.Println("This will:") - fmt.Println(" • Re-chunk all documents using the latest algorithm") + fmt.Println(" • Re-chunk all documents using the specified algorithm") fmt.Println(" • Regenerate embeddings for all chunks") fmt.Println(" • Update chunk boundaries and overlap") fmt.Println(" • Preserve original document content and metadata") fmt.Println("") fmt.Println("Options:") - fmt.Println(" --force Skip confirmation prompt") + fmt.Println(" --force Skip confirmation prompt") + fmt.Println(" --chunking=STRATEGY Chunking strategy to use (default: recursive)") + fmt.Println("") + fmt.Println("Available chunking strategies:") + fmt.Println(" recursive Hierarchical text splitting with semantic boundaries (default)") + fmt.Println(" semantic Adaptive chunking focused on semantic coherence") + fmt.Println(" simple Basic word-based chunking") fmt.Println("") fmt.Println("Note: This operation can take several minutes depending on the number") fmt.Println("of documents and their size. The system will remain accessible during") @@ -844,14 +849,30 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error return nil } - // Check if --force flag is provided + // Parse flags force := false + chunkingStrategy := "recursive" // default strategy for _, arg := range args { if arg == forceFlag { force = true + } else if strings.HasPrefix(arg, "--chunking=") { + chunkingStrategy = strings.TrimPrefix(arg, "--chunking=") + } + } + + // Validate chunking strategy + validStrategies := []string{"recursive", "semantic", "simple"} + isValid := false + for _, valid := range validStrategies { + if chunkingStrategy == valid { + isValid = true break } } + if !isValid { + return fmt.Errorf("invalid chunking strategy '%s'. Valid strategies: %s", + chunkingStrategy, strings.Join(validStrategies, ", ")) + } // Get document count for confirmation documents, err := rag.ListDocuments(ctx) @@ -864,9 +885,9 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error return nil } - fmt.Printf("This will reindex %d documents with recursive chunking.\n", len(documents)) + fmt.Printf("This will reindex %d documents with %s chunking.\n", len(documents), chunkingStrategy) fmt.Println("The process will:") - fmt.Println(" • Re-chunk all documents using the latest algorithm") + fmt.Println(" • Re-chunk all documents using the specified algorithm") fmt.Println(" • Regenerate embeddings for improved search performance") fmt.Println(" • Update chunk boundaries for better semantic coherence") fmt.Printf("\nEstimated time: %d-%d minutes (depending on document size and Ollama performance)\n", @@ -887,11 +908,11 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error } } - fmt.Println("\n🔄 Starting reindex with recursive chunking...") + fmt.Printf("\n🔄 Starting reindex with %s chunking...\n", chunkingStrategy) fmt.Println("This may take several minutes. Please do not interrupt the process.") startTime := time.Now() - err = rag.ReindexAllDocuments(ctx) + err = rag.ReindexAllDocumentsWithStrategy(ctx, chunkingStrategy) duration := time.Since(startTime) if err != nil { @@ -900,7 +921,7 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error } fmt.Printf("\n✅ Reindex completed successfully in %v\n", duration) - fmt.Printf("All %d documents have been reprocessed with recursive chunking.\n", len(documents)) + fmt.Printf("All %d documents have been reprocessed with %s chunking.\n", len(documents), chunkingStrategy) fmt.Println("Your RAG system now uses improved chunk boundaries for better search performance.") return nil @@ -1058,7 +1079,7 @@ func printUsage() { fmt.Println(" --list-sessions List all chat sessions") fmt.Println(" documents List all indexed documents") fmt.Println(" delete [--force] Delete a document by ID") - fmt.Println(" reindex [--force] Reprocess all documents with recursive chunking") + fmt.Println(" reindex [--force] [--chunking=STRATEGY] Reprocess all documents with specified chunking") fmt.Println(" health Check system health status") fmt.Println(" config Manage user profile configuration") fmt.Println(" reset [--force] Delete database and all indexed data") @@ -1114,8 +1135,9 @@ func printUsage() { fmt.Println(" lil-rag chat --list-sessions # List all chat sessions") fmt.Println(" lil-rag documents # List all documents") fmt.Println(" lil-rag delete doc1 --force # Delete document") - fmt.Println(" lil-rag reindex # Reindex all documents with recursive chunking") + fmt.Println(" lil-rag reindex # Reindex all documents with recursive chunking (default)") fmt.Println(" lil-rag reindex --force # Reindex without confirmation") + fmt.Println(" lil-rag reindex --chunking=simple # Reindex with simple chunking strategy") fmt.Println(" lil-rag health # Check system health") fmt.Println(" lil-rag auth add alice password123 # Add user with username and password") fmt.Println(" lil-rag auth list # List all users") diff --git a/main b/main deleted file mode 100755 index 8d60f44..0000000 Binary files a/main and /dev/null differ diff --git a/pkg/lilrag/chunker.go b/pkg/lilrag/chunker.go index 45f12a9..e5a348e 100644 --- a/pkg/lilrag/chunker.go +++ b/pkg/lilrag/chunker.go @@ -89,6 +89,30 @@ func (tc *TextChunker) ChunkText(text string) []Chunk { return semanticChunks } +// ChunkTextWithStrategy applies the specified chunking strategy +func (tc *TextChunker) ChunkTextWithStrategy(text, strategy string) []Chunk { + text = strings.TrimSpace(text) + if text == "" { + return nil + } + + contentType := tc.detectContentType(text) + + switch strategy { + case "simple": + return tc.fallbackChunk(text, contentType) + case "semantic": + // Semantic chunking focuses on content-aware boundaries + return tc.adaptiveChunk(text, contentType) + case "recursive": + // Default recursive chunking (same as current ChunkText behavior) + return tc.ChunkText(text) + default: + // Default recursive chunking (same as current ChunkText behavior) + return tc.ChunkText(text) + } +} + // detectContentType analyzes text to determine optimal chunking strategy func (tc *TextChunker) detectContentType(text string) string { codeIndicators := []string{"function", "class", "def ", "```", "import ", "#include", "var ", "let ", "const "} diff --git a/pkg/lilrag/lilrag.go b/pkg/lilrag/lilrag.go index 94c9f90..efd0708 100644 --- a/pkg/lilrag/lilrag.go +++ b/pkg/lilrag/lilrag.go @@ -740,6 +740,52 @@ func (m *LilRag) ReindexAllDocuments(ctx context.Context) error { return nil } +// ReindexAllDocumentsWithStrategy reprocesses all documents with the specified chunking strategy +func (m *LilRag) ReindexAllDocumentsWithStrategy(ctx context.Context, strategy string) error { + if m.storage == nil || m.chunker == nil || m.embedder == nil || m.documentHandler == nil { + return fmt.Errorf("LilRag not properly initialized") + } + + // Get all documents + documents, err := m.storage.ListDocuments(ctx) + if err != nil { + return fmt.Errorf("failed to list documents: %w", err) + } + + if len(documents) == 0 { + fmt.Println("No documents found to reindex") + return nil + } + + fmt.Printf("Starting reindex of %d documents with %s chunking...\n", len(documents), strategy) + + processed := 0 + failed := 0 + + for i, doc := range documents { + fmt.Printf("Reindexing document %d/%d: %s\n", i+1, len(documents), doc.ID) + + err := m.reindexDocumentWithStrategy(ctx, &doc, strategy) + if err != nil { + fmt.Printf("Failed to reindex document %s: %v\n", doc.ID, err) + failed++ + continue + } + + processed++ + if processed%10 == 0 { + fmt.Printf("Progress: %d/%d documents processed\n", processed, len(documents)) + } + } + + fmt.Printf("Reindex completed: %d processed, %d failed\n", processed, failed) + if failed > 0 { + return fmt.Errorf("reindex completed with %d failures", failed) + } + + return nil +} + // reindexDocument reprocesses a single document with current chunking settings func (m *LilRag) reindexDocument(ctx context.Context, doc *DocumentInfo) error { // If document has a source path, try to reprocess from the original file @@ -826,6 +872,63 @@ func (m *LilRag) reindexWithChunks(ctx context.Context, doc *DocumentInfo, chunk return m.storage.IndexChunks(ctx, doc.ID, combinedText.String(), chunks, embeddings) } +// reindexDocumentWithStrategy reprocesses a single document with the specified chunking strategy +func (m *LilRag) reindexDocumentWithStrategy(ctx context.Context, doc *DocumentInfo, strategy string) error { + // If document has a source path, try to reprocess from the original file + if doc.SourcePath != "" { + // Check if the source file still exists + if _, err := os.Stat(doc.SourcePath); err == nil { + // File exists, reprocess from original with strategy + return m.reindexFromFileWithStrategy(ctx, doc, strategy) + } + // File doesn't exist, fall back to reprocessing stored text + fmt.Printf("Source file %s not found, reprocessing from stored text\n", doc.SourcePath) + } + + // Reprocess from stored text content with strategy + return m.reindexFromTextWithStrategy(ctx, doc, strategy) +} + +// reindexFromFileWithStrategy reprocesses a document from its original file with the specified strategy +func (m *LilRag) reindexFromFileWithStrategy(ctx context.Context, doc *DocumentInfo, strategy string) error { + if !m.documentHandler.IsSupported(doc.SourcePath) { + return fmt.Errorf("unsupported file format: %s", doc.SourcePath) + } + + // Parse the file to get text content + text, err := m.documentHandler.ParseFile(doc.SourcePath) + if err != nil { + return fmt.Errorf("failed to parse file %s: %w", doc.SourcePath, err) + } + + if text == "" { + return fmt.Errorf("no content found in file %s", doc.SourcePath) + } + + // Apply the specified chunking strategy + chunks := m.chunker.ChunkTextWithStrategy(text, strategy) + if len(chunks) == 0 { + return fmt.Errorf("failed to create chunks from file %s", doc.SourcePath) + } + + return m.reindexWithChunks(ctx, doc, chunks) +} + +// reindexFromTextWithStrategy reprocesses a document from its stored text with the specified strategy +func (m *LilRag) reindexFromTextWithStrategy(ctx context.Context, doc *DocumentInfo, strategy string) error { + if doc.Text == "" { + return fmt.Errorf("no text content available for document %s", doc.ID) + } + + // Apply the specified chunking strategy + chunks := m.chunker.ChunkTextWithStrategy(doc.Text, strategy) + if len(chunks) == 0 { + return fmt.Errorf("failed to create chunks from text for document %s", doc.ID) + } + + return m.reindexWithChunks(ctx, doc, chunks) +} + // Services returns the modern service interfaces func (m *LilRag) Services() *Services { return m.services