Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 35 additions & 13 deletions cmd/lil-rag/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,6 @@ func handleChat(ctx context.Context, rag *lilrag.LilRag, profileConfig *config.P
i++
}
}
break
}
}

Expand Down Expand Up @@ -826,32 +825,54 @@ func handleDelete(ctx context.Context, rag *lilrag.LilRag, args []string) error

func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error {
if len(args) > 0 && (args[0] == "--help" || args[0] == "-h") {
fmt.Println("Usage: lil-rag reindex [--force]")
fmt.Println("Usage: lil-rag reindex [--force] [--chunking=STRATEGY]")
fmt.Println("")
fmt.Println("Reprocess all documents with the current recursive chunking configuration.")
fmt.Println("Reprocess all documents with the specified chunking configuration.")
fmt.Println("This will:")
fmt.Println(" • Re-chunk all documents using the latest algorithm")
fmt.Println(" • Re-chunk all documents using the specified algorithm")
fmt.Println(" • Regenerate embeddings for all chunks")
fmt.Println(" • Update chunk boundaries and overlap")
fmt.Println(" • Preserve original document content and metadata")
fmt.Println("")
fmt.Println("Options:")
fmt.Println(" --force Skip confirmation prompt")
fmt.Println(" --force Skip confirmation prompt")
fmt.Println(" --chunking=STRATEGY Chunking strategy to use (default: recursive)")
fmt.Println("")
fmt.Println("Available chunking strategies:")
fmt.Println(" recursive Hierarchical text splitting with semantic boundaries (default)")
fmt.Println(" semantic Adaptive chunking focused on semantic coherence")
fmt.Println(" simple Basic word-based chunking")
fmt.Println("")
fmt.Println("Note: This operation can take several minutes depending on the number")
fmt.Println("of documents and their size. The system will remain accessible during")
fmt.Println("reindexing, but performance may be impacted.")
return nil
}

// Check if --force flag is provided
// Parse flags
force := false
chunkingStrategy := "recursive" // default strategy
for _, arg := range args {
if arg == forceFlag {
force = true
} else if strings.HasPrefix(arg, "--chunking=") {
chunkingStrategy = strings.TrimPrefix(arg, "--chunking=")
}
}

// Validate chunking strategy
validStrategies := []string{"recursive", "semantic", "simple"}
isValid := false
for _, valid := range validStrategies {
if chunkingStrategy == valid {
isValid = true
break
}
}
if !isValid {
return fmt.Errorf("invalid chunking strategy '%s'. Valid strategies: %s",
chunkingStrategy, strings.Join(validStrategies, ", "))
}

// Get document count for confirmation
documents, err := rag.ListDocuments(ctx)
Expand All @@ -864,9 +885,9 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error
return nil
}

fmt.Printf("This will reindex %d documents with recursive chunking.\n", len(documents))
fmt.Printf("This will reindex %d documents with %s chunking.\n", len(documents), chunkingStrategy)
fmt.Println("The process will:")
fmt.Println(" • Re-chunk all documents using the latest algorithm")
fmt.Println(" • Re-chunk all documents using the specified algorithm")
fmt.Println(" • Regenerate embeddings for improved search performance")
fmt.Println(" • Update chunk boundaries for better semantic coherence")
fmt.Printf("\nEstimated time: %d-%d minutes (depending on document size and Ollama performance)\n",
Expand All @@ -887,11 +908,11 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error
}
}

fmt.Println("\n🔄 Starting reindex with recursive chunking...")
fmt.Printf("\n🔄 Starting reindex with %s chunking...\n", chunkingStrategy)
fmt.Println("This may take several minutes. Please do not interrupt the process.")

startTime := time.Now()
err = rag.ReindexAllDocuments(ctx)
err = rag.ReindexAllDocumentsWithStrategy(ctx, chunkingStrategy)
duration := time.Since(startTime)

if err != nil {
Expand All @@ -900,7 +921,7 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error
}

fmt.Printf("\n✅ Reindex completed successfully in %v\n", duration)
fmt.Printf("All %d documents have been reprocessed with recursive chunking.\n", len(documents))
fmt.Printf("All %d documents have been reprocessed with %s chunking.\n", len(documents), chunkingStrategy)
fmt.Println("Your RAG system now uses improved chunk boundaries for better search performance.")

return nil
Expand Down Expand Up @@ -1058,7 +1079,7 @@ func printUsage() {
fmt.Println(" --list-sessions List all chat sessions")
fmt.Println(" documents List all indexed documents")
fmt.Println(" delete <id> [--force] Delete a document by ID")
fmt.Println(" reindex [--force] Reprocess all documents with recursive chunking")
fmt.Println(" reindex [--force] [--chunking=STRATEGY] Reprocess all documents with specified chunking")
fmt.Println(" health Check system health status")
fmt.Println(" config <init|show|set> Manage user profile configuration")
fmt.Println(" reset [--force] Delete database and all indexed data")
Expand Down Expand Up @@ -1114,8 +1135,9 @@ func printUsage() {
fmt.Println(" lil-rag chat --list-sessions # List all chat sessions")
fmt.Println(" lil-rag documents # List all documents")
fmt.Println(" lil-rag delete doc1 --force # Delete document")
fmt.Println(" lil-rag reindex # Reindex all documents with recursive chunking")
fmt.Println(" lil-rag reindex # Reindex all documents with recursive chunking (default)")
fmt.Println(" lil-rag reindex --force # Reindex without confirmation")
fmt.Println(" lil-rag reindex --chunking=simple # Reindex with simple chunking strategy")
fmt.Println(" lil-rag health # Check system health")
fmt.Println(" lil-rag auth add alice password123 # Add user with username and password")
fmt.Println(" lil-rag auth list # List all users")
Expand Down
Binary file removed main
Binary file not shown.
24 changes: 24 additions & 0 deletions pkg/lilrag/chunker.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,30 @@ func (tc *TextChunker) ChunkText(text string) []Chunk {
return semanticChunks
}

// ChunkTextWithStrategy splits text using the named chunking strategy.
//
// Supported strategies:
//   - "simple":    basic word-based chunking (fallbackChunk)
//   - "semantic":  content-aware adaptive boundaries (adaptiveChunk)
//   - "recursive": hierarchical splitting, identical to ChunkText
//
// Any unrecognized strategy falls back to the recursive behavior, so the
// function never fails on a bad strategy name. Returns nil for empty or
// whitespace-only input.
func (tc *TextChunker) ChunkTextWithStrategy(text, strategy string) []Chunk {
	text = strings.TrimSpace(text)
	if text == "" {
		return nil
	}

	switch strategy {
	case "simple":
		return tc.fallbackChunk(text, tc.detectContentType(text))
	case "semantic":
		return tc.adaptiveChunk(text, tc.detectContentType(text))
	default:
		// "recursive" and any unknown strategy: delegate to the default
		// chunker. Content-type detection is skipped here because
		// ChunkText performs its own analysis.
		return tc.ChunkText(text)
	}
}

// detectContentType analyzes text to determine optimal chunking strategy
func (tc *TextChunker) detectContentType(text string) string {
codeIndicators := []string{"function", "class", "def ", "```", "import ", "#include", "var ", "let ", "const "}
Expand Down
103 changes: 103 additions & 0 deletions pkg/lilrag/lilrag.go
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,52 @@ func (m *LilRag) ReindexAllDocuments(ctx context.Context) error {
return nil
}

// ReindexAllDocumentsWithStrategy reprocesses every stored document with the
// given chunking strategy, regenerating chunks and embeddings for each one.
// Individual document failures are reported and skipped rather than aborting
// the run; if any document failed, a summary error is returned at the end.
// The loop honors ctx cancellation, since a full reindex can take minutes.
func (m *LilRag) ReindexAllDocumentsWithStrategy(ctx context.Context, strategy string) error {
	if m.storage == nil || m.chunker == nil || m.embedder == nil || m.documentHandler == nil {
		return fmt.Errorf("LilRag not properly initialized")
	}

	// Get all documents
	documents, err := m.storage.ListDocuments(ctx)
	if err != nil {
		return fmt.Errorf("failed to list documents: %w", err)
	}

	if len(documents) == 0 {
		fmt.Println("No documents found to reindex")
		return nil
	}

	fmt.Printf("Starting reindex of %d documents with %s chunking...\n", len(documents), strategy)

	processed := 0
	failed := 0

	for i := range documents {
		// Stop promptly if the caller canceled; each document can take
		// seconds of embedding work, so check between documents.
		if err := ctx.Err(); err != nil {
			return err
		}

		doc := documents[i]
		fmt.Printf("Reindexing document %d/%d: %s\n", i+1, len(documents), doc.ID)

		if err := m.reindexDocumentWithStrategy(ctx, &doc, strategy); err != nil {
			// Best-effort: record the failure and keep going so one bad
			// document does not block the rest of the corpus.
			fmt.Printf("Failed to reindex document %s: %v\n", doc.ID, err)
			failed++
			continue
		}

		processed++
		if processed%10 == 0 {
			fmt.Printf("Progress: %d/%d documents processed\n", processed, len(documents))
		}
	}

	fmt.Printf("Reindex completed: %d processed, %d failed\n", processed, failed)
	if failed > 0 {
		return fmt.Errorf("reindex completed with %d failures", failed)
	}

	return nil
}

// reindexDocument reprocesses a single document with current chunking settings
func (m *LilRag) reindexDocument(ctx context.Context, doc *DocumentInfo) error {
// If document has a source path, try to reprocess from the original file
Expand Down Expand Up @@ -826,6 +872,63 @@ func (m *LilRag) reindexWithChunks(ctx context.Context, doc *DocumentInfo, chunk
return m.storage.IndexChunks(ctx, doc.ID, combinedText.String(), chunks, embeddings)
}

// reindexDocumentWithStrategy reindexes a single document using the given
// chunking strategy. It prefers reparsing the original source file when that
// file still exists on disk, and otherwise falls back to the text content
// stored in the database.
func (m *LilRag) reindexDocumentWithStrategy(ctx context.Context, doc *DocumentInfo, strategy string) error {
	path := doc.SourcePath
	if path == "" {
		// No source path recorded: stored text is the only option.
		return m.reindexFromTextWithStrategy(ctx, doc, strategy)
	}

	if _, statErr := os.Stat(path); statErr == nil {
		// Original file is still present; reparse it for the best fidelity.
		return m.reindexFromFileWithStrategy(ctx, doc, strategy)
	}

	fmt.Printf("Source file %s not found, reprocessing from stored text\n", path)
	return m.reindexFromTextWithStrategy(ctx, doc, strategy)
}

// reindexFromFileWithStrategy reparses a document's original source file and
// reindexes it with the given chunking strategy. It fails if the file format
// is unsupported, the file yields no text, or chunking produces no chunks.
func (m *LilRag) reindexFromFileWithStrategy(ctx context.Context, doc *DocumentInfo, strategy string) error {
	path := doc.SourcePath

	if !m.documentHandler.IsSupported(path) {
		return fmt.Errorf("unsupported file format: %s", path)
	}

	// Extract the raw text from the original file.
	text, parseErr := m.documentHandler.ParseFile(path)
	switch {
	case parseErr != nil:
		return fmt.Errorf("failed to parse file %s: %w", path, parseErr)
	case text == "":
		return fmt.Errorf("no content found in file %s", path)
	}

	// Re-chunk with the requested strategy, then hand off to the shared
	// embedding/storage path.
	chunks := m.chunker.ChunkTextWithStrategy(text, strategy)
	if len(chunks) == 0 {
		return fmt.Errorf("failed to create chunks from file %s", path)
	}

	return m.reindexWithChunks(ctx, doc, chunks)
}

// reindexFromTextWithStrategy reindexes a document from the text already
// stored in the database, chunking it with the given strategy. It fails if
// the document has no stored text or chunking produces no chunks.
func (m *LilRag) reindexFromTextWithStrategy(ctx context.Context, doc *DocumentInfo, strategy string) error {
	content := doc.Text
	if content == "" {
		return fmt.Errorf("no text content available for document %s", doc.ID)
	}

	// Re-chunk the stored text, then hand off to the shared
	// embedding/storage path.
	chunks := m.chunker.ChunkTextWithStrategy(content, strategy)
	if len(chunks) == 0 {
		return fmt.Errorf("failed to create chunks from text for document %s", doc.ID)
	}

	return m.reindexWithChunks(ctx, doc, chunks)
}

// Services returns the modern service interfaces
func (m *LilRag) Services() *Services {
return m.services
Expand Down
Loading