Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 37 additions & 12 deletions cmd/lil-rag/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -826,32 +826,56 @@ func handleDelete(ctx context.Context, rag *lilrag.LilRag, args []string) error

func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error {
if len(args) > 0 && (args[0] == "--help" || args[0] == "-h") {
fmt.Println("Usage: lil-rag reindex [--force]")
fmt.Println("Usage: lil-rag reindex [--force] [--chunking=STRATEGY]")
fmt.Println("")
fmt.Println("Reprocess all documents with the current recursive chunking configuration.")
fmt.Println("Reprocess all documents with the specified chunking configuration.")
fmt.Println("This will:")
fmt.Println(" • Re-chunk all documents using the latest algorithm")
fmt.Println(" • Re-chunk all documents using the specified algorithm")
fmt.Println(" • Regenerate embeddings for all chunks")
fmt.Println(" • Update chunk boundaries and overlap")
fmt.Println(" • Preserve original document content and metadata")
fmt.Println("")
fmt.Println("Options:")
fmt.Println(" --force Skip confirmation prompt")
fmt.Println(" --force Skip confirmation prompt")
fmt.Println(" --chunking=STRATEGY Chunking strategy to use (default: recursive)")
fmt.Println("")
fmt.Println("Available chunking strategies:")
fmt.Println(" recursive Adaptive chunking with semantic boundaries (default)")
fmt.Println(" fast Small chunks for precise search (128 tokens, 19 overlap)")
fmt.Println(" contextual Large chunks for context preservation (512 tokens, 76 overlap)")
fmt.Println(" legacy Large chunks for backward compatibility (1800 tokens, 200 overlap)")
fmt.Println(" fallback Simple word-based chunking")
fmt.Println("")
fmt.Println("Note: This operation can take several minutes depending on the number")
fmt.Println("of documents and their size. The system will remain accessible during")
fmt.Println("reindexing, but performance may be impacted.")
return nil
}

// Check if --force flag is provided
// Parse flags
force := false
chunkingStrategy := "recursive" // default strategy
for _, arg := range args {
if arg == forceFlag {
force = true
} else if strings.HasPrefix(arg, "--chunking=") {
chunkingStrategy = strings.TrimPrefix(arg, "--chunking=")
}
}

// Validate chunking strategy
validStrategies := []string{"recursive", "fast", "contextual", "legacy", "fallback"}
isValid := false
for _, valid := range validStrategies {
if chunkingStrategy == valid {
isValid = true
break
}
}
if !isValid {
return fmt.Errorf("invalid chunking strategy '%s'. Valid strategies: %s",
chunkingStrategy, strings.Join(validStrategies, ", "))
}

// Get document count for confirmation
documents, err := rag.ListDocuments(ctx)
Expand All @@ -864,9 +888,9 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error
return nil
}

fmt.Printf("This will reindex %d documents with recursive chunking.\n", len(documents))
fmt.Printf("This will reindex %d documents with %s chunking.\n", len(documents), chunkingStrategy)
fmt.Println("The process will:")
fmt.Println(" • Re-chunk all documents using the latest algorithm")
fmt.Println(" • Re-chunk all documents using the specified algorithm")
fmt.Println(" • Regenerate embeddings for improved search performance")
fmt.Println(" • Update chunk boundaries for better semantic coherence")
fmt.Printf("\nEstimated time: %d-%d minutes (depending on document size and Ollama performance)\n",
Expand All @@ -887,11 +911,11 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error
}
}

fmt.Println("\n🔄 Starting reindex with recursive chunking...")
fmt.Printf("\n🔄 Starting reindex with %s chunking...\n", chunkingStrategy)
fmt.Println("This may take several minutes. Please do not interrupt the process.")

startTime := time.Now()
err = rag.ReindexAllDocuments(ctx)
err = rag.ReindexAllDocumentsWithStrategy(ctx, chunkingStrategy)
duration := time.Since(startTime)

if err != nil {
Expand All @@ -900,7 +924,7 @@ func handleReindex(ctx context.Context, rag *lilrag.LilRag, args []string) error
}

fmt.Printf("\n✅ Reindex completed successfully in %v\n", duration)
fmt.Printf("All %d documents have been reprocessed with recursive chunking.\n", len(documents))
fmt.Printf("All %d documents have been reprocessed with %s chunking.\n", len(documents), chunkingStrategy)
fmt.Println("Your RAG system now uses improved chunk boundaries for better search performance.")

return nil
Expand Down Expand Up @@ -1058,7 +1082,7 @@ func printUsage() {
fmt.Println(" --list-sessions List all chat sessions")
fmt.Println(" documents List all indexed documents")
fmt.Println(" delete <id> [--force] Delete a document by ID")
fmt.Println(" reindex [--force] Reprocess all documents with recursive chunking")
fmt.Println(" reindex [--force] [--chunking=STRATEGY] Reprocess all documents with specified chunking")
fmt.Println(" health Check system health status")
fmt.Println(" config <init|show|set> Manage user profile configuration")
fmt.Println(" reset [--force] Delete database and all indexed data")
Expand Down Expand Up @@ -1114,8 +1138,9 @@ func printUsage() {
fmt.Println(" lil-rag chat --list-sessions # List all chat sessions")
fmt.Println(" lil-rag documents # List all documents")
fmt.Println(" lil-rag delete doc1 --force # Delete document")
fmt.Println(" lil-rag reindex # Reindex all documents with recursive chunking")
fmt.Println(" lil-rag reindex # Reindex all documents with recursive chunking (default)")
fmt.Println(" lil-rag reindex --force # Reindex without confirmation")
fmt.Println(" lil-rag reindex --chunking=fast # Reindex with fast chunking strategy")
fmt.Println(" lil-rag health # Check system health")
fmt.Println(" lil-rag auth add alice password123 # Add user with username and password")
fmt.Println(" lil-rag auth list # List all users")
Expand Down
71 changes: 71 additions & 0 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,77 @@ Controls how documents are split into searchable chunks.
lil-rag config set chunking.overlap 200
```

## Reindexing and Chunking Strategies

The `reindex` command allows you to reprocess all documents with different chunking strategies without losing your original content.

### Using Reindex with Chunking Strategies

```bash
# Default recursive chunking (uses current configuration)
lil-rag reindex

# Fast search optimization (small chunks)
lil-rag reindex --chunking=fast

# Context preservation (large chunks)
lil-rag reindex --chunking=contextual

# Legacy compatibility
lil-rag reindex --chunking=legacy

# Simple word-based chunking
lil-rag reindex --chunking=fallback

# Skip confirmation prompt
lil-rag reindex --force --chunking=fast
```

### Available Chunking Strategies

#### `recursive` (default)
- **Description**: Adaptive chunking with semantic boundaries
- **Behavior**: Uses current `chunking.max-tokens` and `chunking.overlap` settings
- **Best for**: Balanced performance with semantic coherence
- **Algorithm**: Hierarchical text splitting that respects document structure

#### `fast`
- **Tokens**: 128 max, 19 overlap
- **Best for**: Precise search results, Q&A applications
- **Trade-off**: Less context per chunk, more precise retrieval

#### `contextual`
- **Tokens**: 512 max, 76 overlap
- **Best for**: Summarization, context-heavy applications
- **Trade-off**: More context per chunk, potentially less precise retrieval

#### `legacy`
- **Tokens**: 1800 max, 200 overlap
- **Best for**: Backward compatibility with older configurations
- **Trade-off**: Large chunks may reduce search precision

#### `fallback`
- **Description**: Simple word-based chunking
- **Behavior**: Uses current token settings but applies basic word splitting
- **Best for**: Troubleshooting or when semantic chunking fails

### Strategy Selection Guidelines

```bash
# For knowledge bases and FAQs
lil-rag config set chunking.max-tokens 128
lil-rag config set chunking.overlap 19
lil-rag reindex --chunking=fast

# For document summarization
lil-rag config set chunking.max-tokens 512
lil-rag config set chunking.overlap 76
lil-rag reindex --chunking=contextual

# For existing systems (pre-2025)
lil-rag reindex --chunking=legacy
```

## Command Line Overrides

All configuration options can be overridden with command line flags.
Expand Down
5 changes: 5 additions & 0 deletions pkg/lilrag/chunker.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ func (tc *TextChunker) ChunkText(text string) []Chunk {
return semanticChunks
}

// ChunkTextWithFallback is the entry point for the fallback strategy's simple
// word-based chunking. It hands the input straight to fallbackChunk, always
// passing "text" as the second argument, and returns the result unchanged.
func (tc *TextChunker) ChunkTextWithFallback(text string) []Chunk {
	chunks := tc.fallbackChunk(text, "text")
	return chunks
}

// detectContentType analyzes text to determine optimal chunking strategy
func (tc *TextChunker) detectContentType(text string) string {
codeIndicators := []string{"function", "class", "def ", "```", "import ", "#include", "var ", "let ", "const "}
Expand Down
109 changes: 109 additions & 0 deletions pkg/lilrag/chunker_strategy_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package lilrag

import (
"testing"
)

// TestTextChunker_ChunkTextWithFallback runs ChunkTextWithFallback over a table
// of inputs and checks, for each case, the number of chunks returned, the exact
// text of the first chunk, and that every chunk's TokenCount equals
// EstimateTokenCount of that chunk's own text.
func TestTextChunker_ChunkTextWithFallback(t *testing.T) {
	tests := []struct {
		name        string
		maxTokens   int
		overlap     int
		text        string
		wantChunks  int
		wantPattern string // exact text expected for the first chunk (compared with ==)
	}{
		{
			name:        "simple_text",
			maxTokens:   5,
			overlap:     1,
			text:        "This is a simple test document.",
			wantChunks:  2,
			wantPattern: "This is a simple test",
		},
		{
			name:        "single_chunk",
			maxTokens:   10,
			overlap:     2,
			text:        "Short text.",
			wantChunks:  1,
			wantPattern: "Short text.",
		},
		{
			name:        "empty_text",
			maxTokens:   5,
			overlap:     1,
			text:        "",
			wantChunks:  0,
			wantPattern: "",
		},
		{
			name:        "long_text",
			maxTokens:   3,
			overlap:     0,
			text:        "One two three four five six seven eight nine ten",
			wantChunks:  4,
			wantPattern: "One two three",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			chunker := NewTextChunker(tt.maxTokens, tt.overlap)
			chunks := chunker.ChunkTextWithFallback(tt.text)

			if len(chunks) != tt.wantChunks {
				t.Errorf("ChunkTextWithFallback() got %d chunks, want %d", len(chunks), tt.wantChunks)
			}

			// The count check above is non-fatal, so chunks may be empty here even
			// when a first chunk was expected; guard the index so a wrong count is
			// reported as a test failure rather than an index-out-of-range panic.
			if tt.wantChunks > 0 && len(chunks) > 0 && chunks[0].Text != tt.wantPattern {
				t.Errorf("ChunkTextWithFallback() first chunk = %q, want %q", chunks[0].Text, tt.wantPattern)
			}

			// Verify each chunk has correct token count
			for i, chunk := range chunks {
				expectedTokens := chunker.EstimateTokenCount(chunk.Text)
				if chunk.TokenCount != expectedTokens {
					t.Errorf("Chunk %d token count = %d, want %d", i, chunk.TokenCount, expectedTokens)
				}
			}
		})
	}
}

// Test chunker creation for different strategies
func TestCreateChunkerForStrategy(t *testing.T) {
// Create a mock LilRag with default chunker
defaultChunker := NewTextChunker(256, 38)
rag := &LilRag{chunker: defaultChunker}

tests := []struct {
strategy string
wantTokens int
wantOverlap int
}{
{"fast", 128, 19},
{"contextual", 512, 76},
{"legacy", 1800, 200},
{"fallback", 256, 38}, // Uses current settings
{"recursive", 256, 38}, // Uses current settings
{"invalid", 256, 38}, // Uses current settings (default)
}

for _, tt := range tests {
t.Run(tt.strategy, func(t *testing.T) {
chunker := rag.createChunkerForStrategy(tt.strategy)

if chunker.MaxTokens != tt.wantTokens {
t.Errorf("createChunkerForStrategy(%s) MaxTokens = %d, want %d",
tt.strategy, chunker.MaxTokens, tt.wantTokens)
}

if chunker.Overlap != tt.wantOverlap {
t.Errorf("createChunkerForStrategy(%s) Overlap = %d, want %d",
tt.strategy, chunker.Overlap, tt.wantOverlap)
}
})
}
}
Loading
Loading