Skip to content

Commit bd1a0e4

Browse files
Copilot and danielsiegl authored
Implement schema/data separation feature for cleaner Git workflows (#110)
* Implement schema/data separation feature with new CLI flags Co-authored-by: danielsiegl <41949368+danielsiegl@users.noreply.github.com> * Add comprehensive documentation and CI tests for schema/data separation Co-authored-by: danielsiegl <41949368+danielsiegl@users.noreply.github.com> * #109 removed build dependency * #109 updated README.md * Fix multi-line CREATE TABLE handling in schema separation Co-authored-by: danielsiegl <41949368+danielsiegl@users.noreply.github.com> * Fix CI/CD failures: resolve schema file auto-loading issue in smudge operation Co-authored-by: danielsiegl <41949368+danielsiegl@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: danielsiegl <41949368+danielsiegl@users.noreply.github.com> Co-authored-by: Daniel Siegl <daniel.siegl@syntevo.com>
1 parent b97cd80 commit bd1a0e4

8 files changed

Lines changed: 416 additions & 44 deletions

File tree

.github/workflows/main.yml

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,109 @@ jobs:
231231
name: sql-artifacts-windows-amd64
232232
path: '*.sql'
233233
retention-days: 5
234+
235+
test-schema-data-separation:
236+
needs: [build, create-smoketest-db]
237+
runs-on: ubuntu-latest
238+
steps:
239+
- uses: actions/checkout@v4
240+
- name: Download binaries
241+
uses: actions/download-artifact@v4
242+
with:
243+
name: gitsqlite-binaries
244+
path: bin
245+
- name: Download smoketest database
246+
uses: actions/download-artifact@v4
247+
with:
248+
name: smoketest-db
249+
path: .
250+
- name: Make Linux binary executable
251+
run: chmod +x bin/gitsqlite-linux-amd64
252+
- name: Test schema/data separation workflow
253+
run: |
254+
echo "=== Testing Schema/Data Separation Feature ==="
255+
256+
echo "Step 1: Extract data-only + schema to separate files"
257+
./bin/gitsqlite-linux-amd64 -data-only -schema-output .gitsqliteschema clean < smoketest.db > data_only.sql
258+
259+
echo "Step 2: Verify schema file was created"
260+
if [ ! -f .gitsqliteschema ]; then
261+
echo "ERROR: Schema file .gitsqliteschema was not created"
262+
exit 1
263+
fi
264+
265+
echo "Step 3: Verify data file contains only INSERT statements (and pragmas)"
266+
if ! grep -q "INSERT INTO" data_only.sql; then
267+
echo "ERROR: Data file missing INSERT statements"
268+
exit 1
269+
fi
270+
if grep -q "CREATE TABLE" data_only.sql; then
271+
echo "ERROR: Data file contains CREATE statements (should be data-only)"
272+
exit 1
273+
fi
274+
275+
echo "Step 4: Verify schema file contains only CREATE statements (and pragmas)"
276+
if ! grep -q "CREATE TABLE" .gitsqliteschema; then
277+
echo "ERROR: Schema file missing CREATE statements"
278+
exit 1
279+
fi
280+
if grep -q "INSERT INTO" .gitsqliteschema; then
281+
echo "ERROR: Schema file contains INSERT statements (should be schema-only)"
282+
exit 1
283+
fi
284+
285+
echo "Step 5: Restore database from separated files"
286+
./bin/gitsqlite-linux-amd64 -schema-file .gitsqliteschema smudge < data_only.sql > restored.db
287+
288+
echo "Step 6: Verify roundtrip integrity"
289+
./bin/gitsqlite-linux-amd64 clean < restored.db > roundtrip.sql
290+
./bin/gitsqlite-linux-amd64 clean < smoketest.db > original.sql
291+
292+
if ! diff original.sql roundtrip.sql; then
293+
echo "ERROR: Roundtrip test failed - restored database differs from original"
294+
exit 1
295+
fi
296+
297+
echo "Step 7: Test diff operation with schema/data separation"
298+
./bin/gitsqlite-linux-amd64 -data-only -schema-output schema_diff.sql diff smoketest.db > data_diff.sql
299+
300+
if ! diff data_only.sql data_diff.sql; then
301+
echo "ERROR: Clean and diff produce different data-only output"
302+
exit 1
303+
fi
304+
305+
if ! diff .gitsqliteschema schema_diff.sql; then
306+
echo "ERROR: Clean and diff produce different schema output"
307+
exit 1
308+
fi
309+
310+
echo "=== Schema/Data Separation Tests: PASSED ==="
311+
312+
echo "File sizes:"
313+
echo "Original database: $(wc -c < smoketest.db) bytes"
314+
echo "Full SQL: $(wc -c < original.sql) bytes"
315+
echo "Data-only SQL: $(wc -c < data_only.sql) bytes"
316+
echo "Schema SQL: $(wc -c < .gitsqliteschema) bytes"
317+
318+
echo "Line counts:"
319+
echo "Full SQL: $(wc -l < original.sql) lines"
320+
echo "Data-only SQL: $(wc -l < data_only.sql) lines"
321+
echo "Schema SQL: $(wc -l < .gitsqliteschema) lines"
322+
323+
- name: Upload schema/data artifacts
324+
uses: actions/upload-artifact@v4
325+
with:
326+
name: schema-data-separation-artifacts
327+
path: |
328+
.gitsqliteschema
329+
data_only.sql
330+
schema_diff.sql
331+
data_diff.sql
332+
restored.db
333+
roundtrip.sql
334+
original.sql
335+
retention-days: 5
336+
234337
cross-evaluate:
235338
needs: [test-ubuntu-amd64, test-windows-amd64]
236339
runs-on: ubuntu-latest

README.md

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,31 @@ There are several benefits over [using sqlite3 .dump directly](https://garrit.xy
6969

7070
Git will automatically convert SQLite files to SQL text for storage and back to binary when checked out.
7171

72+
## Quick Start: Schema/Data Separation
73+
74+
For cleaner diffs that only show data changes, use the schema/data separation feature:
75+
76+
1. **Configure Git filters for data-only mode**:
77+
```bash
78+
echo '*.db filter=gitsqlite-data' >> .gitattributes
79+
git config filter.gitsqlite-data.clean "gitsqlite -data-only -schema-output .gitsqliteschema clean"
80+
git config filter.gitsqlite-data.smudge "gitsqlite -schema-file .gitsqliteschema smudge"
81+
```
82+
83+
2. **Add schema file to Git**:
84+
```bash
85+
git add .gitsqliteschema
86+
git commit -m "Add database schema"
87+
```
88+
89+
3. **Version your database**:
90+
```bash
91+
git add mydb.db
92+
git commit -m "Add database data"
93+
```
94+
95+
Git will now store only data changes in the database file, while schema is managed separately. This results in much cleaner diffs that only show INSERT operations.
96+
7297
## Quick Start Git Diff
7398

7499
To enable SQL-based diffs for SQLite databases in Git, add the following to your repository's `.gitattributes` and configure your Git diff driver: (It doesn't matter if it is stored as binary or via smudge/clean.)
@@ -210,6 +235,25 @@ See [CLI Parameters](#cli-parameters) for all available options.
210235
gitsqlite -help
211236
```
212237

238+
### Schema/Data Separation Options
239+
**`-data-only`** - For clean/diff: output only data (INSERT statements), no schema
240+
```bash
241+
gitsqlite -data-only clean < database.db > data.sql
242+
gitsqlite -data-only diff database.db > data.sql
243+
```
244+
245+
**`-schema-output <file>`** - Save schema to this file during clean/diff (default: do not save schema separately)
246+
```bash
247+
gitsqlite -schema-output .gitsqliteschema clean < database.db > data.sql
248+
gitsqlite -schema-output schema.sql diff database.db > data.sql
249+
```
250+
251+
**`-schema-file <file>`** - For smudge: read schema from this file instead of stdin (default: ".gitsqliteschema")
252+
```bash
253+
gitsqlite -schema-file .gitsqliteschema smudge < data.sql > database.db
254+
gitsqlite -schema-file custom_schema.sql smudge < data.sql > database.db
255+
```
256+
213257
## Examples
214258

215259
### Quick Start Example
@@ -257,6 +301,47 @@ gitsqlite smudge < sample.sql > restored.db
257301
sqlite3 restored.db "SELECT * FROM users;"
258302
```
259303

304+
### Schema/Data Separation Workflow
305+
306+
The schema/data separation feature allows you to store database schema and data separately for cleaner Git workflows and easier diff viewing.
307+
308+
1. **Separate schema and data during clean:**
309+
```bash
310+
# Extract data-only (INSERT statements) and save schema to separate file
311+
gitsqlite -data-only -schema-output .gitsqliteschema clean < database.db > data.sql
312+
```
313+
314+
2. **View the separated files:**
315+
```bash
316+
# Schema file contains CREATE TABLE statements
317+
cat .gitsqliteschema
318+
# PRAGMA foreign_keys=OFF;
319+
# BEGIN TRANSACTION;
320+
# CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT, email TEXT);
321+
# COMMIT;
322+
323+
# Data file contains INSERT statements
324+
cat data.sql
325+
# PRAGMA foreign_keys=OFF;
326+
# BEGIN TRANSACTION;
327+
# INSERT INTO users VALUES(1,'John Doe','john@example.com');
328+
# INSERT INTO users VALUES(2,'Jane Smith','jane@example.com');
329+
# COMMIT;
330+
```
331+
332+
3. **Restore database from separated files:**
333+
```bash
334+
# Combine schema and data back into database
335+
gitsqlite -schema-file .gitsqliteschema smudge < data.sql > restored.db
336+
```
337+
338+
4. **Benefit: Cleaner diffs that show only data changes:**
339+
```bash
340+
# After modifying data, the diff shows only the INSERT lines that were added, removed, or changed
341+
gitsqlite -data-only clean < modified.db > modified_data.sql
342+
diff data.sql modified_data.sql
343+
```
344+
260345
### Advanced Usage Examples
261346

262347
**With custom SQLite path:**

internal/filters/clean.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ import (
1414
// Clean reads a binary SQLite DB from 'in', dumps SQL via sqlite engine using
1515
// selective table dumping to exclude sqlite_sequence, and writes SQL to 'out'.
1616
// It uses a temporary file for robustness; pipelining would be more efficient, but it has to survive ~500 MB files.
17-
func Clean(ctx context.Context, eng *sqlite.Engine, in io.Reader, out io.Writer, floatPrecision int) error {
17+
// If dataOnly is true, only data (INSERT statements) are output to 'out'.
18+
// If schemaOutput is not empty, schema is saved to that file.
19+
func Clean(ctx context.Context, eng *sqlite.Engine, in io.Reader, out io.Writer, floatPrecision int, dataOnly bool, schemaOutput string) error {
1820
startTime := time.Now()
1921
slog.Info("Starting clean operation")
2022

@@ -48,9 +50,25 @@ func Clean(ctx context.Context, eng *sqlite.Engine, in io.Reader, out io.Writer,
4850

4951
slog.Info("Starting SQLite selective dump", "dbPath", tmp.Name())
5052

53+
// Save schema to separate file if requested
54+
if schemaOutput != "" {
55+
schemaFile, err := os.Create(schemaOutput)
56+
if err != nil {
57+
slog.Error("Failed to create schema output file", "file", schemaOutput, "error", err)
58+
return err
59+
}
60+
defer schemaFile.Close()
61+
62+
if err := DumpSchema(dumpCtx, eng, tmp.Name(), schemaFile); err != nil {
63+
slog.Error("Schema dump failed", "error", err)
64+
return err
65+
}
66+
slog.Info("Schema saved to file", "file", schemaOutput)
67+
}
68+
5169
// Use the new selective dumping method that excludes sqlite_sequence natively
5270
// This now uses the logical filtering function from the filters package
53-
if err := DumpTables(dumpCtx, eng, tmp.Name(), out, floatPrecision); err != nil {
71+
if err := DumpTables(dumpCtx, eng, tmp.Name(), out, floatPrecision, dataOnly); err != nil {
5472
slog.Error("SQLite selective dump failed", "error", err)
5573
return err
5674
}

internal/filters/diff.go

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,54 +2,44 @@ package filters
22

33
import (
44
"context"
5-
"fmt"
65
"io"
76
"log/slog"
8-
"os/exec"
9-
"strings"
7+
"os"
108
"time"
119

1210
"github.com/danielsiegl/gitsqlite/internal/sqlite"
1311
)
1412

1513
// Diff dumps the SQLite database file 'dbFile' as SQL (via DumpTables) and writes it to 'out'.
1614
// Diff itself creates no temp file; dbFile is handed to DumpTables by path and the SQL is written to 'out'.
17-
func Diff(ctx context.Context, eng *sqlite.Engine, dbFile string, out io.Writer) error {
15+
// If dataOnly is true, only data (INSERT statements) are output.
16+
// If schemaOutput is not empty, schema is saved to that file.
17+
func Diff(ctx context.Context, eng *sqlite.Engine, dbFile string, out io.Writer, dataOnly bool, schemaOutput string) error {
1818
startTime := time.Now()
1919
slog.Info("Starting diff operation")
2020

21-
binaryPath, err := eng.GetBinPath()
22-
if err != nil {
23-
slog.Error("Failed to get sqlite3 binary", "error", err)
24-
return err
25-
}
26-
27-
cmd := exec.CommandContext(ctx, binaryPath, dbFile, ".dump")
28-
stdoutPipe, err := cmd.StdoutPipe()
29-
if err != nil {
30-
slog.Error("Failed to get stdout pipe", "error", err)
31-
return err
21+
// Save schema to separate file if requested
22+
if schemaOutput != "" {
23+
schemaFile, err := os.Create(schemaOutput)
24+
if err != nil {
25+
slog.Error("Failed to create schema output file", "file", schemaOutput, "error", err)
26+
return err
27+
}
28+
defer schemaFile.Close()
29+
30+
if err := DumpSchema(ctx, eng, dbFile, schemaFile); err != nil {
31+
slog.Error("Schema dump failed", "error", err)
32+
return err
33+
}
34+
slog.Info("Schema saved to file", "file", schemaOutput)
3235
}
33-
var stderr strings.Builder
34-
cmd.Stderr = &stderr
3536

36-
if err := cmd.Start(); err != nil {
37-
slog.Error("Failed to start sqlite3 diff", "error", err)
37+
// For data output, use DumpTables with filtering
38+
if err := DumpTables(ctx, eng, dbFile, out, 9, dataOnly); err != nil {
39+
slog.Error("Diff dump failed", "error", err)
3840
return err
3941
}
4042

41-
if _, err := io.Copy(out, stdoutPipe); err != nil {
42-
return fmt.Errorf("error copying diff output: %w", err)
43-
}
44-
45-
if err := cmd.Wait(); err != nil {
46-
stderrOutput := stderr.String()
47-
if stderrOutput != "" {
48-
return fmt.Errorf("sqlite3 diff failed: %s: %w", stderrOutput, err)
49-
}
50-
return fmt.Errorf("sqlite3 diff failed: %w", err)
51-
}
52-
5343
slog.Info("Diff operation completed", "duration", time.Since(startTime))
5444
return nil
5545
}

0 commit comments

Comments
 (0)