Skip to content

Commit 5a4698d

Browse files
authored
Merge pull request #5 from sigman78/sigman/storage-if
Storage interface for FS access
2 parents 5bd7707 + 102e4c6 commit 5a4698d

5 files changed

Lines changed: 120 additions & 69 deletions

File tree

internal/wayback/css.go

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@ package wayback
22

33
import (
44
"net/url"
5-
"os"
65
"path/filepath"
76
"regexp"
87
"strings"
@@ -92,12 +91,12 @@ func RewriteCSSContent(css, pageURL string, cfg *Config, idx *SnapshotIndex) str
9291
return css
9392
}
9493

95-
// RewriteCSSFile reads a CSS file, rewrites its URLs, and writes it back.
96-
func RewriteCSSFile(filePath, pageURL string, cfg *Config, idx *SnapshotIndex) error {
97-
data, err := os.ReadFile(filePath) //nolint:gosec // G304: path is written by this program
94+
// RewriteCSSFile reads a CSS file from storage, rewrites its URLs, and writes it back.
95+
func RewriteCSSFile(store Storage, logicalPath, pageURL string, cfg *Config, idx *SnapshotIndex) error {
96+
data, err := store.Get(logicalPath)
9897
if err != nil {
9998
return err
10099
}
101100
rewritten := RewriteCSSContent(string(data), pageURL, cfg, idx)
102-
return os.WriteFile(filePath, []byte(rewritten), 0600)
101+
return store.PutBytes(logicalPath, []byte(rewritten))
103102
}

internal/wayback/downloader.go

Lines changed: 20 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,12 @@
11
package wayback
22

33
import (
4+
"bytes"
45
"context"
56
"fmt"
67
"io"
78
"log"
89
"net/http"
9-
"os"
10-
"path/filepath"
1110
"strings"
1211
"sync/atomic"
1312
"time"
@@ -33,8 +32,9 @@ type Config struct {
3332
DownloadExternalAssets bool
3433
Debug bool
3534
StopOnError bool
36-
CDXRatePerMin int // CDX API requests per minute (default 60)
37-
CDXMaxRetries int // max retry attempts on throttle/5xx (default 5)
35+
CDXRatePerMin int // CDX API requests per minute (default 60)
36+
CDXMaxRetries int // max retry attempts on throttle/5xx (default 5)
37+
Storage Storage // if nil, NewLocalStorage(Directory) is used
3838
}
3939

4040
var downloadHTTPClient = &http.Client{
@@ -69,8 +69,9 @@ func DownloadAll(cfg *Config) error {
6969
fmt.Printf("Found %d unique snapshots to download.\n", total)
7070
}
7171

72-
if err := os.MkdirAll(cfg.Directory, 0750); err != nil {
73-
return fmt.Errorf("create output dir: %w", err)
72+
store := cfg.Storage
73+
if store == nil {
74+
store = NewLocalStorage(cfg.Directory)
7475
}
7576

7677
pool, err := ants.NewPool(cfg.Threads)
@@ -91,7 +92,7 @@ func DownloadAll(cfg *Config) error {
9192
}
9293
errCh := make(chan error, 1)
9394
if err := pool.Submit(func() {
94-
errCh <- downloadOne(ctx, s, cfg, idx, dlProg)
95+
errCh <- downloadOne(ctx, s, cfg, store, idx, dlProg)
9596
}); err != nil {
9697
return fmt.Errorf("submit task: %w", err)
9798
}
@@ -119,17 +120,16 @@ func DownloadAll(cfg *Config) error {
119120
}
120121

121122
// downloadOne downloads a single snapshot and optionally rewrites its links.
122-
func downloadOne(ctx context.Context, snap Snapshot, cfg *Config, idx *SnapshotIndex, dlProg *Progress) error {
123+
func downloadOne(ctx context.Context, snap Snapshot, cfg *Config, store Storage, idx *SnapshotIndex, dlProg *Progress) error {
123124

124125
if ctx.Err() != nil {
125126
return ctx.Err()
126127
}
127128

128-
localPath := URLToLocalPath(snap.FileURL, cfg.PrettyPath)
129-
localPath = filepath.Join(cfg.Directory, filepath.FromSlash(localPath))
129+
logicalPath := URLToLocalPath(snap.FileURL, cfg.PrettyPath)
130130

131131
// Skip existing files
132-
if _, err := os.Stat(localPath); err == nil {
132+
if store.Exists(logicalPath) {
133133
dlProg.Inc()
134134
return nil
135135
}
@@ -160,53 +160,27 @@ func downloadOne(ctx context.Context, snap Snapshot, cfg *Config, idx *SnapshotI
160160
return fmt.Errorf("HTTP %d for %s", resp.StatusCode, waybackURL)
161161
}
162162

163-
// Ensure parent directory exists
164-
if err := os.MkdirAll(filepath.Dir(localPath), 0750); err != nil {
165-
return fmt.Errorf("mkdirall: %w", err)
166-
}
167-
168-
// Stream to temp file, then rename atomically
169-
tmpFile, err := os.CreateTemp(filepath.Dir(localPath), ".wbdl-*")
170-
if err != nil {
171-
return fmt.Errorf("create temp: %w", err)
172-
}
173-
tmpName := tmpFile.Name()
174-
defer func() {
175-
_ = tmpFile.Close()
176-
_ = os.Remove(tmpName) // no-op if renamed
177-
}()
178-
179-
// Read first 512 bytes for content sniffing
163+
// Read first 512 bytes for content sniffing, then stream remainder via storage
180164
first := make([]byte, 512)
181165
n, _ := io.ReadFull(resp.Body, first)
182166
first = first[:n]
183167

184-
if _, err := tmpFile.Write(first); err != nil {
185-
return fmt.Errorf("write first bytes: %w", err)
186-
}
187-
if _, err := io.Copy(tmpFile, resp.Body); err != nil {
188-
return fmt.Errorf("write body: %w", err)
189-
}
190-
if err := tmpFile.Close(); err != nil {
191-
return fmt.Errorf("close temp: %w", err)
192-
}
193-
194-
if err := os.Rename(tmpName, localPath); err != nil { //nolint:gosec // G703: localPath is sanitized by URLToLocalPath
195-
return fmt.Errorf("rename: %w", err)
168+
if err := store.Put(logicalPath, io.MultiReader(bytes.NewReader(first), resp.Body)); err != nil {
169+
return fmt.Errorf("store: %w", err)
196170
}
197171

198172
// Post-process HTML / CSS
199173
if cfg.RewriteLinks {
200174
ct := resp.Header.Get("Content-Type")
201175
fileURL := snap.FileURL
202176

203-
if IsHTMLFile(localPath, ct, first) {
204-
if err := ProcessHTML(localPath, fileURL, cfg, idx); err != nil && cfg.Debug {
205-
log.Printf("html rewrite %s: %v", localPath, err)
177+
if IsHTMLFile(logicalPath, ct, first) {
178+
if err := ProcessHTML(store, logicalPath, fileURL, cfg, idx); err != nil && cfg.Debug {
179+
log.Printf("html rewrite %s: %v", logicalPath, err)
206180
}
207-
} else if IsCSSResource(localPath, ct) {
208-
if err := RewriteCSSFile(localPath, fileURL, cfg, idx); err != nil && cfg.Debug {
209-
log.Printf("css rewrite %s: %v", localPath, err)
181+
} else if IsCSSResource(logicalPath, ct) {
182+
if err := RewriteCSSFile(store, logicalPath, fileURL, cfg, idx); err != nil && cfg.Debug {
183+
log.Printf("css rewrite %s: %v", logicalPath, err)
210184
}
211185
}
212186
}

internal/wayback/html.go

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,16 +3,15 @@ package wayback
33
import (
44
"bytes"
55
"net/url"
6-
"os"
76
"path/filepath"
87
"strings"
98

109
"golang.org/x/net/html"
1110
)
1211

1312
// ProcessHTML rewrites links and canonical tags in an HTML file.
14-
func ProcessHTML(filePath, pageURL string, cfg *Config, idx *SnapshotIndex) error {
15-
data, err := os.ReadFile(filePath) //nolint:gosec // G304: path is written by this program
13+
func ProcessHTML(store Storage, logicalPath, pageURL string, cfg *Config, idx *SnapshotIndex) error {
14+
data, err := store.Get(logicalPath)
1615
if err != nil {
1716
return err
1817
}
@@ -28,7 +27,7 @@ func ProcessHTML(filePath, pageURL string, cfg *Config, idx *SnapshotIndex) erro
2827
}
2928

3029
// Relative directory of the output file (used for RelativeLink)
31-
localDir := ToPosix(filepath.ToSlash(filepath.Dir(filePath)))
30+
localDir := ToPosix(filepath.ToSlash(filepath.Dir(filepath.Join(cfg.Directory, filepath.FromSlash(logicalPath)))))
3231

3332
var walk func(*html.Node)
3433
walk = func(n *html.Node) {
@@ -75,7 +74,7 @@ func ProcessHTML(filePath, pageURL string, cfg *Config, idx *SnapshotIndex) erro
7574
if err := html.Render(&buf, doc); err != nil {
7675
return err
7776
}
78-
return os.WriteFile(filePath, buf.Bytes(), 0600)
77+
return store.PutBytes(logicalPath, buf.Bytes())
7978
}
8079

8180
// attrName returns the relevant URL attribute for a given tag name.

internal/wayback/html_test.go

Lines changed: 7 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,35 +1,29 @@
11
package wayback
22

33
import (
4-
"os"
5-
"path/filepath"
64
"strings"
75
"testing"
86
)
97

10-
// processHTMLInTemp writes htmlContent to a temp file at <dir>/index.html,
11-
// sets cfg.Directory = dir so that relative-link math is consistent, then
12-
// runs ProcessHTML and returns the rewritten file contents.
8+
// processHTMLInTemp writes htmlContent into a LocalStorage backed by a temp
9+
// directory, runs ProcessHTML, and returns the rewritten file contents.
1310
func processHTMLInTemp(t *testing.T, htmlContent, pageURL string, cfg *Config) string {
1411
t.Helper()
15-
dir := t.TempDir()
16-
cfg.Directory = dir
17-
18-
filePath := filepath.Join(dir, "index.html")
19-
if err := os.WriteFile(filePath, []byte(htmlContent), 0600); err != nil {
12+
store := NewLocalStorage(t.TempDir())
13+
if err := store.PutBytes("test.html", []byte(htmlContent)); err != nil {
2014
t.Fatalf("write test HTML: %v", err)
2115
}
2216

2317
idx := NewSnapshotIndex()
24-
if err := ProcessHTML(filePath, pageURL, cfg, idx); err != nil {
18+
if err := ProcessHTML(store, "test.html", pageURL, cfg, idx); err != nil {
2519
t.Fatalf("ProcessHTML: %v", err)
2620
}
2721

28-
out, err := os.ReadFile(filePath)
22+
got, err := store.Get("test.html")
2923
if err != nil {
3024
t.Fatalf("read result: %v", err)
3125
}
32-
return string(out)
26+
return string(got)
3327
}
3428

3529
func testHTMLCfg() *Config {

internal/wayback/storage.go

Lines changed: 85 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,85 @@
1+
package wayback
2+
3+
import (
4+
"io"
5+
"os"
6+
"path/filepath"
7+
)
8+
9+
// Storage is the seam through which downloaded snapshot files are read and
// written.
//
// Every method addresses content by logical path: a relative, forward-slash
// path as returned by URLToLocalPath (e.g. "example.com/page/index.html").
// Where that content physically lives (OS directory, zip archive, memory
// map, …) is the implementation's business.
type Storage interface {
	// Get returns the full content of path.
	Get(path string) ([]byte, error)

	// Exists reports whether the logical path already has content.
	Exists(path string) bool

	// Put writes everything read from r to path. The write is atomic:
	// concurrent readers never observe a partially written file.
	Put(path string, r io.Reader) error

	// PutBytes writes data to path; it is the in-memory convenience form
	// of Put.
	PutBytes(path string, data []byte) error
}
24+
25+
// LocalStorage is the default Storage implementation that mirrors the
// logical layout into a root directory on the OS filesystem.
type LocalStorage struct {
	rootDir string // directory under which every logical path is resolved
}

// NewLocalStorage returns a LocalStorage rooted at dir.
// The constructor does not touch the filesystem; the root directory (and any
// intermediate directories) are created lazily by Put/PutBytes.
func NewLocalStorage(dir string) *LocalStorage {
	return &LocalStorage{rootDir: dir}
}

// abs converts a logical forward-slash path to an OS path under rootDir.
// The result is absolute only if rootDir itself is absolute.
func (s *LocalStorage) abs(path string) string {
	return filepath.Join(s.rootDir, filepath.FromSlash(path))
}

// Exists reports whether path already exists in storage.
// Any os.Stat failure (not only "not found") is reported as false.
func (s *LocalStorage) Exists(path string) bool {
	_, err := os.Stat(s.abs(path))
	return err == nil
}

// Put streams r into path atomically via a temp file + rename, creating
// parent directories as needed. If reading from r or writing fails, the
// target is left untouched and the temp file is removed.
func (s *LocalStorage) Put(path string, r io.Reader) error {
	return s.writeAtomic(path, func(f *os.File) error {
		_, err := io.Copy(f, r)
		return err
	})
}

// Get returns the full content of path.
func (s *LocalStorage) Get(path string) ([]byte, error) {
	return os.ReadFile(s.abs(path)) //nolint:gosec // G304: path is written by this program
}

// PutBytes writes data to path, creating parent directories as needed.
// It gives the same atomic temp-file + rename guarantee as Put, so rewriting
// an existing file in place never exposes a truncated version to readers and
// never leaves one behind for Exists to report as complete.
func (s *LocalStorage) PutBytes(path string, data []byte) error {
	return s.writeAtomic(path, func(f *os.File) error {
		_, err := f.Write(data)
		return err
	})
}

// writeAtomic is the shared write path for Put and PutBytes. It creates the
// parent directory, lets fill write the content into a temp file in that same
// directory (so the rename stays on one filesystem), then renames the temp
// file over the target. The temp file is removed only on a failure path; after
// a successful rename the temp name no longer belongs to this call.
func (s *LocalStorage) writeAtomic(path string, fill func(f *os.File) error) error {
	fullPath := s.abs(path)
	dir := filepath.Dir(fullPath)
	if err := os.MkdirAll(dir, 0750); err != nil {
		return err
	}

	tmpFile, err := os.CreateTemp(dir, ".wbdl-*")
	if err != nil {
		return err
	}
	tmpName := tmpFile.Name()

	if err := fill(tmpFile); err != nil {
		// Best-effort cleanup; the write error is the one worth reporting.
		_ = tmpFile.Close()
		_ = os.Remove(tmpName)
		return err
	}
	if err := tmpFile.Close(); err != nil {
		_ = os.Remove(tmpName)
		return err
	}
	if err := os.Rename(tmpName, fullPath); err != nil { //nolint:gosec // G703: fullPath is sanitized by URLToLocalPath
		_ = os.Remove(tmpName)
		return err
	}
	return nil
}

0 commit comments

Comments
 (0)