diff --git a/agent/agentserver/server.go b/agent/agentserver/server.go index 93376021b..038c916ae 100644 --- a/agent/agentserver/server.go +++ b/agent/agentserver/server.go @@ -43,6 +43,15 @@ import ( type Config struct { // How long a successful readiness check is valid for. If 0, disable caching successful readiness. readinessCacheTTL time.Duration `yaml:"readiness_cache_ttl"` + // Timeout configurations + DownloadTimeout time.Duration `yaml:"download_timeout"` +} + +func (c Config) applyDefaults() Config { + if c.DownloadTimeout == 0 { + c.DownloadTimeout = 15 * time.Minute + } + return c } // Server defines the agent HTTP server. @@ -65,7 +74,9 @@ func New( sched scheduler.ReloadableScheduler, tags tagclient.Client, ac announceclient.Client, - containerRuntime containerruntime.Factory) *Server { + containerRuntime containerruntime.Factory, +) *Server { + config = config.applyDefaults() stats = stats.Tagged(map[string]string{ "module": "agentserver", diff --git a/agent/cmd/cmd.go b/agent/cmd/cmd.go index 09a61325b..fe7052506 100644 --- a/agent/cmd/cmd.go +++ b/agent/cmd/cmd.go @@ -243,13 +243,14 @@ func Run(flags *Flags, opts ...Option) { go heartbeat(stats) log.Fatal(nginx.Run(config.Nginx, map[string]interface{}{ - "allowed_cidrs": config.AllowedCidrs, - "port": flags.AgentRegistryPort, - "registry_server": nginx.GetServer( - config.Registry.Docker.HTTP.Net, config.Registry.Docker.HTTP.Addr), + "allowed_cidrs": config.AllowedCidrs, + "port": flags.AgentRegistryPort, + "registry_server": nginx.GetServer(config.Registry.Docker.HTTP.Net, config.Registry.Docker.HTTP.Addr), "agent_server": fmt.Sprintf("127.0.0.1:%d", flags.AgentServerPort), - "registry_backup": config.RegistryBackup}, - nginx.WithTLS(config.TLS))) + "registry_backup": config.RegistryBackup, + // Pass timeout parameters from agent server config + "download_timeout": nginx.FormatDurationForNginx(config.AgentServer.DownloadTimeout), + }, nginx.WithTLS(config.TLS))) } // heartbeat periodically emits a 
counter metric which allows us to monitor the diff --git a/config/agent/base.yaml b/config/agent/base.yaml index 69c183888..bbe613477 100644 --- a/config/agent/base.yaml +++ b/config/agent/base.yaml @@ -58,6 +58,10 @@ registry: peer_id_factory: addr_hash +agentserver: + # Timeout configurations (also used by nginx, which adds a 30s buffer) + download_timeout: 15m # nginx proxy_read_timeout/proxy_send_timeout for downloads + # Allow agent to only serve localhost and Docker default bridge requests. allowed_cidrs: - 127.0.0.1 diff --git a/config/origin/base.yaml b/config/origin/base.yaml index 9e98701d9..356f81039 100644 --- a/config/origin/base.yaml +++ b/config/origin/base.yaml @@ -49,6 +49,12 @@ blobserver: net: unix addr: /tmp/kraken-origin.sock + # Timeout configurations (also used by nginx, which adds a 30s buffer) + download_timeout: 15m # nginx proxy_read_timeout for downloads + upload_timeout: 10m # nginx proxy_read_timeout/proxy_send_timeout for uploads + replication_timeout: 3m # nginx proxy_read_timeout/proxy_send_timeout for replication operations + backend_timeout: 2m # nginx proxy_connect_timeout + nginx: name: kraken-origin cache_dir: /var/cache/kraken/kraken-origin/nginx/ diff --git a/config/tracker/base.yaml b/config/tracker/base.yaml index ddeb07c45..973e4f54c 100644 --- a/config/tracker/base.yaml +++ b/config/tracker/base.yaml @@ -41,6 +41,9 @@ trackerserver: net: unix addr: /tmp/kraken-tracker.sock + # Timeout configurations (also used by nginx, which adds a 30s buffer) + metainfo_timeout: 2m # nginx proxy_read_timeout/proxy_send_timeout for metainfo requests to origins + nginx: name: kraken-tracker cache_dir: /var/cache/kraken/kraken-tracker/nginx/ diff --git a/nginx/config/agent.go b/nginx/config/agent.go index a2a338f95..c50f1ba8d 100644 --- a/nginx/config/agent.go +++ b/nginx/config/agent.go @@ -42,9 +42,22 @@ server { {{healthEndpoint "agent-server"}} + # Download operations + location ~ ^/namespace/.*/blobs/ { + proxy_pass http://agent-server; + + # Use download timeout for blob operations + proxy_read_timeout {{.download_timeout}}; + proxy_send_timeout {{.download_timeout}}; + } +
location / { proxy_pass http://registry-backend; proxy_next_upstream error timeout http_404 http_500; + + # Standard timeouts for registry operations + proxy_read_timeout {{.download_timeout}}; + proxy_send_timeout {{.download_timeout}}; } } ` diff --git a/nginx/config/origin.go b/nginx/config/origin.go index 00d89850b..84d2f56d8 100644 --- a/nginx/config/origin.go +++ b/nginx/config/origin.go @@ -30,8 +30,65 @@ server { {{healthEndpoint .server}} + # Timeout configurations from origin server config + proxy_connect_timeout {{.backend_timeout}}; + proxy_send_timeout {{.upload_timeout}}; + proxy_read_timeout {{.download_timeout}}; + + # Disable buffering for large blob transfers + # + # proxy_buffering off: Stream responses directly from upstream to client + # instead of buffering entire response in nginx memory/disk. Critical for + # large container image layers (multi-GB) to avoid memory exhaustion and + # provide immediate streaming to clients. + # + # proxy_request_buffering off: Stream request body directly to upstream + # instead of buffering entire request. Enables immediate upload streaming + # for large image pushes without requiring disk space for temporary files. + # + # Without these settings, nginx would buffer entire blobs before forwarding, + # causing high memory usage, storage requirements, and delayed transfers. 
+ proxy_buffering off; + proxy_request_buffering off; + location / { proxy_pass http://{{.server}}; + + # Pass original client info + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Special handling for upload operations with longer timeout + location ~ ^/namespace/.*/blobs/.*/uploads { + proxy_pass http://{{.server}}; + + # Use upload timeout for these operations + proxy_read_timeout {{.upload_timeout}}; + proxy_send_timeout {{.upload_timeout}}; + + # Pass original client info + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Replication operations with their own timeout + location ~ ^/namespace/.*/blobs/.*/remote { + proxy_pass http://{{.server}}; + + # Use replication timeout for these operations + proxy_read_timeout {{.replication_timeout}}; + proxy_send_timeout {{.replication_timeout}}; + + # Pass original client info + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; } } ` diff --git a/nginx/config/tracker.go b/nginx/config/tracker.go index 5bd2eede6..651075e3b 100644 --- a/nginx/config/tracker.go +++ b/nginx/config/tracker.go @@ -43,6 +43,10 @@ server { proxy_cache_valid 200 5m; proxy_cache_valid any 1s; proxy_cache_lock on; + + # Use metainfo timeout for these operations + proxy_read_timeout {{.metainfo_timeout}}; + proxy_send_timeout {{.metainfo_timeout}}; } } ` diff --git a/nginx/nginx.go b/nginx/nginx.go index fe46d866c..352a4b014 100644 --- a/nginx/nginx.go +++ b/nginx/nginx.go @@ -22,6 +22,7 @@ import ( "path" "path/filepath" "text/template" + "time" "github.com/uber/kraken/nginx/config" "github.com/uber/kraken/utils/httputil" @@ -261,3 
+262,33 @@ func GetServer(net, addr string) string { } return addr } + +// FormatDurationForNginx converts a Go time.Duration to an nginx-compatible timeout string. +// +// This function adds a 30-second buffer to the input duration to ensure that the Go server +// times out before nginx does. This approach provides better observability and error handling +// because the Go application can return structured error responses with proper HTTP status codes, +// rather than nginx returning generic 504 Gateway Timeout errors. +// +// Timeout Strategy: +// - Go server timeout: d (original duration) +// - Nginx timeout: d + 30s (buffered duration) +// - This ensures Go responds with proper errors before nginx cuts the connection +// +// Format: Always returns whole seconds (e.g., "60s", "150s", "3600s"); any fractional second is truncated. +// Nginx accepts both seconds ("60s") and minutes ("1m") formats, so this approach works universally. +// +// Examples: +// +// FormatDurationForNginx(5 * time.Minute) // "330s" (5m + 30s = 330s) +// FormatDurationForNginx(2 * time.Minute) // "150s" (2m + 30s = 150s) +// FormatDurationForNginx(30 * time.Second) // "60s" (30s + 30s = 60s) +// FormatDurationForNginx(10 * time.Second) // "40s" (10s + 30s = 40s) +// FormatDurationForNginx(500 * time.Millisecond) // "30s" (500ms + 30s = 30.5s → 30s) +// +// Note: The buffer is added unconditionally, so a zero (unset) duration yields "30s".
+func FormatDurationForNginx(d time.Duration) string { + bufferedDuration := d + (30 * time.Second) + seconds := int(bufferedDuration.Seconds()) + return fmt.Sprintf("%ds", seconds) +} diff --git a/origin/blobserver/config.go b/origin/blobserver/config.go index 1fda79806..9c51a38e7 100644 --- a/origin/blobserver/config.go +++ b/origin/blobserver/config.go @@ -23,11 +23,27 @@ import ( type Config struct { Listener listener.Config `yaml:"listener"` DuplicateWriteBackStagger time.Duration `yaml:"duplicate_write_back_stagger"` + DownloadTimeout time.Duration `yaml:"download_timeout"` + UploadTimeout time.Duration `yaml:"upload_timeout"` + ReplicationTimeout time.Duration `yaml:"replication_timeout"` + BackendTimeout time.Duration `yaml:"backend_timeout"` } func (c Config) applyDefaults() Config { if c.DuplicateWriteBackStagger == 0 { c.DuplicateWriteBackStagger = 30 * time.Minute } + if c.DownloadTimeout == 0 { + c.DownloadTimeout = 15 * time.Minute + } + if c.UploadTimeout == 0 { + c.UploadTimeout = 10 * time.Minute + } + if c.ReplicationTimeout == 0 { + c.ReplicationTimeout = 3 * time.Minute + } + if c.BackendTimeout == 0 { + c.BackendTimeout = 2 * time.Minute + } return c } diff --git a/origin/cmd/cmd.go b/origin/cmd/cmd.go index cb0c3f80e..6645ed5bd 100644 --- a/origin/cmd/cmd.go +++ b/origin/cmd/cmd.go @@ -284,8 +284,12 @@ func Run(flags *Flags, opts ...Option) { log.Fatal(nginx.Run( config.Nginx, map[string]interface{}{ - "port": flags.BlobServerPort, - "server": nginx.GetServer(config.BlobServer.Listener.Net, config.BlobServer.Listener.Addr), + "port": flags.BlobServerPort, + "server": nginx.GetServer(config.BlobServer.Listener.Net, config.BlobServer.Listener.Addr), + "download_timeout": nginx.FormatDurationForNginx(config.BlobServer.DownloadTimeout), + "upload_timeout": nginx.FormatDurationForNginx(config.BlobServer.UploadTimeout), + "backend_timeout": nginx.FormatDurationForNginx(config.BlobServer.BackendTimeout), + "replication_timeout": 
nginx.FormatDurationForNginx(config.BlobServer.ReplicationTimeout), }, nginx.WithTLS(config.TLS))) } diff --git a/tracker/cmd/cmd.go b/tracker/cmd/cmd.go index 270a9cb3c..bb87f464c 100644 --- a/tracker/cmd/cmd.go +++ b/tracker/cmd/cmd.go @@ -148,16 +148,15 @@ func Run(flags *Flags, opts ...Option) { r := blobclient.NewClientResolver(blobclient.NewProvider(blobclient.WithTLS(tls)), origins) originCluster := blobclient.NewClusterClient(r) - server := trackerserver.New( - config.TrackerServer, stats, policy, peerStore, originStore, originCluster) + server := trackerserver.New(config.TrackerServer, stats, policy, peerStore, originStore, originCluster) go func() { log.Fatal(server.ListenAndServe()) }() log.Info("Starting nginx...") log.Fatal(nginx.Run(config.Nginx, map[string]interface{}{ - "port": flags.Port, - "server": nginx.GetServer( - config.TrackerServer.Listener.Net, config.TrackerServer.Listener.Addr)}, - nginx.WithTLS(config.TLS))) + "port": flags.Port, + "server": nginx.GetServer(config.TrackerServer.Listener.Net, config.TrackerServer.Listener.Addr), + "metainfo_timeout": nginx.FormatDurationForNginx(config.TrackerServer.MetaInfoTimeout), + }, nginx.WithTLS(config.TLS))) } diff --git a/tracker/trackerserver/config.go b/tracker/trackerserver/config.go index d163310c1..5fada7385 100644 --- a/tracker/trackerserver/config.go +++ b/tracker/trackerserver/config.go @@ -29,7 +29,8 @@ type Config struct { AnnounceInterval time.Duration `yaml:"announce_interval"` - Listener listener.Config `yaml:"listener"` + Listener listener.Config `yaml:"listener"` + MetaInfoTimeout time.Duration `yaml:"metainfo_timeout"` // Timeout for metainfo requests to origins } func (c Config) applyDefaults() Config { @@ -42,5 +43,8 @@ func (c Config) applyDefaults() Config { if c.AnnounceInterval == 0 { c.AnnounceInterval = 3 * time.Second } + if c.MetaInfoTimeout == 0 { + c.MetaInfoTimeout = 2 * time.Minute + } return c }