Skip to content

Commit 3634750

Browse files
authored
Merge pull request #3370 from nosammai/disconnect-after-uptime
Add disconnect-after-uptime flag to set a max lifetime for agents
2 parents 3d25bd5 + 7901001 commit 3634750

File tree

4 files changed

+122
-0
lines changed

4 files changed

+122
-0
lines changed

agent/agent_configuration.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ type AgentConfiguration struct {
4949
HealthCheckAddr string
5050
DisconnectAfterJob bool
5151
DisconnectAfterIdleTimeout int
52+
DisconnectAfterUptime int
5253
CancelGracePeriod int
5354
SignalGracePeriod time.Duration
5455
EnableJobLogTmpfile bool

agent/agent_worker.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ type AgentWorker struct {
106106
state agentWorkerState
107107
currentJobID string
108108

109+
// The time when this agent worker started
110+
startTime time.Time
111+
109112
// disable the delay between pings, to speed up certain testing scenarios
110113
noWaitBetweenPingsForTesting bool
111114
}
@@ -213,6 +216,9 @@ func (a *AgentWorker) statusCallback(context.Context) (any, error) {
213216

214217
// Starts the agent worker
215218
func (a *AgentWorker) Start(ctx context.Context, idleMonitor *IdleMonitor) error {
219+
// Record the start time for max agent lifetime tracking
220+
a.startTime = time.Now()
221+
216222
a.metrics = a.metricsCollector.Scope(metrics.Tags{
217223
"agent_name": a.agent.Name,
218224
})
@@ -331,6 +337,7 @@ func (a *AgentWorker) runPingLoop(ctx context.Context, idleMonitor *IdleMonitor)
331337
// ping action isn't "pause",
332338
// * the agent is in disconnect-after-idle-timeout mode, has been idle for
333339
// longer than the idle timeout, and the ping action isn't "pause".
340+
// * the agent has exceeded its disconnect-after-uptime and the ping action isn't "pause".
334341
for {
335342
setStat("😴 Waiting until next ping interval tick")
336343
select {
@@ -411,6 +418,18 @@ func (a *AgentWorker) runPingLoop(ctx context.Context, idleMonitor *IdleMonitor)
411418
return nil
412419
}
413420

421+
// Exit after disconnect-after-uptime is exceeded.
422+
if a.agentConfiguration.DisconnectAfterUptime > 0 {
423+
maxUptime := time.Second * time.Duration(a.agentConfiguration.DisconnectAfterUptime)
424+
if time.Since(a.startTime) >= maxUptime {
425+
if job != nil {
426+
a.logger.Error("Agent ping dispatched a job (id %q) but agent has exceeded max uptime of %v!", job.ID, maxUptime)
427+
}
428+
a.logger.Info("Agent has exceeded max uptime of %v. Disconnecting...", maxUptime)
429+
return nil
430+
}
431+
}
432+
414433
// Note that Ping only returns a job if err == nil.
415434
if job == nil {
416435
if disconnectAfterIdleTimeout == 0 {

agent/agent_worker_test.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,92 @@ func TestAgentWorker_DisconnectAfterJob_Start_Pause_Unpause(t *testing.T) {
487487
}
488488
}
489489

490+
// TestAgentWorker_DisconnectAfterUptime starts a worker configured with
// DisconnectAfterUptime set to 1 second against a fake API server whose ping
// handler offers a job on every ping. It then checks that worker.Start
// returns without error roughly one second later and that at least one ping
// reached the server.
func TestAgentWorker_DisconnectAfterUptime(t *testing.T) {
491+
t.Parallel()
492+
493+
ctx, cancel := context.WithCancel(context.Background())
494+
t.Cleanup(cancel)
495+
496+
server := NewFakeAPIServer()
497+
defer server.Close()
498+
499+
// Register a job with the fake server; the ping handler below hands it out.
500+
job := server.AddJob(map[string]string{
501+
"BUILDKITE_COMMAND": "echo hello",
502+
})
503+
504+
// Pre-register the agent.
505+
const agentSessionToken = "alpacas"
506+
agent := server.AddAgent(agentSessionToken)
507+
508+
// Counts how many pings reached the fake server.
// NOTE(review): pingCount is a plain int written inside the handler and read
// by the test body after Start returns. If the fake server runs handlers on
// another goroutine this may be flagged by the race detector - confirm.
pingCount := 0
509+
agent.PingHandler = func(*http.Request) (api.Ping, error) {
510+
pingCount++
511+
// Offer the job on every ping, including any ping made once the uptime
// limit has passed.
512+
return api.Ping{
513+
Job: job.Job,
514+
}, nil
515+
}
516+
517+
server.Assign(agent, job)
518+
519+
apiClient := api.NewClient(logger.Discard, api.Config{
520+
Endpoint: server.URL,
521+
Token: "llamas",
522+
})
523+
524+
l := logger.NewConsoleLogger(logger.NewTestPrinter(t), func(int) {})
525+
526+
worker := NewAgentWorker(
527+
l,
528+
&api.AgentRegisterResponse{
529+
UUID: uuid.New().String(),
530+
Name: "agent-1",
531+
AccessToken: agentSessionToken,
532+
Endpoint: server.URL,
533+
PingInterval: 1,
534+
JobStatusInterval: 1,
535+
HeartbeatInterval: 10,
536+
},
537+
metrics.NewCollector(logger.Discard, metrics.CollectorConfig{}),
538+
apiClient,
539+
AgentWorkerConfig{
540+
SpawnIndex: 1,
541+
AgentConfiguration: AgentConfiguration{
542+
BootstrapScript: "./dummy_bootstrap.sh",
543+
BuildPath: filepath.Join(os.TempDir(), t.Name(), "build"),
544+
HooksPath: filepath.Join(os.TempDir(), t.Name(), "hooks"),
545+
DisconnectAfterUptime: 1, // 1 second max uptime
546+
},
547+
},
548+
)
549+
// Disable the delay between pings so the test does not wait on ping ticks.
worker.noWaitBetweenPingsForTesting = true
550+
551+
idleMonitor := NewIdleMonitor(1)
552+
553+
// Taken just before Start, so it is slightly earlier than the start time the
// worker records for itself inside Start.
554+
startTime := time.Now()
555+
556+
if err := worker.Start(ctx, idleMonitor); err != nil {
557+
t.Errorf("worker.Start() = %v", err)
558+
}
559+
560+
// Check that the agent disconnected after approximately 1 second.
// NOTE(review): these are wall-clock bounds (0.9s to 2s); on a heavily
// loaded machine the upper bound could plausibly be exceeded - confirm that
// this is acceptable for CI.
561+
elapsed := time.Since(startTime)
562+
if elapsed < 900*time.Millisecond || elapsed > 2*time.Second {
563+
t.Errorf("Agent should have disconnected after ~1 second, but took %v", elapsed)
564+
}
565+
566+
// The agent should have made at least one ping before disconnecting
567+
if pingCount == 0 {
568+
t.Error("Agent should have made at least one ping before disconnecting")
569+
}
570+
571+
// Nothing further is asserted: the test does not check whether the offered
572+
// job was run or rejected, only that Start returned within the expected
573+
// window and that the server was pinged.
574+
}
575+
490576
func TestAgentWorker_SetEndpointDuringRegistration(t *testing.T) {
491577
// The registration request is made in clicommand.AgentStartCommand, and the response
492578
// is passed into agent.NewAgentWorker(...), so we'll just test the response handling.

clicommand/agent_start.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ type AgentStartConfig struct {
104104
AcquireJob string `cli:"acquire-job"`
105105
DisconnectAfterJob bool `cli:"disconnect-after-job"`
106106
DisconnectAfterIdleTimeout int `cli:"disconnect-after-idle-timeout"`
107+
DisconnectAfterUptime int `cli:"disconnect-after-uptime"`
107108
CancelGracePeriod int `cli:"cancel-grace-period"`
108109
SignalGracePeriodSeconds int `cli:"signal-grace-period-seconds"`
109110

@@ -235,6 +236,10 @@ func (asc AgentStartConfig) Features(ctx context.Context) []string {
235236
features = append(features, "disconnect-after-idle")
236237
}
237238

239+
if asc.DisconnectAfterUptime != 0 {
240+
features = append(features, "disconnect-after-uptime")
241+
}
242+
238243
if asc.NoPlugins {
239244
features = append(features, "no-plugins")
240245
}
@@ -363,6 +368,12 @@ var AgentStartCommand = cli.Command{
363368
Usage: "The maximum idle time in seconds to wait for a job before disconnecting. The default of 0 means no timeout",
364369
EnvVar: "BUILDKITE_AGENT_DISCONNECT_AFTER_IDLE_TIMEOUT",
365370
},
371+
cli.IntFlag{
372+
Name: "disconnect-after-uptime",
373+
Value: 0,
374+
Usage: "The maximum uptime in seconds before the agent stops accepting new jobs and shuts down after any running jobs complete. The default of 0 means no timeout",
375+
EnvVar: "BUILDKITE_AGENT_DISCONNECT_AFTER_UPTIME",
376+
},
366377
cancelGracePeriodFlag,
367378
cli.BoolFlag{
368379
Name: "enable-job-log-tmpfile",
@@ -1014,6 +1025,7 @@ var AgentStartCommand = cli.Command{
10141025
TimestampLines: cfg.TimestampLines,
10151026
DisconnectAfterJob: cfg.DisconnectAfterJob,
10161027
DisconnectAfterIdleTimeout: cfg.DisconnectAfterIdleTimeout,
1028+
DisconnectAfterUptime: cfg.DisconnectAfterUptime,
10171029
CancelGracePeriod: cfg.CancelGracePeriod,
10181030
SignalGracePeriod: signalGracePeriod,
10191031
EnableJobLogTmpfile: cfg.EnableJobLogTmpfile,
@@ -1109,6 +1121,10 @@ var AgentStartCommand = cli.Command{
11091121
l.Info("Agents will disconnect after %d seconds of inactivity", agentConf.DisconnectAfterIdleTimeout)
11101122
}
11111123

1124+
if agentConf.DisconnectAfterUptime > 0 {
1125+
l.Info("Agents will disconnect after %d seconds of uptime and shut down after any running jobs complete", agentConf.DisconnectAfterUptime)
1126+
}
1127+
11121128
if len(cfg.AllowedRepositories) > 0 {
11131129
agentConf.AllowedRepositories = make([]*regexp.Regexp, 0, len(cfg.AllowedRepositories))
11141130
for _, v := range cfg.AllowedRepositories {

0 commit comments

Comments
 (0)