Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 74 additions & 6 deletions src/app/pty.zig
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ extern "c" fn getenv(name: [*:0]const u8) ?[*:0]const u8;
extern "c" fn execvp(file: [*:0]const u8, argv: [*]const ?[*:0]const u8) c_int;
extern "c" fn chdir(path: [*:0]const u8) c_int;
extern "c" fn waitpid(pid: c_int, status: ?*c_int, options: c_int) c_int;
extern "c" fn kill(pid: c_int, sig: c_int) c_int;
extern "c" fn tcgetpgrp(fd: c_int) posix.pid_t;
extern "c" fn getuid() c_uint;
extern "c" fn access(path: [*:0]const u8, mode: c_int) c_int;
Expand Down Expand Up @@ -262,16 +263,20 @@ pub const Pty = struct {
pub fn childExited(self: *Pty) bool {
if (self.exit_status != null) return true; // already reaped
// Raw C waitpid: returns pid if reaped, 0 if still running, -1 on error.
// ECHILD (-1) means already reaped — treat as exited.
var status: c_int = undefined;
const result = waitpid(self.pid, &status, 1); // 1 = WNOHANG
if (result != 0) {
// Store exit status only on successful reap (result > 0).
// ECHILD (result < 0) means already reaped — no status available.
if (result > 0) self.exit_status = status;
if (result > 0) {
self.exit_status = status;
return true;
}
return false;
if (result == 0) return false; // still running
// result < 0: ECHILD — we're not the parent of this process.
// This happens after hot-upgrade: the new daemon inherits PTY fds
// but the shell processes were forked by the old daemon and got
// reparented to PID 1. Use kill(pid, 0) to check if the process
// is actually still alive.
if (kill(self.pid, 0) == 0) return false; // process exists
return true; // process is gone
Comment on lines +278 to +279
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The kill(pid, 0) existence check treats any non-zero return as "process is gone". On POSIX, kill(pid, 0) can fail with EPERM even when the process exists (no permission to signal it). This would cause childExited() to incorrectly report the process as exited. Prefer using std.posix.kill and treating PermissionDenied as "still alive", and only treating ProcessNotFound/ESRCH as exited.

Suggested change
if (kill(self.pid, 0) == 0) return false; // process exists
return true; // process is gone
posix.kill(self.pid, 0) catch |err| {
// PermissionDenied means the process exists but we can't signal it.
// Only treat ProcessNotFound/ESRCH as "process is gone".
return switch (err) {
error.PermissionDenied => false,
error.ProcessNotFound => true,
else => false,
};
};
// kill succeeded: process exists
return false;

Copilot uses AI. Check for mistakes.
Comment on lines +273 to +279
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

childExited() treats any waitpid() error (result < 0) as ECHILD. waitpid can also fail with EINTR or other errors; in those cases this path can silently mask real issues and may return incorrect exit state. Consider checking errno and only using the kill(pid, 0) fallback for ECHILD; for EINTR, return false and retry on the next call (and for other errors, decide an explicit policy).

Suggested change
// result < 0: ECHILD — we're not the parent of this process.
// This happens after hot-upgrade: the new daemon inherits PTY fds
// but the shell processes were forked by the old daemon and got
// reparented to PID 1. Use kill(pid, 0) to check if the process
// is actually still alive.
if (kill(self.pid, 0) == 0) return false; // process exists
return true; // process is gone
// result < 0: check errno to distinguish ECHILD from other errors.
const err = posix.errno();
if (err == posix.ECHILD) {
// We're not the parent of this process anymore.
// This happens after hot-upgrade: the new daemon inherits PTY fds
// but the shell processes were forked by the old daemon and got
// reparented to PID 1. Use kill(pid, 0) to check if the process
// is actually still alive.
if (kill(self.pid, 0) == 0) return false; // process exists
return true; // process is gone
}
if (err == posix.EINTR) {
// Interrupted by signal; caller can retry on next invocation.
return false;
}
// For other errors, conservatively report "not exited yet" to avoid
// incorrectly marking the child as gone.
return false;

Copilot uses AI. Check for mistakes.
}

/// Extract the process exit code (0-255), or 1 if killed by signal.
Expand All @@ -288,3 +293,66 @@ pub const Pty = struct {
}

};

// ── Tests ──

test "childExited returns false for non-child process that is still alive" {
// Simulates the hot-upgrade scenario: the new daemon process inherits
// PTY fds but the shell processes are NOT its children (they were forked
// by the old daemon and reparented to PID 1 on its exit).
//
// We create a grandchild: fork → child forks grandchild → child exits →
// grandchild is reparented to init. Our process can't waitpid on the
// grandchild (ECHILD), but it's still alive.

// Use a pipe to communicate the grandchild PID back to us.
const pipe_fds = try posix.pipe();

const child = try posix.fork();
if (child == 0) {
// ── child ──
posix.close(pipe_fds[0]); // close read end
const grandchild = posix.fork() catch posix.abort();
if (grandchild == 0) {
// ── grandchild: sleep for a while ──
posix.close(pipe_fds[1]);
posix.nanosleep(5, 0);
posix.abort();
}
// Write grandchild PID to pipe, then exit immediately.
// This reparents grandchild to PID 1.
const gc_pid: i32 = grandchild;
_ = posix.write(pipe_fds[1], std.mem.asBytes(&gc_pid)) catch {};
posix.close(pipe_fds[1]);
std.process.exit(0);
}

// ── parent (test process) ──
posix.close(pipe_fds[1]); // close write end

// Reap our direct child
_ = waitpid(child, null, 0);

// Read grandchild PID from pipe
var gc_pid_bytes: [@sizeOf(i32)]u8 = undefined;
const n = posix.read(pipe_fds[0], &gc_pid_bytes) catch 0;
posix.close(pipe_fds[0]);
if (n != @sizeOf(i32)) return error.TestFailed;
Comment on lines +338 to +340
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test reads the grandchild PID from a pipe with a single read() and expects exactly @sizeof(i32) bytes. Pipe reads can legally return fewer bytes than requested (e.g., due to EINTR), which can make this test flaky. Consider looping until all bytes are read (or EOF) rather than failing on a short read.

Suggested change
const n = posix.read(pipe_fds[0], &gc_pid_bytes) catch 0;
posix.close(pipe_fds[0]);
if (n != @sizeOf(i32)) return error.TestFailed;
var total_read: usize = 0;
while (total_read < @sizeOf(i32)) {
const n = posix.read(pipe_fds[0], gc_pid_bytes[total_read..]) catch 0;
if (n == 0) break; // EOF or error
total_read += n;
}
posix.close(pipe_fds[0]);
if (total_read != @sizeOf(i32)) return error.TestFailed;

Copilot uses AI. Check for mistakes.

const gc_pid = std.mem.bytesToValue(i32, &gc_pid_bytes);

// Now gc_pid is alive but NOT our child — waitpid would return ECHILD.
// This is exactly what happens after hot-upgrade.
var pty = Pty.fromExisting(-1, gc_pid); // fd doesn't matter for this test

// The bug: old code would return true here (ECHILD treated as exited).
// The fix: kill(pid, 0) checks the process is alive → returns false.
try std.testing.expect(!pty.childExited());

// Clean up: kill the grandchild
_ = kill(gc_pid, 9); // SIGKILL
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test uses raw numeric signal values (9) with the C kill() binding. Using std.posix.SIG.KILL (and std.posix.kill) would avoid magic numbers and also make it easier to handle errors consistently across platforms.

Suggested change
_ = kill(gc_pid, 9); // SIGKILL
posix.kill(gc_pid, posix.SIG.KILL) catch {};

Copilot uses AI. Check for mistakes.

// After kill, childExited should eventually return true
posix.nanosleep(0, 50_000_000); // 50ms for signal delivery
Comment on lines +355 to +356
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test cleanup uses a fixed 50ms nanosleep and then expects childExited() to return true. Signal delivery + init reaping timing can vary on loaded CI, so this can be flaky. Consider polling childExited() with a bounded timeout (or using a longer/backoff wait) instead of a single fixed sleep.

Suggested change
// After kill, childExited should eventually return true
posix.nanosleep(0, 50_000_000); // 50ms for signal delivery
// After kill, childExited should eventually return true. Poll with a
// bounded timeout instead of relying on a single fixed sleep to avoid
// flakiness on slow or loaded systems.
var attempts: usize = 0;
while (attempts < 100 and !pty.childExited()) : (attempts += 1) {
// Sleep 10ms between checks, for up to ~1s total.
posix.nanosleep(0, 10_000_000);
}

Copilot uses AI. Check for mistakes.
try std.testing.expect(pty.childExited());
}
Loading