Skip to content

Commit 08f9b5c

Browse files
sanityclaude
andauthored
fix: surface auto-update silent failures for unsupervised peers (#4580) (#4582)
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent f341830 commit 08f9b5c

5 files changed

Lines changed: 418 additions & 12 deletions

File tree

crates/core/src/bin/commands/auto_update.rs

Lines changed: 208 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use anyhow::Result;
1717
use semver::Version;
1818
use std::fs;
1919
use std::path::PathBuf;
20+
use std::sync::atomic::{AtomicBool, Ordering};
2021
use std::time::{Duration, SystemTime};
2122

2223
pub use freenet::transport::{
@@ -28,6 +29,73 @@ pub use freenet::transport::{
2829
/// The service wrapper catches this and runs `freenet update` before restarting.
2930
pub const EXIT_CODE_UPDATE_NEEDED: i32 = 42;
3031

32+
/// Environment variable set by every Freenet supervisor (systemd unit, macOS
33+
/// launchd wrapper script, and the Windows/Linux in-process `service
34+
/// run-wrapper` loop) on the `freenet network` child it spawns. Its presence is
35+
/// positive evidence that *something* will catch exit code 42 and run
36+
/// `freenet update` before restarting the node.
37+
///
38+
/// Auto-update is structurally supervised-install-only: the running node never
39+
/// replaces its own binary — it exits 42 and relies on its supervisor to apply
40+
/// the update and restart it. A bare `freenet network` run has no supervisor, so
41+
/// it detects the update, exits 42, dies, and is never restarted (issue #4580).
42+
/// We use this marker (and `INVOCATION_ID` as a systemd fallback) to tell the
43+
/// operator, loudly, when an update was detected but will not be applied.
44+
pub const SUPERVISED_ENV_VAR: &str = "FREENET_SUPERVISED";
45+
46+
/// Whether the running node appears to be under a supervisor that will catch
47+
/// exit code 42 and apply the update.
48+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
49+
pub enum SupervisorStatus {
50+
/// The authoritative [`SUPERVISED_ENV_VAR`] marker is set — a Freenet
51+
/// supervisor spawned us and *will* catch exit 42 and run `freenet update`.
52+
/// Nothing to warn about; exit 42 is expected to be applied.
53+
Supervised,
54+
/// We are under systemd (`INVOCATION_ID` is set) but our authoritative
55+
/// marker is absent. systemd alone does not prove the unit has the
56+
/// exit-42 → `freenet update` `ExecStopPost` hook: a custom unit,
57+
/// `systemd-run`, or a hand-written service running `freenet network`
58+
/// without that hook is also `INVOCATION_ID`-bearing. We therefore can't
59+
/// confirm the update will be applied — warn, but more softly, since this
60+
/// also covers older Freenet units installed before the marker existed.
61+
SupervisedUnverified,
62+
/// No evidence of any supervisor. Exit 42 will most likely kill the node
63+
/// without the update ever being applied — the operator must run
64+
/// `freenet update` manually (or install Freenet as a service).
65+
Unsupervised,
66+
}
67+
68+
/// Detect whether a supervisor is present, using an injectable env lookup so the
69+
/// logic is unit-testable without mutating the process environment.
70+
///
71+
/// Honest by construction: we only claim the fully-quiet [`Supervised`] state on
72+
/// *our own* marker. systemd's generic `INVOCATION_ID` is reported as
73+
/// [`SupervisedUnverified`] (still warned about, softly) because it does not
74+
/// prove the unit carries the exit-42 → `freenet update` hook. Absence of both
75+
/// is [`Unsupervised`], which drives the loud error.
76+
///
77+
/// [`Supervised`]: SupervisorStatus::Supervised
78+
/// [`SupervisedUnverified`]: SupervisorStatus::SupervisedUnverified
79+
/// [`Unsupervised`]: SupervisorStatus::Unsupervised
80+
pub fn detect_supervisor_status<F>(env_get: F) -> SupervisorStatus
81+
where
82+
F: Fn(&str) -> Option<String>,
83+
{
84+
let present = |key: &str| env_get(key).is_some_and(|v| !v.trim().is_empty());
85+
if present(SUPERVISED_ENV_VAR) {
86+
SupervisorStatus::Supervised
87+
} else if present("INVOCATION_ID") {
88+
SupervisorStatus::SupervisedUnverified
89+
} else {
90+
SupervisorStatus::Unsupervised
91+
}
92+
}
93+
94+
/// Read the live supervisor status from the process environment.
95+
pub fn supervisor_status() -> SupervisorStatus {
96+
detect_supervisor_status(|key| std::env::var(key).ok())
97+
}
98+
3199
/// Initial backoff interval for update checks (1 minute).
32100
const INITIAL_BACKOFF: Duration = Duration::from_secs(60);
33101

@@ -82,14 +150,39 @@ pub enum UpdateCheckResult {
82150
///
83151
/// Security: This function verifies against GitHub, so a malicious peer
84152
/// claiming a fake version won't trigger an exit.
153+
/// Set the first time this process logs the auto-update lockout at warn! level,
154+
/// so the permanent locked-out state is surfaced loudly once rather than on
155+
/// every 60s update-loop tick (it can be hit from multiple triggers per tick).
156+
static LOCKOUT_WARNED: AtomicBool = AtomicBool::new(false);
157+
85158
pub async fn check_if_update_available(current_version: &str) -> UpdateCheckResult {
86159
// Don't check if we've failed too many times
87160
if !should_attempt_update() {
88-
tracing::debug!(
89-
failures = get_update_failure_count(),
90-
max = MAX_UPDATE_FAILURES,
91-
"Skipping update check - too many previous failures"
92-
);
161+
// Loud, operator-visible (issue #4580): a persistent lockout means this
162+
// peer has stopped trying to auto-update entirely and will silently stay
163+
// behind. A common cause is a non-writable binary path (e.g. a
164+
// hand-installed binary the service account can't replace), which is a
165+
// *permanent* lockout until an operator intervenes. Warn LOUDLY the first
166+
// time we observe it this process, then drop to debug! so the permanent
167+
// state does not spam the log every minute.
168+
let failures = get_update_failure_count();
169+
if !LOCKOUT_WARNED.swap(true, Ordering::Relaxed) {
170+
tracing::warn!(
171+
failures,
172+
max = MAX_UPDATE_FAILURES,
173+
"Auto-update is LOCKED OUT after {MAX_UPDATE_FAILURES} consecutive failed update \
174+
attempts and will NOT retry. This peer will stay on the current version until an \
175+
operator intervenes. Run `freenet update` manually to update and reset the \
176+
lockout; if updates keep failing, the binary path is likely not writable by this \
177+
account.",
178+
);
179+
} else {
180+
tracing::debug!(
181+
failures,
182+
max = MAX_UPDATE_FAILURES,
183+
"Skipping update check - auto-update locked out (already warned this process)"
184+
);
185+
}
93186
return UpdateCheckResult::Skipped;
94187
}
95188

@@ -739,6 +832,116 @@ mod tests {
739832
);
740833
}
741834

835+
#[test]
836+
fn test_detect_supervisor_status_marker_present() {
837+
// Issue #4580: the explicit FREENET_SUPERVISED marker (set by all three
838+
// supervisors on the network child) is positive evidence of a supervisor.
839+
let env = |key: &str| {
840+
if key == SUPERVISED_ENV_VAR {
841+
Some("1".to_string())
842+
} else {
843+
None
844+
}
845+
};
846+
assert_eq!(detect_supervisor_status(env), SupervisorStatus::Supervised);
847+
}
848+
849+
#[test]
850+
fn test_detect_supervisor_status_invocation_id_is_unverified() {
851+
// systemd sets INVOCATION_ID for EVERY service instance, including custom
852+
// units / systemd-run that lack our exit-42 → `freenet update` hook. So a
853+
// bare INVOCATION_ID is only *unverified* supervision: we still warn (more
854+
// softly), rather than going fully quiet as if the update were guaranteed
855+
// to apply. This is the false-positive guard from the Codex review.
856+
let env = |key: &str| {
857+
if key == "INVOCATION_ID" {
858+
Some("a1b2c3d4".to_string())
859+
} else {
860+
None
861+
}
862+
};
863+
assert_eq!(
864+
detect_supervisor_status(env),
865+
SupervisorStatus::SupervisedUnverified
866+
);
867+
}
868+
869+
#[test]
870+
fn test_detect_supervisor_status_marker_beats_invocation_id() {
871+
// When BOTH our authoritative marker and systemd's INVOCATION_ID are set
872+
// (the normal Freenet systemd-unit case), the marker wins and we report
873+
// the fully-quiet Supervised state.
874+
let env = |key: &str| match key {
875+
SUPERVISED_ENV_VAR => Some("1".to_string()),
876+
"INVOCATION_ID" => Some("a1b2c3d4".to_string()),
877+
_ => None,
878+
};
879+
assert_eq!(detect_supervisor_status(env), SupervisorStatus::Supervised);
880+
}
881+
882+
#[test]
883+
fn test_detect_supervisor_status_unsupervised_when_absent() {
884+
// The whole point of #4580: a bare `freenet network` run has neither
885+
// signal and must be reported as Unsupervised so the loud warning fires
886+
// instead of silently exiting 42 with nothing to apply the update.
887+
let env = |_key: &str| None;
888+
assert_eq!(
889+
detect_supervisor_status(env),
890+
SupervisorStatus::Unsupervised
891+
);
892+
}
893+
894+
#[test]
895+
fn test_detect_supervisor_status_empty_and_whitespace_are_not_evidence() {
896+
// An empty or whitespace-only value (e.g. `FREENET_SUPERVISED=`) must NOT
897+
// count as a supervisor — it would suppress the warning while leaving the
898+
// update unapplied, which is exactly the silent failure we are fixing.
899+
for value in ["", " ", "\t", "\n"] {
900+
let env = move |key: &str| {
901+
if key == SUPERVISED_ENV_VAR || key == "INVOCATION_ID" {
902+
Some(value.to_string())
903+
} else {
904+
None
905+
}
906+
};
907+
assert_eq!(
908+
detect_supervisor_status(env),
909+
SupervisorStatus::Unsupervised,
910+
"value {value:?} must not count as supervisor evidence"
911+
);
912+
}
913+
}
914+
915+
#[test]
916+
fn test_locked_out_update_check_is_loud() {
917+
// Source-scrape pin for #4580: when the failure lockout disables
918+
// auto-update, the skip MUST be operator-visible (warn!), not a silent
919+
// debug! line. A regression to debug! would re-hide the permanent
920+
// lockout (e.g. non-writable binary path) the issue calls out.
921+
let src = include_str!("auto_update.rs");
922+
let (_, after_fn_start) = src
923+
.split_once("pub async fn check_if_update_available(")
924+
.expect("check_if_update_available definition not found");
925+
let (_, after_guard) = after_fn_start
926+
.split_once("if !should_attempt_update() {")
927+
.expect("lockout guard not found");
928+
let (guard_body, _) = after_guard
929+
.split_once("return UpdateCheckResult::Skipped;")
930+
.expect("could not locate end of lockout guard");
931+
// Strip line comments so the explanatory comment (which itself mentions
932+
// "warn!"/"debug!") cannot satisfy the assertion — match only on code.
933+
let code_only: String = guard_body
934+
.lines()
935+
.map(|line| line.split_once("//").map(|(c, _)| c).unwrap_or(line))
936+
.collect::<Vec<_>>()
937+
.join("\n");
938+
assert!(
939+
code_only.contains("tracing::warn!"),
940+
"the auto-update lockout skip must log at warn! level so a permanent \
941+
lockout is diagnosable (#4580), not silently dropped at debug!"
942+
);
943+
}
944+
742945
#[test]
743946
fn test_backoff_constants() {
744947
// Verify backoff progression: 1m -> 2m -> 4m -> 8m -> 16m -> 32m -> 64m (capped to 60m)

crates/core/src/bin/commands/service/linux.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,12 @@ Wants=network-online.target
267267
268268
[Service]
269269
Type=simple
270+
# Mark the node as supervised (issue #4580): tells `freenet network` that exit
271+
# code 42 (update needed) will be applied by ExecStopPost below, so it logs an
272+
# informational message instead of a loud "no supervisor" error on update exit.
273+
# systemd also sets INVOCATION_ID for every service, which the node detects as a
274+
# fallback; this makes the intent explicit and robust.
275+
Environment=FREENET_SUPERVISED=1
270276
# Stale-orphan self-heal (issue #3967): RestartPreventExitStatus=43 below
271277
# means an exit 43 ("another instance already running") never restarts the
272278
# unit. That is correct for a legitimate second instance, but if the port
@@ -372,6 +378,11 @@ Wants=network-online.target
372378
Type=simple
373379
User={username}
374380
Environment=HOME={home}
381+
# Mark the node as supervised (issue #4580): see the matching comment in the
382+
# user unit. Tells `freenet network` that exit code 42 (update needed) will be
383+
# applied by ExecStopPost below, so it logs an informational message instead of
384+
# a loud "no supervisor" error on update exit.
385+
Environment=FREENET_SUPERVISED=1
375386
# Stale-orphan self-heal (issue #3967): see the matching comment in the user
376387
# unit (including the systemd $$-escaping, the PPID-after-final-')' parse, and
377388
# why we do NOT anchor on the holder's exe equalling this unit's on-disk binary

crates/core/src/bin/commands/service/macos.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,12 @@ pub fn generate_wrapper_script(binary_path: &Path) -> String {
133133
# Includes exponential backoff to prevent rapid restart loops on repeated failures.
134134
# On startup, kills any stale 'freenet network' processes to avoid port conflicts.
135135
136+
# Mark the node child as supervised (issue #4580). This tells `freenet network`
137+
# that exit code 42 (update needed) will actually be caught and applied by this
138+
# wrapper, so it logs an informational message rather than a loud "no supervisor,
139+
# update will not be applied" error on exit.
140+
export {supervised_env}=1
141+
136142
BACKOFF=10 # Initial backoff in seconds
137143
MAX_BACKOFF=300 # Maximum backoff (5 minutes)
138144
CONSECUTIVE_FAILURES=0
@@ -347,7 +353,8 @@ while true; do
347353
fi
348354
done
349355
"#,
350-
binary = binary_path.display()
356+
binary = binary_path.display(),
357+
supervised_env = super::super::auto_update::SUPERVISED_ENV_VAR,
351358
)
352359
}
353360

crates/core/src/bin/commands/service/wrapper.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,11 @@ fn run_wrapper_loop(
801801

802802
let mut cmd = std::process::Command::new(&exe_path);
803803
cmd.arg("network");
804+
// Mark the child as supervised so it knows exit code 42 will actually be
805+
// caught and applied (issue #4580). Without this, the node can only fall
806+
// back to systemd's INVOCATION_ID, which the in-process wrapper path does
807+
// not set.
808+
cmd.env(super::super::auto_update::SUPERVISED_ENV_VAR, "1");
804809

805810
// On Windows, prevent a console window from appearing for the child process.
806811
// The wrapper has already detached from the console via FreeConsole(),

0 commit comments

Comments
 (0)