From 5b303f5292a9242a1b0ad2cfcf3250f26f22562a Mon Sep 17 00:00:00 2001 From: Danny Canter Date: Wed, 19 Nov 2025 03:13:34 -0800 Subject: [PATCH] Vminitd: Add pause command Due to us supporting a pod type now, and pid ns sharing being quite a common thing for pods, let's add a pause-container-like command to vminitd to eventually enable pid ns sharing between containers in our variant of a pod. This changes vminitd slightly to have pause and init (default) commands as it seemed simpler than creating a whole new binary to include in the guest image. --- vminitd/Sources/vminitd/Application.swift | 205 +++++---------------- vminitd/Sources/vminitd/InitCommand.swift | 184 ++++++++++++++++++ vminitd/Sources/vminitd/PauseCommand.swift | 59 ++++++ vminitd/Sources/vminitd/Server+GRPC.swift | 1 - 4 files changed, 289 insertions(+), 160 deletions(-) create mode 100644 vminitd/Sources/vminitd/InitCommand.swift create mode 100644 vminitd/Sources/vminitd/PauseCommand.swift diff --git a/vminitd/Sources/vminitd/Application.swift b/vminitd/Sources/vminitd/Application.swift index c55e04aa..2149f08b 100644 --- a/vminitd/Sources/vminitd/Application.swift +++ b/vminitd/Sources/vminitd/Application.swift @@ -14,183 +14,70 @@ // limitations under the License.
//===----------------------------------------------------------------------===//
 
-import Cgroup
-import Containerization
-import ContainerizationError
 import ContainerizationOS
 import Foundation
 import Logging
-import NIOCore
-import NIOPosix
-
-#if os(Linux)
-import Musl
-import LCShim
-#endif
 
 @main
 struct Application {
-    private static let foregroundEnvVar = "FOREGROUND"
-    private static let vsockPort = 1024
-    private static let standardErrorLock = NSLock()
-
-    private static func runInForeground(_ log: Logger) throws {
-        log.info("running vminitd under pid1")
-
-        var command = Command("/sbin/vminitd")
-        command.attrs = .init(setsid: true)
-        command.stdin = .standardInput
-        command.stdout = .standardOutput
-        command.stderr = .standardError
-        command.environment = ["\(foregroundEnvVar)=1"]
-
-        try command.start()
-        _ = try command.wait()
-    }
-
-    private static func adjustLimits() throws {
-        var limits = rlimit()
-        guard getrlimit(RLIMIT_NOFILE, &limits) == 0 else {
-            throw POSIXError(.init(rawValue: errno)!)
-        }
-        limits.rlim_cur = 65536
-        limits.rlim_max = 65536
-        guard setrlimit(RLIMIT_NOFILE, &limits) == 0 else {
-            throw POSIXError(.init(rawValue: errno)!)
-        }
-    }
-
-    @Sendable
-    private static func standardError(label: String) -> StreamLogHandler {
-        standardErrorLock.withLock {
-            StreamLogHandler.standardError(label: label)
+    /// Entry point: runs the subcommand named by the first argument, defaulting to "init".
+    static func main() async throws {
+        LoggingSystem.bootstrap(StreamLogHandler.standardError)
+        // The first argument, when present, selects the subcommand; with no arguments we run "init".
+        let args = CommandLine.arguments
+        let command = args.count > 1 ? args[1] : "init"
+
+        switch command {
+        case "pause":
+            let log = Logger(label: "pause")
+            log.info("Running pause command")
+            try PauseCommand.run(log: log)
+        case "init":
+            fallthrough
+        default:
+            // Anything other than "pause", including an unrecognized value, takes the init path.
+            let log = Logger(label: "vminitd")
+            log.info("Running init command")
+            try Self.mountProc(log: log)
+            try await InitCommand.run(log: log)
         }
     }
 
-    static func main() async throws {
-        LoggingSystem.bootstrap(standardError)
-        var log = Logger(label: "vminitd")
-
-        try adjustLimits()
-
-        // when running under debug mode, launch vminitd as a sub process of pid1
-        // so that we get a chance to collect better logs and errors before pid1 exists
-        // and the kernel panics.
-        #if DEBUG
-        let environment = ProcessInfo.processInfo.environment
-        let foreground = environment[Self.foregroundEnvVar]
-        log.info("checking for shim var \(foregroundEnvVar)=\(String(describing: foreground))")
-
-        if foreground == nil {
-            try runInForeground(log)
-            exit(0)
+    /// Mounts procfs at /proc unless it is already mounted. Swift seems to have issues spawning threads
+    /// when /proc isn't around, so this is called before the first async function.
+    static func mountProc(log: Logger) throws {
+        // Is it already mounted (would only be true in debug builds where we re-exec ourselves)?
+        if isProcMounted() {
+            return
         }
 
-        // since we are not running as pid1 in this mode we must set ourselves
-        // as a subpreaper so that all child processes are reaped by us and not
-        // passed onto our parent.
-        CZ_set_sub_reaper()
-        #endif
+        log.info("mounting /proc")
 
-        log.logLevel = .debug
-
-        signal(SIGPIPE, SIG_IGN)
-
-        log.info("vminitd booting")
-
-        // Set of mounts necessary to be mounted prior to taking any RPCs.
-        // 1. /proc as the sysctl rpc wouldn't make sense if it wasn't there.
-        // 2. /run as that is where we store container state.
-        // 3. /sys as we need it for /sys/fs/cgroup
-        // 4. /sys/fs/cgroup to add the agent to a cgroup, as well as containers later.
-        let mounts = [
-            ContainerizationOS.Mount(
-                type: "proc",
-                source: "proc",
-                target: "/proc",
-                options: []
-            ),
-            ContainerizationOS.Mount(
-                type: "tmpfs",
-                source: "tmpfs",
-                target: "/run",
-                options: []
-            ),
-            ContainerizationOS.Mount(
-                type: "sysfs",
-                source: "sysfs",
-                target: "/sys",
-                options: []
-            ),
-            ContainerizationOS.Mount(
-                type: "cgroup2",
-                source: "none",
-                target: "/sys/fs/cgroup",
-                options: []
-            ),
-        ]
-
-        for mnt in mounts {
-            log.info("mounting \(mnt.target)")
-
-            try mnt.mount(createWithPerms: 0o755)
-        }
-        try Binfmt.mount()
-
-        let cgManager = Cgroup2Manager(
-            group: URL(filePath: "/vminitd"),
-            logger: log
+        let mnt = ContainerizationOS.Mount(
+            type: "proc",
+            source: "proc",
+            target: "/proc",
+            options: []
         )
-        try cgManager.create()
-        try cgManager.toggleAllAvailableControllers(enable: true)
-
-        // Set memory.high threshold to 75 MiB
-        let threshold: UInt64 = 75 * 1024 * 1024
-        try cgManager.setMemoryHigh(bytes: threshold)
-        try cgManager.addProcess(pid: getpid())
+        try mnt.mount(createWithPerms: 0o755)
+    }
 
-        let memoryMonitor = try MemoryMonitor(
-            cgroupManager: cgManager,
-            threshold: threshold,
-            logger: log
-        ) { [log] (currentUsage, highMark) in
-            log.warning(
-                "vminitd memory threshold exceeded",
-                metadata: [
-                    "threshold_bytes": "\(threshold)",
-                    "current_bytes": "\(currentUsage)",
-                    "high_events_total": "\(highMark)",
-                ])
+    /// Returns true when /proc/mounts has an entry whose mount point is "/proc"; false if it cannot be read.
+    static func isProcMounted() -> Bool {
+        guard let data = try? String(contentsOfFile: "/proc/mounts", encoding: .utf8) else {
+            return false
         }
 
-        let t = Thread { [log] in
-            do {
-                try memoryMonitor.run()
-            } catch {
-                log.error("memory monitor failed: \(error)")
+        for line in data.split(separator: "\n") {
+            let fields = line.split(separator: " ")
+            if fields.count >= 2 {
+                let mountPoint = String(fields[1])
+                if mountPoint == "/proc" {
+                    return true
+                }
             }
         }
-        t.start()
-
-        let eg = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount)
-        let server = Initd(log: log, group: eg)
-
-        do {
-            log.info("serving vminitd API")
-            try await server.serve(port: vsockPort)
-            log.info("vminitd API returned, syncing filesystems")
 
-            #if os(Linux)
-            Musl.sync()
-            #endif
-        } catch {
-            log.error("vminitd boot error \(error)")
-
-            #if os(Linux)
-            Musl.sync()
-            #endif
-
-            exit(1)
-        }
+        return false
     }
 }
diff --git a/vminitd/Sources/vminitd/InitCommand.swift b/vminitd/Sources/vminitd/InitCommand.swift
new file mode 100644
index 00000000..ab3775e8
--- /dev/null
+++ b/vminitd/Sources/vminitd/InitCommand.swift
@@ -0,0 +1,184 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2025 Apple Inc. and the Containerization project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import Cgroup
+import Containerization
+import ContainerizationError
+import ContainerizationOS
+import Foundation
+import Logging
+import NIOCore
+import NIOPosix
+
+#if os(Linux)
+import Musl
+import LCShim
+#endif
+
+/// The default `init` command: prepares the guest (rlimits, mounts, cgroup, memory monitor) and serves the vminitd API.
+struct InitCommand {
+    private static let foregroundEnvVar = "FOREGROUND"
+    private static let vsockPort = 1024
+
+    /// Runs the boot steps in order, then serves the API on `vsockPort`; exits with status 1 if serving throws.
+    static func run(log: Logger) async throws {
+        var log = log  // mutable copy: the level is raised to .debug below
+        try Self.adjustLimits()
+
+        // when running under debug mode, launch vminitd as a sub process of pid1
+        // so that we get a chance to collect better logs and errors before pid1 exits
+        // and the kernel panics.
+        #if DEBUG
+        log.info("DEBUG mode active, checking FOREGROUND env var")
+        let environment = ProcessInfo.processInfo.environment
+        let foreground = environment[Self.foregroundEnvVar]
+        log.info("checking for shim var \(foregroundEnvVar)=\(String(describing: foreground))")
+
+        if foreground == nil {
+            try runInForeground(log)
+            exit(0)  // the parent always exits 0; the child's exit code is only logged
+        }
+
+        log.info("FOREGROUND is set, running as subprocess, setting subreaper")
+        // since we are not running as pid1 in this mode we must set ourselves
+        // as a subreaper so that all child processes are reaped by us and not
+        // passed on to our parent.
+        CZ_set_sub_reaper()
+        #endif
+
+        log.logLevel = .debug
+
+        signal(SIGPIPE, SIG_IGN)
+
+        log.info("vminitd booting")
+
+        // Set of mounts necessary to be mounted prior to taking any RPCs.
+        // 1. /proc as the sysctl rpc wouldn't make sense if it wasn't there (NOTE: This is done before this method
+        // due to Swift seemingly requiring /proc to be present for the async runtime to spin up).
+        // 2. /run as that is where we store container state.
+        // 3. /sys as we need it for /sys/fs/cgroup
+        // 4. /sys/fs/cgroup to add the agent to a cgroup, as well as containers later.
+        let mounts = [
+            ContainerizationOS.Mount(
+                type: "tmpfs",
+                source: "tmpfs",
+                target: "/run",
+                options: []
+            ),
+            ContainerizationOS.Mount(
+                type: "sysfs",
+                source: "sysfs",
+                target: "/sys",
+                options: []
+            ),
+            ContainerizationOS.Mount(
+                type: "cgroup2",
+                source: "none",
+                target: "/sys/fs/cgroup",
+                options: []
+            ),
+        ]
+
+        for mnt in mounts {
+            log.info("mounting \(mnt.target)")
+            try mnt.mount(createWithPerms: 0o755)
+        }
+        try Binfmt.mount()
+
+        let cgManager = Cgroup2Manager(
+            group: URL(filePath: "/vminitd"),
+            logger: log
+        )
+        try cgManager.create()
+        try cgManager.toggleAllAvailableControllers(enable: true)
+
+        // Set memory.high threshold to 75 MiB
+        let threshold: UInt64 = 75 * 1024 * 1024
+        try cgManager.setMemoryHigh(bytes: threshold)
+        try cgManager.addProcess(pid: getpid())
+
+        let memoryMonitor = try MemoryMonitor(
+            cgroupManager: cgManager,
+            threshold: threshold,
+            logger: log
+        ) { [log] (currentUsage, highMark) in
+            log.warning(
+                "vminitd memory threshold exceeded",
+                metadata: [
+                    "threshold_bytes": "\(threshold)",
+                    "current_bytes": "\(currentUsage)",
+                    "high_events_total": "\(highMark)",
+                ])
+        }
+
+        let t = Thread { [log] in
+            do {
+                try memoryMonitor.run()
+            } catch {
+                log.error("memory monitor failed: \(error)")
+            }
+        }
+        t.start()
+
+        let eg = MultiThreadedEventLoopGroup(numberOfThreads: System.coreCount)
+        let server = Initd(log: log, group: eg)
+
+        do {
+            log.info("serving vminitd API")
+            try await server.serve(port: vsockPort)
+            log.info("vminitd API returned, syncing filesystems")
+
+            #if os(Linux)
+            Musl.sync()
+            #endif
+        } catch {
+            log.error("vminitd boot error \(error)")
+
+            #if os(Linux)
+            Musl.sync()
+            #endif
+
+            exit(1)
+        }
+    }
+
+    /// Debug-build helper: starts /sbin/vminitd as a child with FOREGROUND=1, waits for it, and logs its exit code.
+    private static func runInForeground(_ log: Logger) throws {
+        log.info("running vminitd under pid1")
+        var command = Command("/sbin/vminitd")
+        command.attrs = .init(setsid: true)
+        command.stdin = .standardInput
+        command.stdout = .standardOutput
+        command.stderr = .standardError
+        command.environment = ["\(foregroundEnvVar)=1"]
+        try command.start()
+        let exitCode = try command.wait()
+        log.info("child process exited with code: \(exitCode)")
+    }
+
+    /// Sets the soft and hard RLIMIT_NOFILE limits to 65536; throws a POSIXError built from errno if either call fails.
+    private static func adjustLimits() throws {
+        var limits = rlimit()
+        guard getrlimit(RLIMIT_NOFILE, &limits) == 0 else {
+            throw POSIXError(.init(rawValue: errno)!)
+        }
+        limits.rlim_cur = 65536
+        limits.rlim_max = 65536
+        guard setrlimit(RLIMIT_NOFILE, &limits) == 0 else {
+            throw POSIXError(.init(rawValue: errno)!)
+        }
+    }
+}
diff --git a/vminitd/Sources/vminitd/PauseCommand.swift b/vminitd/Sources/vminitd/PauseCommand.swift
new file mode 100644
index 00000000..c6f2b91b
--- /dev/null
+++ b/vminitd/Sources/vminitd/PauseCommand.swift
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2025 Apple Inc. and the Containerization project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ let sigintSource = DispatchSource.makeSignalSource(signal: SIGINT) + sigintSource.setEventHandler { + log.info("Shutting down, got SIGINT") + Musl.exit(0) + } + sigintSource.resume() + + let sigtermSource = DispatchSource.makeSignalSource(signal: SIGTERM) + sigtermSource.setEventHandler { + log.info("Shutting down, got SIGTERM") + Musl.exit(0) + } + sigtermSource.resume() + + let sigchldSource = DispatchSource.makeSignalSource(signal: SIGCHLD) + sigchldSource.setEventHandler { + var status: Int32 = 0 + while waitpid(-1, &status, WNOHANG) > 0 {} + } + sigchldSource.resume() + + log.info("pause container running, waiting for signals...") + + while true { + Musl.pause() + } + + log.error("Error: infinite loop terminated") + Musl.exit(42) + } +} diff --git a/vminitd/Sources/vminitd/Server+GRPC.swift b/vminitd/Sources/vminitd/Server+GRPC.swift index 8be68389..ba1f609b 100644 --- a/vminitd/Sources/vminitd/Server+GRPC.swift +++ b/vminitd/Sources/vminitd/Server+GRPC.swift @@ -26,7 +26,6 @@ import Logging import NIOCore import NIOPosix import SwiftProtobuf -import _NIOFileSystem private let _setenv = Foundation.setenv