From 7033950efda9484a50720ac6c8ae5c3ae59528ef Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Mon, 23 Feb 2026 17:02:36 -0800 Subject: [PATCH 01/11] Add StoppedState and manualStopped in ContainerState --- .../Server/Containers/ContainersService.swift | 58 ++++++++++++++----- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift index dc13e0623..dbe1d422f 100644 --- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift +++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift @@ -31,11 +31,19 @@ import Logging import SystemPackage public actor ContainersService { + struct StoppedState { + var startError: Error? + var exitStatus: ExitStatus? + } + struct ContainerState { var snapshot: ContainerSnapshot var client: SandboxClient? var allocatedAttachments: [AllocatedAttachment] + var stoppedState: StoppedState? + var manualStopped: Bool = false + func getClient() throws -> SandboxClient { guard let client else { var message = "no sandbox client exists" @@ -108,7 +116,7 @@ public actor ContainersService { networks: [], startedDate: nil ), - allocatedAttachments: [] + allocatedAttachments: [], ) results[config.id] = state guard runtimePlugins.first(where: { $0.name == config.runtimeHandler }) != nil else { @@ -370,7 +378,10 @@ public actor ContainersService { networks: [], startedDate: nil ) - await self.setContainerState(configuration.id, ContainerState(snapshot: snapshot, allocatedAttachments: []), context: context) + await self.setContainerState( + configuration.id, + ContainerState(snapshot: snapshot, allocatedAttachments: []), + context: context) } catch { throw error } @@ -468,6 +479,10 @@ public actor ContainersService { await self.exitMonitor.stopTracking(id: id) try? 
ServiceManager.deregister(fullServiceLabel: label) + + state.stoppedState = StoppedState(startError: error) + await self.setContainerState(id, state, context: context) + throw error } } @@ -570,6 +585,10 @@ public actor ContainersService { } catch { await self.exitMonitor.stopTracking(id: id) try? await client.stop(options: ContainerStopOptions.default) + + state.stoppedState = StoppedState(startError: error) + await self.setContainerState(id, state, context: context) + throw error } } @@ -622,24 +641,30 @@ public actor ContainersService { ) } - let state = try self._getContainerState(id: id) + try await self.lock.withLock(logMetadata: ["acquirer": "\(#function)", "id": "\(id)"]) { context in + var state = try await self.getContainerState(id: id, context: context) - // Stop should be idempotent. - let client: SandboxClient - do { - client = try state.getClient() - } catch { - return - } + state.manualStopped = true + await self.setContainerState(id, state, context: context) - do { - try await client.stop(options: options) - } catch let err as ContainerizationError { - if err.code != .interrupted { - throw err + // Stop should be idempotent. 
+ let client: SandboxClient + do { + client = try state.getClient() + } catch { + return + } + + do { + try await client.stop(options: options) + } catch let err as ContainerizationError { + if err.code != .interrupted { + throw err + } } + + try await self.handleContainerExit(id: id, code: nil, context: context) } - try await handleContainerExit(id: id) } public func dial(id: String, port: UInt32) async throws -> FileHandle { @@ -976,6 +1001,7 @@ public actor ContainersService { state.snapshot.networks = [] state.client = nil state.allocatedAttachments = [] + state.stoppedState = StoppedState(exitStatus: code) await self.setContainerState(id, state, context: context) let options = try getContainerCreationOptions(id: id) From ae1aad80cdcb09e0054d0a551bd580228f291f71 Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Mon, 23 Feb 2026 17:21:57 -0800 Subject: [PATCH 02/11] Create and expose restart policy --- .../Container/ContainerCreate.swift | 2 +- .../Container/ContainerRun.swift | 2 +- .../Container/ContainerCreateOptions.swift | 23 +++++++++++++++++-- .../ContainerAPIService/Client/Flags.swift | 6 +++++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/Sources/ContainerCommands/Container/ContainerCreate.swift b/Sources/ContainerCommands/Container/ContainerCreate.swift index ac26d205a..b48642c65 100644 --- a/Sources/ContainerCommands/Container/ContainerCreate.swift +++ b/Sources/ContainerCommands/Container/ContainerCreate.swift @@ -82,7 +82,7 @@ extension Application { log: log ) - let options = ContainerCreateOptions(autoRemove: managementFlags.remove) + let options = ContainerCreateOptions(autoRemove: managementFlags.remove, restartPolicy: managementFlags.restart) let client = ContainerClient() try await client.create(configuration: ck.0, options: options, kernel: ck.1, initImage: ck.2) diff --git a/Sources/ContainerCommands/Container/ContainerRun.swift b/Sources/ContainerCommands/Container/ContainerRun.swift index c83fbf790..872d39de1 100644 --- 
a/Sources/ContainerCommands/Container/ContainerRun.swift +++ b/Sources/ContainerCommands/Container/ContainerRun.swift @@ -109,7 +109,7 @@ extension Application { progress.set(description: "Starting container") - let options = ContainerCreateOptions(autoRemove: managementFlags.remove) + let options = ContainerCreateOptions(autoRemove: managementFlags.remove, restartPolicy: managementFlags.restart) try await client.create( configuration: ck.0, options: options, diff --git a/Sources/ContainerResource/Container/ContainerCreateOptions.swift b/Sources/ContainerResource/Container/ContainerCreateOptions.swift index dd9da217a..af6edf24b 100644 --- a/Sources/ContainerResource/Container/ContainerCreateOptions.swift +++ b/Sources/ContainerResource/Container/ContainerCreateOptions.swift @@ -14,13 +14,32 @@ // limitations under the License. //===----------------------------------------------------------------------===// +public enum RestartPolicy: String, Sendable, Codable { + case no + case onFailure + case always +} + public struct ContainerCreateOptions: Codable, Sendable { public let autoRemove: Bool + public let restartPolicy: RestartPolicy - public init(autoRemove: Bool) { + public init(autoRemove: Bool, restartPolicy: RestartPolicy) { self.autoRemove = autoRemove + self.restartPolicy = restartPolicy + } + + public static let `default` = ContainerCreateOptions(autoRemove: false, restartPolicy: .no) + + enum CodingKeys: String, CodingKey { + case autoRemove + case restartPolicy } - public static let `default` = ContainerCreateOptions(autoRemove: false) + public init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + autoRemove = try container.decode(Bool.self, forKey: .autoRemove) + restartPolicy = try container.decodeIfPresent(RestartPolicy.self, forKey: .restartPolicy) ?? 
.no + } } diff --git a/Sources/Services/ContainerAPIService/Client/Flags.swift index b0e00db8a..89c4859d7 100644 --- a/Sources/Services/ContainerAPIService/Client/Flags.swift +++ b/Sources/Services/ContainerAPIService/Client/Flags.swift @@ -15,9 +15,12 @@ //===----------------------------------------------------------------------===// import ArgumentParser +import ContainerResource import ContainerizationError import Foundation +extension RestartPolicy: ExpressibleByArgument {} + public struct Flags { public struct Logging: ParsableArguments { public init() {} @@ -299,6 +302,9 @@ public struct Flags { @Flag(name: [.customLong("rm"), .long], help: "Remove the container after it stops") public var remove = false + @Option(name: .long, help: "Restart policy when the container exits") + public var restart: RestartPolicy = .no + @Flag(name: .long, help: "Enable Rosetta in the container") public var rosetta = false From 0cecfbee76bf393fb0cc94f0304dddf88b6e5da5 Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Tue, 24 Feb 2026 10:51:29 -0800 Subject: [PATCH 03/11] Add RuntimeStatus.bootstrapped Containers move to .bootstrapped state after assigning SandboxClient. The following invariants hold: 1. If .stopped, state.client = nil 2. If .bootstrapped, state.client != nil. It'd be better to tightly couple the state and client. --- Sources/ContainerResource/Container/RuntimeStatus.swift | 2 ++ .../Server/Containers/ContainersService.swift | 1 + 2 files changed, 3 insertions(+) diff --git a/Sources/ContainerResource/Container/RuntimeStatus.swift b/Sources/ContainerResource/Container/RuntimeStatus.swift index 88900735f..d036f6363 100644 --- a/Sources/ContainerResource/Container/RuntimeStatus.swift +++ b/Sources/ContainerResource/Container/RuntimeStatus.swift @@ -22,6 +22,8 @@ public enum RuntimeStatus: String, CaseIterable, Sendable, Codable { case unknown /// The object is currently stopped.
case stopped + /// The object is currently bootstrapped. + case bootstrapped /// The object is currently running. case running /// The object is currently stopping. diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift index dbe1d422f..5527c247f 100644 --- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift +++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift @@ -456,6 +456,7 @@ public actor ContainersService { state.client = sandboxClient state.allocatedAttachments = allocatedAttachments + state.snapshot.status = .bootstrapped await self.setContainerState(id, state, context: context) } catch { for allocatedAttach in allocatedAttachments { From 3b675c44278be1e29c7fcee165a482a03cf2f308 Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Tue, 24 Feb 2026 15:37:06 -0800 Subject: [PATCH 04/11] Split RuntimeStatus to ContainerStatus and SandboxStatus Then, add `restarting` state to ContainerStatus --- .../Builder/BuilderStart.swift | 4 +-- .../Container/ContainerList.swift | 2 +- .../Container/ContainerListFilters.swift | 4 +-- .../Container/ContainerSnapshot.swift | 4 +-- ...timeStatus.swift => ContainerStatus.swift} | 4 ++- .../Container/SandboxStatus.swift | 29 +++++++++++++++++++ .../Client/SandboxSnapshot.swift | 4 +-- .../Server/SandboxService.swift | 4 +-- 8 files changed, 43 insertions(+), 12 deletions(-) rename Sources/ContainerResource/Container/{RuntimeStatus.swift => ContainerStatus.swift} (89%) create mode 100644 Sources/ContainerResource/Container/SandboxStatus.swift diff --git a/Sources/ContainerCommands/Builder/BuilderStart.swift b/Sources/ContainerCommands/Builder/BuilderStart.swift index 855d8b4ca..52e45c2a2 100644 --- a/Sources/ContainerCommands/Builder/BuilderStart.swift +++ b/Sources/ContainerCommands/Builder/BuilderStart.swift @@ -177,7 +177,7 @@ extension Application { // If 
they changed, stop and delete the existing builder try await client.stop(id: existingContainer.id) try await client.delete(id: existingContainer.id) - case .stopped: + case .stopped, .bootstrapped: // If the builder is stopped and matches our requirements, start it // Otherwise, delete it and create a new one guard imageChanged || cpuChanged || memChanged || envChanged || dnsChanged else { @@ -190,7 +190,7 @@ extension Application { .invalidState, message: "builder is stopping, please wait until it is fully stopped before proceeding" ) - case .unknown: + case .unknown, .restarting: break } } diff --git a/Sources/ContainerCommands/Container/ContainerList.swift b/Sources/ContainerCommands/Container/ContainerList.swift index 53b110bfa..c64b14355 100644 --- a/Sources/ContainerCommands/Container/ContainerList.swift +++ b/Sources/ContainerCommands/Container/ContainerList.swift @@ -99,7 +99,7 @@ extension ContainerSnapshot { } struct PrintableContainer: Codable { - let status: RuntimeStatus + let status: ContainerStatus let configuration: ContainerConfiguration let networks: [Attachment] let startedDate: Date? diff --git a/Sources/ContainerResource/Container/ContainerListFilters.swift b/Sources/ContainerResource/Container/ContainerListFilters.swift index 038b76d7d..ab2273f3f 100644 --- a/Sources/ContainerResource/Container/ContainerListFilters.swift +++ b/Sources/ContainerResource/Container/ContainerListFilters.swift @@ -21,7 +21,7 @@ public struct ContainerListFilters: Sendable, Codable { /// Filter by container IDs. If non-empty, only containers with matching IDs are returned. public var ids: [String] /// Filter by container status. - public var status: RuntimeStatus? + public var status: ContainerStatus? /// Filter by labels. All specified labels must match. public var labels: [String: String] @@ -30,7 +30,7 @@ public struct ContainerListFilters: Sendable, Codable { public init( ids: [String] = [], - status: RuntimeStatus? = nil, + status: ContainerStatus? 
= nil, labels: [String: String] = [:] ) { self.ids = ids diff --git a/Sources/ContainerResource/Container/ContainerSnapshot.swift b/Sources/ContainerResource/Container/ContainerSnapshot.swift index bae992423..11efc6c04 100644 --- a/Sources/ContainerResource/Container/ContainerSnapshot.swift +++ b/Sources/ContainerResource/Container/ContainerSnapshot.swift @@ -34,7 +34,7 @@ public struct ContainerSnapshot: Codable, Sendable { } /// The runtime status of the container. - public var status: RuntimeStatus + public var status: ContainerStatus /// Network interfaces attached to the sandbox that are provided to the container. public var networks: [Attachment] /// When the container was started. @@ -42,7 +42,7 @@ public struct ContainerSnapshot: Codable, Sendable { public init( configuration: ContainerConfiguration, - status: RuntimeStatus, + status: ContainerStatus, networks: [Attachment], startedDate: Date? = nil ) { diff --git a/Sources/ContainerResource/Container/RuntimeStatus.swift b/Sources/ContainerResource/Container/ContainerStatus.swift similarity index 89% rename from Sources/ContainerResource/Container/RuntimeStatus.swift rename to Sources/ContainerResource/Container/ContainerStatus.swift index d036f6363..9b9423068 100644 --- a/Sources/ContainerResource/Container/RuntimeStatus.swift +++ b/Sources/ContainerResource/Container/ContainerStatus.swift @@ -17,11 +17,13 @@ import Foundation /// Runtime status for a sandbox or container. -public enum RuntimeStatus: String, CaseIterable, Sendable, Codable { +public enum ContainerStatus: String, CaseIterable, Sendable, Codable { /// The object is in an unknown status. case unknown /// The object is currently stopped. case stopped + /// The object is waiting to be restarted. + case restarting /// The object is currently bootstrapped. case bootstrapped /// The object is currently running. 
diff --git a/Sources/ContainerResource/Container/SandboxStatus.swift b/Sources/ContainerResource/Container/SandboxStatus.swift new file mode 100644 index 000000000..97dd27d70 --- /dev/null +++ b/Sources/ContainerResource/Container/SandboxStatus.swift @@ -0,0 +1,29 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation + +/// Runtime status for a sandbox or container. +public enum SandboxStatus: String, CaseIterable, Sendable, Codable { + /// The object is in an unknown status. + case unknown + /// The object is currently stopped. + case stopped + /// The object is currently running. + case running + /// The object is currently stopping. + case stopping +} diff --git a/Sources/Services/ContainerSandboxService/Client/SandboxSnapshot.swift b/Sources/Services/ContainerSandboxService/Client/SandboxSnapshot.swift index 1ba312b8c..9e1841421 100644 --- a/Sources/Services/ContainerSandboxService/Client/SandboxSnapshot.swift +++ b/Sources/Services/ContainerSandboxService/Client/SandboxSnapshot.swift @@ -19,14 +19,14 @@ import ContainerResource /// A snapshot of a sandbox and its resources. public struct SandboxSnapshot: Codable, Sendable { /// The runtime status of the sandbox. 
- public var status: RuntimeStatus + public var status: SandboxStatus /// Network attachments for the sandbox. public var networks: [Attachment] /// Containers placed in the sandbox. public var containers: [ContainerSnapshot] public init( - status: RuntimeStatus, + status: SandboxStatus, networks: [Attachment], containers: [ContainerSnapshot] ) { diff --git a/Sources/Services/ContainerSandboxService/Server/SandboxService.swift b/Sources/Services/ContainerSandboxService/Server/SandboxService.swift index 0abb93461..e6172e05e 100644 --- a/Sources/Services/ContainerSandboxService/Server/SandboxService.swift +++ b/Sources/Services/ContainerSandboxService/Server/SandboxService.swift @@ -400,7 +400,7 @@ public actor SandboxService { self.log.debug("enter", metadata: ["func": "\(#function)"]) defer { self.log.debug("exit", metadata: ["func": "\(#function)"]) } - var status: RuntimeStatus = .unknown + var status: SandboxStatus = .unknown var networks: [Attachment] = [] var cs: ContainerSnapshot? @@ -416,7 +416,7 @@ public actor SandboxService { networks = ctr.attachments cs = ContainerSnapshot( configuration: ctr.config, - status: RuntimeStatus.running, + status: .running, networks: networks ) } From c5b086e5fdb09277affd500a899a36d5dc45e72a Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Tue, 24 Feb 2026 15:43:13 -0800 Subject: [PATCH 05/11] Add restart scheduler to ContainersService ContainersService spawns an unstructured Task for the restart scheduler, which receives stopped container ids through an async queue and restarts them if they match the restart conditions. While it doesn't run under lock, it doesn't introduce a new level of race, as all the actual operations changing container state (e.g., `bootstrap`) are lock protected.
--- .../Helpers/APIServer/APIServer+Start.swift | 6 +- .../Server/Containers/ContainersService.swift | 67 +++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/Sources/Helpers/APIServer/APIServer+Start.swift b/Sources/Helpers/APIServer/APIServer+Start.swift index 62fa0296f..be49aa64e 100644 --- a/Sources/Helpers/APIServer/APIServer+Start.swift +++ b/Sources/Helpers/APIServer/APIServer+Start.swift @@ -61,7 +61,7 @@ extension APIServer { var routes = [XPCRoute: XPCServer.RouteHandler]() let pluginLoader = try initializePluginLoader(log: log) try await initializePlugins(pluginLoader: pluginLoader, log: log, routes: &routes) - let containersService = try initializeContainersService( + let containersService = try await initializeContainersService( pluginLoader: pluginLoader, log: log, routes: &routes @@ -244,7 +244,7 @@ extension APIServer { routes[XPCRoute.getDefaultKernel] = harness.getDefaultKernel } - private func initializeContainersService(pluginLoader: PluginLoader, log: Logger, routes: inout [XPCRoute: XPCServer.RouteHandler]) throws -> ContainersService { + private func initializeContainersService(pluginLoader: PluginLoader, log: Logger, routes: inout [XPCRoute: XPCServer.RouteHandler]) async throws -> ContainersService { log.info("initializing containers service") let service = try ContainersService( @@ -271,6 +271,8 @@ extension APIServer { routes[XPCRoute.containerDiskUsage] = harness.diskUsage routes[XPCRoute.containerExport] = harness.export + async let _ = try service.runRestartScheduler() + return service } diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift index 5527c247f..22c16404e 100644 --- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift +++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift @@ -59,6 +59,10 @@ public actor ContainersService 
{ private static let machServicePrefix = "com.apple.container" private static let launchdDomainString = try! ServiceManager.getDomainString() + private let exitQueue: AsyncStream + private let exitQueueContinuation: AsyncStream.Continuation + private var restartScheduler: Task? + private let log: Logger private let debugHelpers: Bool private let containerRoot: URL @@ -87,7 +91,13 @@ public actor ContainersService { self.log = log self.debugHelpers = debugHelpers self.runtimePlugins = pluginLoader.findPlugins().filter { $0.hasType(.runtime) } + + (self.exitQueue, self.exitQueueContinuation) = AsyncStream.makeStream(of: String.self) self.containers = try Self.loadAtBoot(root: containerRoot, loader: pluginLoader, log: log) + + for id in self.containers.keys { + self.exitQueueContinuation.yield(id) + } } public func setNetworksService(_ service: NetworksService) async { @@ -138,6 +148,61 @@ public actor ContainersService { return results } + public func runRestartScheduler() throws { + log.debug( + "ContainersService: enter", + metadata: ["func": "\(#function)"] + ) + defer { + log.debug( + "ContainersService: exit", + metadata: ["func": "\(#function)"] + ) + } + + guard restartScheduler == nil else { + throw ContainerizationError(.invalidState, message: "already running restart scheduler") + } + + restartScheduler = Task { + for await id in self.exitQueue { + do { + let state = try self._getContainerState(id: id) + + guard state.snapshot.status == .stopped else { + throw ContainerizationError(.invalidState, message: "container not stopped: '\(id)'") + } + + let options = try self.getContainerCreationOptions(id: id) + + guard options.autoRemove == false else { + continue + } + + let startFailed = state.stoppedState?.startError != nil + let exitedWithError = (state.stoppedState?.exitStatus?.exitCode ?? 
0) != 0 + let manualStopped = state.manualStopped + + switch options.restartPolicy { + case .onFailure where !startFailed && !manualStopped && exitedWithError: + break + case .always where !startFailed && !manualStopped: + break + case _: + continue + } + + try await bootstrap(id: id, stdio: [FileHandle?](repeating: nil, count: 3)) + try await startProcess(id: id, processID: id) + } catch { + log.error( + "failed to restart container", + metadata: ["id": "\(id)", "error": "\(error)"]) + } + } + } + } + /// List containers matching the given filters. public func list(filters: ContainerListFilters = .all) async throws -> [ContainerSnapshot] { log.debug( @@ -1008,6 +1073,8 @@ public actor ContainersService { let options = try getContainerCreationOptions(id: id) if options.autoRemove { try await self.cleanUp(id: id, context: context) + } else { + exitQueueContinuation.yield(id) } } From 55527460f7cbc43fd4de9a302496a978da7acb29 Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Tue, 24 Feb 2026 16:14:44 -0800 Subject: [PATCH 06/11] Add backOff and wait before restarting --- .../Server/Containers/ContainersService.swift | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift index 22c16404e..1c7000381 100644 --- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift +++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift @@ -37,12 +37,16 @@ public actor ContainersService { } struct ContainerState { + private static let initialBackOff = Duration.milliseconds(100) + private static let maxBackOff = Duration.seconds(10) + var snapshot: ContainerSnapshot var client: SandboxClient? var allocatedAttachments: [AllocatedAttachment] var stoppedState: StoppedState? var manualStopped: Bool = false + var backOff: Duration? 
func getClient() throws -> SandboxClient { guard let client else { @@ -54,6 +58,16 @@ public actor ContainersService { } return client } + + mutating func setStartError(error: Error) { + stoppedState = StoppedState(startError: error) + backOff = nil + } + + mutating func setExitStatus(exitStatus: ExitStatus?) { + stoppedState = StoppedState(exitStatus: exitStatus) + backOff = manualStopped ? nil : backOff.map { min($0 * 2, Self.maxBackOff) } ?? Self.initialBackOff + } } private static let machServicePrefix = "com.apple.container" @@ -192,6 +206,10 @@ public actor ContainersService { continue } + if let backOff = state.backOff { + try await Task.sleep(for: backOff) + } + try await bootstrap(id: id, stdio: [FileHandle?](repeating: nil, count: 3)) try await startProcess(id: id, processID: id) } catch { @@ -546,7 +564,7 @@ public actor ContainersService { await self.exitMonitor.stopTracking(id: id) try? ServiceManager.deregister(fullServiceLabel: label) - state.stoppedState = StoppedState(startError: error) + state.setStartError(error: error) await self.setContainerState(id, state, context: context) throw error @@ -652,7 +670,7 @@ public actor ContainersService { await self.exitMonitor.stopTracking(id: id) try? 
await client.stop(options: ContainerStopOptions.default) - state.stoppedState = StoppedState(startError: error) + state.setStartError(error: error) await self.setContainerState(id, state, context: context) throw error @@ -682,9 +700,15 @@ public actor ContainersService { ) } - let state = try self._getContainerState(id: id) - let client = try state.getClient() - try await client.kill(processID, signal: signal) + try await self.lock.withLock(logMetadata: ["acquirer": "\(#function)", "id": "\(id)"]) { context in + var state = try await self.getContainerState(id: id, context: context) + + state.manualStopped = true + await self.setContainerState(id, state, context: context) + + let client = try state.getClient() + try await client.kill(processID, signal: signal) + } } /// Stop all containers inside the sandbox, aborting any processes currently @@ -1067,7 +1091,7 @@ public actor ContainersService { state.snapshot.networks = [] state.client = nil state.allocatedAttachments = [] - state.stoppedState = StoppedState(exitStatus: code) + state.setExitStatus(exitStatus: code) await self.setContainerState(id, state, context: context) let options = try getContainerCreationOptions(id: id) From 743a2a40b4f285d84062e05215c3ae2fe4136c1e Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Tue, 24 Feb 2026 16:20:56 -0800 Subject: [PATCH 07/11] Fix backOff update --- .../Server/Containers/ContainersService.swift | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift index 1c7000381..5c651d8f9 100644 --- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift +++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift @@ -64,9 +64,15 @@ public actor ContainersService { backOff = nil } - mutating func setExitStatus(exitStatus: ExitStatus?) 
{ + mutating func setExitStatus(exitStatus: ExitStatus?, restartPolicy: RestartPolicy) { stoppedState = StoppedState(exitStatus: exitStatus) - backOff = manualStopped ? nil : backOff.map { min($0 * 2, Self.maxBackOff) } ?? Self.initialBackOff + switch restartPolicy { + case .onFailure where !manualStopped && (exitStatus?.exitCode ?? 0) != 0, + .always where !manualStopped: + backOff = backOff.map { min($0 * 2, Self.maxBackOff) } ?? Self.initialBackOff + case _: + backOff = nil + } } } @@ -1087,14 +1093,15 @@ public actor ContainersService { } } + let options = try getContainerCreationOptions(id: id) + state.snapshot.status = .stopped state.snapshot.networks = [] state.client = nil state.allocatedAttachments = [] - state.setExitStatus(exitStatus: code) + state.setExitStatus(exitStatus: code, restartPolicy: options.restartPolicy) await self.setContainerState(id, state, context: context) - let options = try getContainerCreationOptions(id: id) if options.autoRemove { try await self.cleanUp(id: id, context: context) } else { From 75f35ac027cd19a39cb923bf2cfbe1f6ce2a7b41 Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Tue, 24 Feb 2026 17:05:08 -0800 Subject: [PATCH 08/11] Add stability call to reset backOff This reuses ExitMonitor, which was not initially for this. It might be better to come up with generic Monitor structure? 
--- .../Server/Containers/ContainersService.swift | 104 ++++++++++++------ 1 file changed, 69 insertions(+), 35 deletions(-) diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift index 5c651d8f9..437667654 100644 --- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift +++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift @@ -39,6 +39,7 @@ public actor ContainersService { struct ContainerState { private static let initialBackOff = Duration.milliseconds(100) private static let maxBackOff = Duration.seconds(10) + public static let stabilityCall = Duration.seconds(10) var snapshot: ContainerSnapshot var client: SandboxClient? @@ -82,6 +83,7 @@ public actor ContainersService { private let exitQueue: AsyncStream private let exitQueueContinuation: AsyncStream.Continuation private var restartScheduler: Task? + private var stabilityMonitor: ExitMonitor? private let log: Logger private let debugHelpers: Bool @@ -184,44 +186,64 @@ public actor ContainersService { throw ContainerizationError(.invalidState, message: "already running restart scheduler") } + stabilityMonitor = ExitMonitor(log: log) restartScheduler = Task { for await id in self.exitQueue { - do { - let state = try self._getContainerState(id: id) - - guard state.snapshot.status == .stopped else { - throw ContainerizationError(.invalidState, message: "container not stopped: '\(id)'") - } - - let options = try self.getContainerCreationOptions(id: id) - - guard options.autoRemove == false else { - continue - } - - let startFailed = state.stoppedState?.startError != nil - let exitedWithError = (state.stoppedState?.exitStatus?.exitCode ?? 
0) != 0 - let manualStopped = state.manualStopped - - switch options.restartPolicy { - case .onFailure where !startFailed && !manualStopped && exitedWithError: - break - case .always where !startFailed && !manualStopped: - break - case _: - continue - } - - if let backOff = state.backOff { - try await Task.sleep(for: backOff) + Task { + do { + await stabilityMonitor?.stopTracking(id: id) + + let state = try self._getContainerState(id: id) + + guard state.snapshot.status == .stopped else { + throw ContainerizationError(.invalidState, message: "container not stopped: '\(id)'") + } + + let options = try self.getContainerCreationOptions(id: id) + + guard options.autoRemove == false else { + return + } + + let startFailed = state.stoppedState?.startError != nil + let exitedWithError = (state.stoppedState?.exitStatus?.exitCode ?? 0) != 0 + let manualStopped = state.manualStopped + + switch options.restartPolicy { + case .onFailure where !startFailed && !manualStopped && exitedWithError: + break + case .always where !startFailed && !manualStopped: + break + case _: + return + } + + if let backOff = state.backOff { + try await Task.sleep(for: backOff) + } + + try await stabilityMonitor?.registerProcess( + id: id, + onExit: { id, code in + guard code.exitCode == 0 else { + return + } + try? 
await self.resetBackOff(id: id) + } + ) + try await stabilityMonitor?.track(id: id) { + try await Task.sleep(for: ContainerState.stabilityCall) + return ExitStatus(exitCode: 0) + } + + try await bootstrap(id: id, stdio: [FileHandle?](repeating: nil, count: 3)) + try await startProcess(id: id, processID: id) + } catch { + try await kill(id: id, processID: id, signal: Int64(SIGKILL)) + log.error( + "failed to restart container", + metadata: ["id": "\(id)", "error": "\(error)"]) } - - try await bootstrap(id: id, stdio: [FileHandle?](repeating: nil, count: 3)) - try await startProcess(id: id, processID: id) - } catch { - log.error( - "failed to restart container", - metadata: ["id": "\(id)", "error": "\(error)"]) } } } @@ -1109,6 +1131,18 @@ public actor ContainersService { } } + private func resetBackOff(id: String) async throws { + try await self.lock.withLock { context in + var state = try await self.getContainerState(id: id, context: context) + guard state.snapshot.status == .running else { + return + } + + state.backOff = nil + await self.setContainerState(id, state, context: context) + } + } + private static func fullLaunchdServiceLabel(runtimeName: String, instanceId: String) -> String { "\(Self.launchdDomainString)/\(Self.machServicePrefix).\(runtimeName).\(instanceId)" } From 78cfc1ab25971e9c6e201b1eabaf1a9485d34eb6 Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Tue, 24 Feb 2026 19:11:32 -0800 Subject: [PATCH 09/11] Add TestCLIRunRestart --- .../Subcommands/Run/TestCLIRunRestart.swift | 197 ++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift diff --git a/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift b/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift new file mode 100644 index 000000000..ea55f61af --- /dev/null +++ b/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift @@ -0,0 +1,197 @@ +//===----------------------------------------------------------------------===// 
+// Copyright © 2026 Apple Inc. and the container project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import ContainerResource +import ContainerizationError +import Foundation +import Testing + +class TestCLIRunRestart: CLITest { + func getTestName() -> String { + Test.current!.name.trimmingCharacters(in: ["(", ")"]).lowercased() + } + + // Run a container that exits with the given code on first run, then sleeps forever on restart. + // This allows waitForContainerRunning to reliably catch the restarted container. + private func runWithRestartOnce(name: String, policy: RestartPolicy, exitCode: Int) throws { + try doLongRun( + name: name, + args: ["--restart", policy.rawValue], + containerArgs: ["sh", "-c", "if [ ! -f /tmp/restarted ]; then touch /tmp/restarted; exit \(exitCode); else sleep infinity; fi"], + autoRemove: false + ) + } + + @Test func testRestartNo() async throws { + let name = getTestName() + defer { try? 
doRemove(name: name, force: true) } + + try runWithRestartOnce(name: name, policy: .no, exitCode: 0) + + // Give a moment for any (unexpected) restart to occur + try await Task.sleep(for: .seconds(3)) + + let status = try getContainerStatus(name) + #expect(status == "stopped", "expected container with restart policy 'no' to remain stopped, got '\(status)'") + } + + @Test func testRestartOnFailure() async throws { + let failing = "\(getTestName())-exit-fail" + + // Non-zero exit: should restart + try runWithRestartOnce(name: failing, policy: .onFailure, exitCode: 1) + defer { try? doRemove(name: failing, force: true) } + + try waitForContainerRunning(failing) + var status = try getContainerStatus(failing) + #expect(status == "running", "expected container with 'onFailure' policy to restart after non-zero exit, got '\(status)'") + + try await Task.sleep(for: .seconds(1)) + try doStop(name: failing) + + try await Task.sleep(for: .seconds(10)) + status = try getContainerStatus(failing) + #expect(status == "stopped", "expected container with 'onFailure' policy to not restart after manual stop, got '\(status)'") + try doRemove(name: failing, force: true) + + let succeeding = "\(getTestName())-exit-succeed" + + try runWithRestartOnce(name: succeeding, policy: .onFailure, exitCode: 0) + defer { try? doRemove(name: succeeding, force: true) } + + try await Task.sleep(for: .seconds(3)) + status = try getContainerStatus(succeeding) + #expect(status == "stopped", "expected container with 'onFailure' policy to not restart after zero exit, got '\(status)'") + } + + @Test func testRestartAlways() async throws { + let name = getTestName() + + try runWithRestartOnce(name: name, policy: .always, exitCode: 0) + defer { try? 
doRemove(name: name, force: true) } + + try waitForContainerRunning(name) + var status = try getContainerStatus(name) + #expect(status == "running", "expected container with 'always' policy to restart after zero exit, got '\(status)'") + + try await Task.sleep(for: .seconds(1)) + try doStop(name: name) + + try await Task.sleep(for: .seconds(3)) + status = try getContainerStatus(name) + #expect(status == "stopped", "expected container with 'always' policy to not restart after manual stop, got '\(status)'") + } + + @Test func testRestartMultiple() async throws { + // Multiple containers restarting must not block each other + let name1 = "\(getTestName())1" + let name2 = "\(getTestName())2" + + try runWithRestartOnce(name: name1, policy: .always, exitCode: 1) + try runWithRestartOnce(name: name2, policy: .always, exitCode: 1) + defer { + try? doStop(name: name1) + try? doStop(name: name2) + try? doRemove(name: name1, force: true) + try? doRemove(name: name2, force: true) + } + + // Both should restart independently without blocking each other + try waitForContainerRunning(name1) + try waitForContainerRunning(name2) + + let status1 = try getContainerStatus(name1) + let status2 = try getContainerStatus(name2) + #expect(status1 == "running", "expected container1 to be running, got '\(status1)'") + #expect(status2 == "running", "expected container2 to be running, got '\(status2)'") + } + + @Test func testBackOff() async throws { + let name = getTestName() + + try doLongRun( + name: name, + args: ["--restart", RestartPolicy.always.rawValue], + containerArgs: ["sh", "-c", "sleep 1; exit 1;"], + autoRemove: false + ) + defer { + try? doStop(name: name) + try? doRemove(name: name, force: true) + } + + // Poll until running (first restart) + var samples: [Bool] = [] + for _ in 0..<25 { + if (try? 
getContainerStatus(name)) == "running" { + samples.append(true) + } else { + samples.append(false) + } + try await Task.sleep(for: .milliseconds(500)) + } + #expect(samples.contains(true), "container did not restart for the first time") + + // If backOff is doubling correctly, the gap between restarts grows each cycle. + // With 500ms polling and 1s sleep-before-exit per cycle, the backOff doubling + // (100ms -> 200ms -> 400ms -> 800ms -> ...) means consecutive false runs grow longer. + // The maximum run of consecutive false samples must be at least 4. + var maxConsecutiveFalse = 0 + var currentRun = 0 + for sample in samples { + if !sample { + currentRun += 1 + maxConsecutiveFalse = max(maxConsecutiveFalse, currentRun) + } else { + currentRun = 0 + } + } + #expect(maxConsecutiveFalse >= 4, "expected backOff to cause at least 4 consecutive stopped samples, got \(maxConsecutiveFalse)") + } + + @Test func testStabilityCall() async throws { + let name = getTestName() + + // Each run increments a counter by appending a line to /tmp/count. + // On run 8, backOff has accumulated to 10s (capped); the container then sleeps 12s + // to stay alive past the stabilityCall window, which resets backOff to nil. + // + // BackOff sequence: 100ms, 200ms, 400ms, 800ms, 1.6s, 3.2s, 6.4s, 10s (cap) + // Total expected wait: (1+0.1)+(1+0.2)+(1+0.4)+(1+0.8)+(1+1.6)+(1+3.2)+(1+6.4)+(1+10)+12+0.5 ≈ 43.2s + try doLongRun( + name: name, + args: ["--restart", RestartPolicy.always.rawValue], + containerArgs: [ + "sh", "-c", + "echo x >> /tmp/count; n=$(wc -l < /tmp/count); if [ \"$n\" -ge 8 ]; then sleep 12; fi; exit 1", + ], + autoRemove: false + ) + defer { + try? doStop(name: name) + try? doRemove(name: name, force: true) + } + + // Wait for all 8 cycles + stability window + a small buffer. 
+ // (1+0.1)+(1+0.2)+(1+0.4)+(1+0.8)+(1+1.6)+(1+3.2)+(1+6.4)+(1+10)+12+0.5 ≈ 43.2s + try await Task.sleep(for: .seconds(45)) + + // At this point run 8 has slept 12s, triggering stabilityCall and resetting backOff. + // The container should be running (restarted quickly after backOff reset). + let status = try getContainerStatus(name) + #expect(status == "running", "expected container to be running after stabilityCall reset backOff, got '\(status)'") + } +} From aa251f5a1167d8ff95b124317b44eac41f02cc61 Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Tue, 24 Feb 2026 23:17:21 -0800 Subject: [PATCH 10/11] Use doKill instead of doStop doStop cannot guarantee the container is stopped in short term. --- .../Subcommands/Run/TestCLIRunRestart.swift | 18 +++++++++++------- Tests/CLITests/Utilities/CLITest.swift | 12 ++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift b/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift index ea55f61af..d88a83637 100644 --- a/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift +++ b/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift @@ -55,14 +55,16 @@ class TestCLIRunRestart: CLITest { try runWithRestartOnce(name: failing, policy: .onFailure, exitCode: 1) defer { try? 
doRemove(name: failing, force: true) } + // Give time for container to restart + try await Task.sleep(for: .seconds(3)) + try waitForContainerRunning(failing) var status = try getContainerStatus(failing) #expect(status == "running", "expected container with 'onFailure' policy to restart after non-zero exit, got '\(status)'") - try await Task.sleep(for: .seconds(1)) - try doStop(name: failing) + try doKill(name: failing) + try await Task.sleep(for: .seconds(3)) - try await Task.sleep(for: .seconds(10)) status = try getContainerStatus(failing) #expect(status == "stopped", "expected container with 'onFailure' policy to not restart after manual stop, got '\(status)'") try doRemove(name: failing, force: true) @@ -83,14 +85,16 @@ class TestCLIRunRestart: CLITest { try runWithRestartOnce(name: name, policy: .always, exitCode: 0) defer { try? doRemove(name: name, force: true) } + // Give time for container to restart + try await Task.sleep(for: .seconds(3)) + try waitForContainerRunning(name) var status = try getContainerStatus(name) #expect(status == "running", "expected container with 'always' policy to restart after zero exit, got '\(status)'") - try await Task.sleep(for: .seconds(1)) - try doStop(name: name) - + try doKill(name: name) try await Task.sleep(for: .seconds(3)) + status = try getContainerStatus(name) #expect(status == "stopped", "expected container with 'always' policy to not restart after manual stop, got '\(status)'") } @@ -135,7 +139,7 @@ class TestCLIRunRestart: CLITest { // Poll until running (first restart) var samples: [Bool] = [] - for _ in 0..<25 { + for _ in 0..<30 { if (try? 
getContainerStatus(name)) == "running" { samples.append(true) } else { diff --git a/Tests/CLITests/Utilities/CLITest.swift b/Tests/CLITests/Utilities/CLITest.swift index 02d4df177..b3227aa2a 100644 --- a/Tests/CLITests/Utilities/CLITest.swift +++ b/Tests/CLITests/Utilities/CLITest.swift @@ -264,6 +264,18 @@ class CLITest { } } + func doKill(name: String, signal: String = "SIGKILL") throws { + let (_, _, error, status) = try run(arguments: [ + "kill", + "-s", + signal, + name, + ]) + if status != 0 { + throw CLIError.executionFailed("command failed: \(error)") + } + } + func doCreate( name: String, image: String? = nil, From 0304d28a431799489c230d89b01f0e664defc925 Mon Sep 17 00:00:00 2001 From: Jaewon Hur Date: Thu, 26 Feb 2026 10:47:16 -0800 Subject: [PATCH 11/11] Proceed containers to restarting state --- .../Server/Containers/ContainersService.swift | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift index 437667654..f32c6eede 100644 --- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift +++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift @@ -194,13 +194,7 @@ public actor ContainersService { await stabilityMonitor?.stopTracking(id: id) let state = try self._getContainerState(id: id) - - guard state.snapshot.status == .stopped else { - throw ContainerizationError(.invalidState, message: "container not stopped: '\(id)'") - } - let options = try self.getContainerCreationOptions(id: id) - guard options.autoRemove == false else { return } @@ -218,10 +212,15 @@ public actor ContainersService { return } + try await restart(id: id) if let backOff = state.backOff { try await Task.sleep(for: backOff) } + guard (try self._getContainerState(id: id)).snapshot.status == .restarting else { + return + } + try await 
stabilityMonitor?.registerProcess( id: id, onExit: { id, code in @@ -1131,6 +1130,18 @@ public actor ContainersService { } } + private func restart(id: String) async throws { + try await self.lock.withLock { context in + var state = try await self.getContainerState(id: id, context: context) + guard state.snapshot.status == .stopped else { + throw ContainerizationError(.invalidState, message: "container not stopped: '\(id)'") + } + + state.snapshot.status = .restarting + await self.setContainerState(id, state, context: context) + } + } + private func resetBackOff(id: String) async throws { try await self.lock.withLock { context in var state = try await self.getContainerState(id: id, context: context)