From 7e6a18c2472b036eefbb8be6301447391722cc55 Mon Sep 17 00:00:00 2001 From: Laura Brehm Date: Tue, 27 Aug 2024 23:48:54 +0100 Subject: [PATCH 1/3] runc-shim: remove misleading comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's not true that `s.mu` needs to be held when calling `handleProcessExit`, and indeed hasn't been the case for a while – see 892dc54bd26f6e8b9aea4672208b1ba2158d8a1b. Signed-off-by: Laura Brehm (cherry picked from commit 7f3bf993d64c3b91b5d1ab00b48d0ccbe5666714) Signed-off-by: Laura Brehm --- runtime/v2/runc/task/service.go | 1 - 1 file changed, 1 deletion(-) diff --git a/runtime/v2/runc/task/service.go b/runtime/v2/runc/task/service.go index aaa2c85d8cd6..36d3ce1b3c8d 100644 --- a/runtime/v2/runc/task/service.go +++ b/runtime/v2/runc/task/service.go @@ -705,7 +705,6 @@ func (s *service) send(evt interface{}) { s.events <- evt } -// s.mu must be locked when calling handleProcessExit func (s *service) handleProcessExit(e runcC.Exit, c *runc.Container, p process.Process) { if ip, ok := p.(*process.Init); ok { // Ensure all children are killed From 15ad6ac67216a04fcf5c996e0e10b79d9a3bff6d Mon Sep 17 00:00:00 2001 From: Cory Snider Date: Fri, 23 Aug 2024 13:30:15 -0400 Subject: [PATCH 2/3] runc-shim: refuse to start execs after init exits The runc task state machine prevents execs from being created after the init process has exited, but there are no guards against starting a created exec after the init process has exited. That leaves a small window for starting an exec to race our handling of the init process exiting. Normally this is not an issue in practice: the kernel will atomically kill all processes in a PID namespace when its "init" process terminates, and will not allow new processes to fork(2) into the PID namespace afterwards. Therefore the racing exec is guaranteed by the kernel to not be running after the init process terminates. On the other hand, when the container does not have a private PID namespace (i.e. the container's init process is not the "init" process of the container's PID namespace), the kernel does not automatically kill other container processes on init exit and will happily allow runc to start an exec process at any time. It is the runc shim's responsibility to clean up the container when the init process exits in this situation by killing all the container's remaining processes. Block execs from being started after the container's init process has exited to prevent the processes from leaking, and to avoid violating the task service's assumption that an exec can be running iff the init process is also running. Signed-off-by: Cory Snider (cherry picked from commit e7357916bbbd6574d90d5717ae3c68ccfd9aa689) Signed-off-by: Laura Brehm --- runtime/v2/runc/task/service.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/runtime/v2/runc/task/service.go b/runtime/v2/runc/task/service.go index 36d3ce1b3c8d..7e4c53680850 100644 --- a/runtime/v2/runc/task/service.go +++ b/runtime/v2/runc/task/service.go @@ -80,6 +80,7 @@ func NewTaskService(ctx context.Context, publisher shim.Publisher, sd shutdown.S containers: make(map[string]*runc.Container), running: make(map[int][]containerProcess), pendingExecs: make(map[*runc.Container]int), + execable: make(map[*runc.Container]bool), exitSubscribers: make(map[*map[int][]runcC.Exit]struct{}), } go s.processExits() @@ -116,6 +117,12 @@ type service struct { lifecycleMu sync.Mutex running map[int][]containerProcess // pid -> running process, guarded by lifecycleMu pendingExecs map[*runc.Container]int // container -> num pending execs, guarded by lifecycleMu + // container -> execs can be started, guarded by lifecycleMu. + // Execs can be started if the container's init process (read: pid, not [process.Init]) + // has been started and not yet reaped by the shim. + // Note that this flag gets updated before the container's [process.Init.Status] + // is transitioned to "stopped". + execable map[*runc.Container]bool // Subscriptions to exits for PIDs. Adding/deleting subscriptions and // dereferencing the subscription pointers must only be done while holding // lifecycleMu. @@ -229,6 +236,9 @@ func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Containe Container: c, Process: p, }) + if init { + s.execable[c] = true + } s.lifecycleMu.Unlock() } } @@ -303,6 +313,10 @@ func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI. if r.ExecID == "" { cinit = container } else { + if !s.execable[container] { + s.lifecycleMu.Unlock() + return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container %s init process is not running", container.ID) + } s.pendingExecs[container]++ } handleStarted, cleanup := s.preStart(cinit) @@ -678,6 +692,9 @@ func (s *service) processExits() { var cps, skipped []containerProcess for _, cp := range s.running[e.Pid] { _, init := cp.Process.(*process.Init) + if init { + delete(s.execable, cp.Container) + } if init && s.pendingExecs[cp.Container] != 0 { // This exit relates to a container for which we have pending execs. In // order to ensure order between execs and the init process for a given From c9617c321f016fef7678f5333075542e6e565cbb Mon Sep 17 00:00:00 2001 From: Laura Brehm Date: Tue, 27 Aug 2024 15:02:58 +0100 Subject: [PATCH 3/3] runc-shim: handle pending execs as running This commit rewrites and simplifies a lot of this logic to reduce it's complexity, and also handle the case where the container doesn't have it's own pid-namespace, which means that we're not guaranteed to receive the init exit last. This is achieved by replacing `s.pendingExecs` with `s.runningExecs`, for which both (previously) pending and de facto running execs are considered. The new exit handling logic can be summed up by: - when we receive an init exit, stash it it in `s.containerInitExit`, and if a container's init process has exited, refuse new execs. - (if the container does not have it's own pidns) kill all running processes (if the container has a private pid-namespace, then all processes will be dead already). - wait for the container's running exec count (which includes execs which have been started but might still early exit) to get to 0. - publish the stashed away init exit. Signed-off-by: Laura Brehm (cherry picked from commit 421a4b568ce8ac8f30c8b6b44e785a4063128482) Signed-off-by: Laura Brehm --- runtime/v2/runc/task/service.go | 192 +++++++++++++++++--------------- 1 file changed, 104 insertions(+), 88 deletions(-) diff --git a/runtime/v2/runc/task/service.go b/runtime/v2/runc/task/service.go index 7e4c53680850..1910201d85fa 100644 --- a/runtime/v2/runc/task/service.go +++ b/runtime/v2/runc/task/service.go @@ -72,16 +72,17 @@ func NewTaskService(ctx context.Context, publisher shim.Publisher, sd shutdown.S } go ep.Run(ctx) s := &service{ - context: ctx, - events: make(chan interface{}, 128), - ec: reaper.Default.Subscribe(), - ep: ep, - shutdown: sd, - containers: make(map[string]*runc.Container), - running: make(map[int][]containerProcess), - pendingExecs: make(map[*runc.Container]int), - execable: make(map[*runc.Container]bool), - exitSubscribers: make(map[*map[int][]runcC.Exit]struct{}), + context: ctx, + events: make(chan interface{}, 128), + ec: reaper.Default.Subscribe(), + ep: ep, + shutdown: sd, + containers: make(map[string]*runc.Container), + running: make(map[int][]containerProcess), + runningExecs: make(map[*runc.Container]int), + execCountSubscribers: make(map[*runc.Container]chan<- int), + containerInitExit: make(map[*runc.Container]runcC.Exit), + exitSubscribers: make(map[*map[int][]runcC.Exit]struct{}), } go s.processExits() runcC.Monitor = reaper.Default @@ -116,13 +117,19 @@ type service struct { lifecycleMu sync.Mutex running map[int][]containerProcess // pid -> running process, guarded by lifecycleMu - pendingExecs map[*runc.Container]int // container -> num pending execs, guarded by lifecycleMu - // container -> execs can be started, guarded by lifecycleMu. - // Execs can be started if the container's init process (read: pid, not [process.Init]) - // has been started and not yet reaped by the shim. + runningExecs map[*runc.Container]int // container -> num running execs, guarded by lifecycleMu + // container -> subscription to exec exits/changes to s.runningExecs[container], + // guarded by lifecycleMu + execCountSubscribers map[*runc.Container]chan<- int + // container -> init exits, guarded by lifecycleMu + // Used to stash container init process exits, so that we can hold them + // until after we've made sure to publish all the container's exec exits. + // Also used to prevent starting new execs from being started if the + // container's init process (read: pid, not [process.Init]) has already been + // reaped by the shim. // Note that this flag gets updated before the container's [process.Init.Status] // is transitioned to "stopped". - execable map[*runc.Container]bool + containerInitExit map[*runc.Container]runcC.Exit // Subscriptions to exits for PIDs. Adding/deleting subscriptions and // dereferencing the subscription pointers must only be done while holding // lifecycleMu. @@ -143,8 +150,7 @@ type containerProcess struct { // // The returned handleStarted closure records that the process has started so // that its exit can be handled efficiently. If the process has already exited, -// it handles the exit immediately. In addition, if the process is an exec and -// its container's init process has already exited, that exit is also processed. +// it handles the exit immediately. // handleStarted should be called after the event announcing the start of the // process has been published. Note that s.lifecycleMu must not be held when // calling handleStarted. @@ -179,44 +185,8 @@ func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Containe pid = p.Pid() } - _, init := p.(*process.Init) s.lifecycleMu.Lock() - var initExits []runcC.Exit - var initCps []containerProcess - if !init { - s.pendingExecs[c]-- - - initPid := c.Pid() - iExits, initExited := exits[initPid] - if initExited && s.pendingExecs[c] == 0 { - // c's init process has exited before handleStarted was called and - // this is the last pending exec process start - we need to process - // the exit for the init process after processing this exec, so: - // - delete c from the s.pendingExecs map - // - keep the exits for the init pid to process later (after we process - // this exec's exits) - // - get the necessary containerProcesses for the init process (that we - // need to process the exits), and remove them from s.running (which we skipped - // doing in processExits). - delete(s.pendingExecs, c) - initExits = iExits - var skipped []containerProcess - for _, initPidCp := range s.running[initPid] { - if initPidCp.Container == c { - initCps = append(initCps, initPidCp) - } else { - skipped = append(skipped, initPidCp) - } - } - if len(skipped) == 0 { - delete(s.running, initPid) - } else { - s.running[initPid] = skipped - } - } - } - ees, exited := exits[pid] delete(s.exitSubscribers, &exits) exits = nil @@ -225,20 +195,12 @@ func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Containe for _, ee := range ees { s.handleProcessExit(ee, c, p) } - for _, eee := range initExits { - for _, cp := range initCps { - s.handleProcessExit(eee, cp.Container, cp.Process) - } - } } else { // Process start was successful, add to `s.running`. s.running[pid] = append(s.running[pid], containerProcess{ Container: c, Process: p, }) - if init { - s.execable[c] = true - } s.lifecycleMu.Unlock() } } @@ -313,11 +275,11 @@ func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI. if r.ExecID == "" { cinit = container } else { - if !s.execable[container] { + if _, initExited := s.containerInitExit[container]; initExited { s.lifecycleMu.Unlock() return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container %s init process is not running", container.ID) } - s.pendingExecs[container]++ + s.runningExecs[container]++ } handleStarted, cleanup := s.preStart(cinit) s.lifecycleMu.Unlock() @@ -325,6 +287,17 @@ func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI. p, err := container.Start(ctx, r) if err != nil { + // If we failed to even start the process, s.runningExecs + // won't get decremented in s.handleProcessExit. We still need + // to update it. + if r.ExecID != "" { + s.lifecycleMu.Lock() + s.runningExecs[container]-- + if ch, ok := s.execCountSubscribers[container]; ok { + ch <- s.runningExecs[container] + } + s.lifecycleMu.Unlock() + } handleStarted(container, p) return nil, errdefs.ToGRPC(err) } @@ -689,31 +662,23 @@ func (s *service) processExits() { // Handle the exit for a created/started process. If there's more than // one, assume they've all exited. One of them will be the correct // process. - var cps, skipped []containerProcess + var cps []containerProcess for _, cp := range s.running[e.Pid] { _, init := cp.Process.(*process.Init) if init { - delete(s.execable, cp.Container) - } - if init && s.pendingExecs[cp.Container] != 0 { - // This exit relates to a container for which we have pending execs. In - // order to ensure order between execs and the init process for a given - // container, skip processing this exit here and let the `handleStarted` - // closure for the pending exec publish it. - skipped = append(skipped, cp) - } else { - cps = append(cps, cp) + s.containerInitExit[cp.Container] = e } + cps = append(cps, cp) } - if len(skipped) > 0 { - s.running[e.Pid] = skipped - } else { - delete(s.running, e.Pid) - } + delete(s.running, e.Pid) s.lifecycleMu.Unlock() for _, cp := range cps { - s.handleProcessExit(e, cp.Container, cp.Process) + if ip, ok := cp.Process.(*process.Init); ok { + s.handleInitExit(e, cp.Container, ip) + } else { + s.handleProcessExit(e, cp.Container, cp.Process) + } } } } @@ -722,17 +687,60 @@ func (s *service) send(evt interface{}) { s.events <- evt } -func (s *service) handleProcessExit(e runcC.Exit, c *runc.Container, p process.Process) { - if ip, ok := p.(*process.Init); ok { - // Ensure all children are killed - if runc.ShouldKillAllOnExit(s.context, c.Bundle) { - if err := ip.KillAll(s.context); err != nil { - logrus.WithError(err).WithField("id", ip.ID()). - Error("failed to kill init's children") - } +// handleInitExit processes container init process exits. +// This is handled separately from non-init exits, because there +// are some extra invariants we want to ensure in this case, namely: +// - for a given container, the init process exit MUST be the last exit published +// This is achieved by: +// - killing all running container processes (if the container has a shared pid +// namespace, otherwise all other processes have been reaped already). +// - waiting for the container's running exec counter to reach 0. +// - finally, publishing the init exit. +func (s *service) handleInitExit(e runcC.Exit, c *runc.Container, p *process.Init) { + // kill all running container processes + if runc.ShouldKillAllOnExit(s.context, c.Bundle) { + if err := p.KillAll(s.context); err != nil { + logrus.WithError(err).WithField("id", p.ID()). + Error("failed to kill init's children") } } + s.lifecycleMu.Lock() + numRunningExecs := s.runningExecs[c] + if numRunningExecs == 0 { + delete(s.runningExecs, c) + s.lifecycleMu.Unlock() + s.handleProcessExit(e, c, p) + return + } + + events := make(chan int, numRunningExecs) + s.execCountSubscribers[c] = events + + s.lifecycleMu.Unlock() + + go func() { + defer func() { + s.lifecycleMu.Lock() + defer s.lifecycleMu.Unlock() + delete(s.execCountSubscribers, c) + delete(s.runningExecs, c) + }() + + // wait for running processes to exit + for { + if runningExecs := <-events; runningExecs == 0 { + break + } + } + + // all running processes have exited now, and no new + // ones can start, so we can publish the init exit + s.handleProcessExit(e, c, p) + }() +} + +func (s *service) handleProcessExit(e runcC.Exit, c *runc.Container, p process.Process) { p.SetExited(e.Status) s.send(&eventstypes.TaskExit{ ContainerID: c.ID, @@ -741,6 +749,14 @@ func (s *service) handleProcessExit(e runcC.Exit, c *runc.Container, p process.P ExitStatus: uint32(e.Status), ExitedAt: p.ExitedAt(), }) + if _, init := p.(*process.Init); !init { + s.lifecycleMu.Lock() + s.runningExecs[c]-- + if ch, ok := s.execCountSubscribers[c]; ok { + ch <- s.runningExecs[c] + } + s.lifecycleMu.Unlock() + } } func (s *service) getContainerPids(ctx context.Context, container *runc.Container) ([]uint32, error) {