diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index ad904641723..dee8d121d4c 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -479,7 +479,7 @@ func (c *Container) deleteExecFifo() { // container cannot access the statedir (and the FIFO itself remains // un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited // fd, with _LIBCONTAINER_FIFOFD set to its fd number. -func (c *Container) includeExecFifo(cmd *exec.Cmd) error { +func (c *Container) includeExecFifo() error { fifoName := filepath.Join(c.stateDir, execFifoFilename) fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) if err != nil { @@ -487,44 +487,22 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error { } c.fifo = fifo - cmd.ExtraFiles = append(cmd.ExtraFiles, fifo) - cmd.Env = append(cmd.Env, - "_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) return nil } -func (c *Container) newParentProcess(p *Process) (parentProcess, error) { - comm, err := newProcessComm() - if err != nil { - return nil, err - } - - // Make sure we use a new safe copy of /proc/self/exe binary each time, this - // is called to make sure that if a container manages to overwrite the file, - // it cannot affect other containers on the system. For runc, this code will - // only ever be called once, but libcontainer users might call this more than - // once. - p.closeClonedExes() +// After https://go-review.googlesource.com/c/go/+/728642 will be merged, we need to recreate +// cmd Object to retry it. +func (c *Container) createCmdObject(p *Process, comm *processComm, cgroupFD *os.File) *exec.Cmd { var ( exePath string safeExe *os.File ) - if exeseal.IsSelfExeCloned() { - // /proc/self/exe is already a cloned binary -- no need to do anything - logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!") - // We don't need to use /proc/thread-self here because the exe mm of a - // thread-group is guaranteed to be the same for all threads by - // definition. This lets us avoid having to do runtime.LockOSThread. - exePath = "/proc/self/exe" - } else { - var err error - safeExe, err = exeseal.CloneSelfExe(c.stateDir) - if err != nil { - return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err) - } + + if len(p.clonedExes) > 0 { + safeExe = p.clonedExes[0] exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd())) - p.clonedExes = append(p.clonedExes, safeExe) - logrus.Debug("runc exeseal: using /proc/self/exe clone") // used for tests + } else { + exePath = "/proc/self/exe" } cmd := exec.Command(exePath, "init") @@ -602,22 +580,70 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) } + if c.fifo != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, c.fifo) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) + } + + if p.Init { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + } else { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) + } + + if cgroupFD != nil { + cmd.SysProcAttr.UseCgroupFD = true + cmd.SysProcAttr.CgroupFD = int(cgroupFD.Fd()) + } + + return cmd +} + +func (c *Container) newParentProcess(p *Process) (parentProcess, error) { + comm, err := newProcessComm() + if err != nil { + return nil, err + } + + // Make sure we use a new safe copy of /proc/self/exe binary each time, this + // is called to make sure that if a container manages to overwrite the file, + // it cannot affect other containers on the system. For runc, this code will + // only ever be called once, but libcontainer users might call this more than + // once. + p.closeClonedExes() + var safeExe *os.File + if exeseal.IsSelfExeCloned() { + // /proc/self/exe is already a cloned binary -- no need to do anything + logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!") + // We don't need to use /proc/thread-self here because the exe mm of a + // thread-group is guaranteed to be the same for all threads by + // definition. This lets us avoid having to do runtime.LockOSThread. + } else { + var err error + safeExe, err = exeseal.CloneSelfExe(c.stateDir) + if err != nil { + return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err) + } + p.clonedExes = append(p.clonedExes, safeExe) + logrus.Debug("runc exeseal: using /proc/self/exe clone") // used for tests + } + if p.Init { // We only set up fifoFd if we're not doing a `runc exec`. The historic // reason for this is that previously we would pass a dirfd that allowed // for container rootfs escape (and not doing it in `runc exec` avoided // that problem), but we no longer do that. However, there's no need to do // this for `runc exec` so we just keep it this way to be safe. - if err := c.includeExecFifo(cmd); err != nil { + if err := c.includeExecFifo(); err != nil { return nil, fmt.Errorf("unable to setup exec fifo: %w", err) } - return c.newInitProcess(p, cmd, comm) + return c.newInitProcess(p, comm) } - return c.newSetnsProcess(p, cmd, comm) + return c.newSetnsProcess(p, comm) } -func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) { - cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) +func (c *Container) newInitProcess(p *Process, comm *processComm) (*initProcess, error) { nsMaps := make(map[configs.NamespaceType]string) for _, ns := range c.config.Namespaces { if ns.Path != "" { @@ -629,6 +655,8 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) return nil, err } + cmd := c.createCmdObject(p, comm, nil) + init := &initProcess{ containerProcess: containerProcess{ cmd: cmd, @@ -645,8 +673,7 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) return init, nil } -func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) { - cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) +func (c *Container) newSetnsProcess(p *Process, comm *processComm) (*setnsProcess, error) { state := c.currentState() // for setns process, we don't have to set cloneflags as the process namespaces // will only be set via setns syscall @@ -656,7 +683,6 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm } proc := &setnsProcess{ containerProcess: containerProcess{ - cmd: cmd, comm: comm, manager: c.cgroupManager, config: c.newInitConfig(p), diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 2333dc5470b..bb9339f0853 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -13,7 +13,9 @@ import ( "strings" "syscall" "testing" + "time" + securejoin "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/systemd" "github.com/opencontainers/runc/internal/linux" @@ -21,6 +23,7 @@ import ( "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/internal/userns" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" @@ -230,6 +233,124 @@ func TestEnter(t *testing.T) { } } +const stateFilename = "state.json" + +// We need to dump changed state into state file. +func saveState(stateDir string, s *libcontainer.State) (retErr error) { + tmpFile, err := os.CreateTemp(stateDir, "state-") + if err != nil { + return err + } + + defer func() { + if retErr != nil { + tmpFile.Close() + os.Remove(tmpFile.Name()) + } + }() + + err = utils.WriteJSON(tmpFile, s) + if err != nil { + return err + } + err = tmpFile.Close() + if err != nil { + return err + } + + stateFilePath := filepath.Join(stateDir, stateFilename) + return os.Rename(tmpFile.Name(), stateFilePath) +} + +func TestCmdRetry(t *testing.T) { + if testing.Short() { + return + } + + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("CLONE_INTO_CGROUP works only with cgroup v2") + } + + // Create dir to "pseudo" cgroup path + pseudoPath := t.TempDir() + + config := newTemplateConfig(t, nil) + + name := strings.ReplaceAll(t.Name(), "/", "_") + strconv.FormatInt(-int64(time.Now().Nanosecond()), 35) + root := t.TempDir() + + // Container State Dir + stateDir, err := securejoin.SecureJoin(root, name) + ok(t, err) + + // Create Container + container, err := libcontainer.Create(root, name, config) + + ok(t, err) + defer destroyContainer(container) + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + var stdout, stdout2 bytes.Buffer + + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Stdout: &stdout, + Init: true, + } + + err = container.Run(&pconfig) + _ = stdinR.Close() + defer stdinW.Close() + ok(t, err) + + state, err := container.State() + ok(t, err) + + // Dump our "pseudo" cgroup path as dirPath + state.CgroupPaths[""] = pseudoPath + err = saveState(stateDir, state) + ok(t, err) + + // Load container from dumped state + container2, err := libcontainer.Load(root, name) + ok(t, err) + + // Execute another process in the container + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + pconfig2 := libcontainer.Process{ + Cwd: "/", + Env: standardEnvironment, + } + pconfig2.Args = []string{"sh", "-c", "cat"} + pconfig2.Stdin = stdinR2 + pconfig2.Stdout = &stdout2 + + // Turn on TestMode in cgroups library to not check path mode and create all necessary files. + cgroups.TestMode = true + defer func() { + cgroups.TestMode = false + }() + + err = container2.Run(&pconfig2) + _ = stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // Wait processes + _ = stdinW2.Close() + waitProcess(&pconfig2, t) + + _ = stdinW.Close() + waitProcess(&pconfig, t) +} + func TestProcessEnv(t *testing.T) { if testing.Short() { return diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 51c7b0d877d..e2ce3f4964e 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -15,7 +15,6 @@ import ( "runtime" "strconv" "strings" - "syscall" "time" "github.com/opencontainers/runtime-spec/specs-go" @@ -357,12 +356,6 @@ func (p *setnsProcess) prepareCgroupFD() (*os.File, error) { } logrus.Debugf("using CLONE_INTO_CGROUP %q", cgroup) - if p.cmd.SysProcAttr == nil { - p.cmd.SysProcAttr = &syscall.SysProcAttr{} - } - p.cmd.SysProcAttr.UseCgroupFD = true - p.cmd.SysProcAttr.CgroupFD = int(fd.Fd()) - return fd, nil } @@ -380,11 +373,12 @@ func (p *setnsProcess) startWithCgroupFD() error { defer fd.Close() } + p.cmd = p.containerProcess.container.createCmdObject(p.process, p.comm, fd) err = p.startWithCPUAffinity() if err != nil && p.cmd.SysProcAttr.UseCgroupFD { + // We need to recreate the exec.Cmd object logrus.Debugf("exec with CLONE_INTO_CGROUP failed: %v; retrying without", err) - // SysProcAttr.CgroupFD is never used when UseCgroupFD is unset. - p.cmd.SysProcAttr.UseCgroupFD = false + p.cmd = p.containerProcess.container.createCmdObject(p.process, p.comm, nil) err = p.startWithCPUAffinity() }