diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index b498e33bce2..952e20a6c83 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "bufio" "bytes" "encoding/json" "errors" @@ -1018,6 +1019,7 @@ func (c *Container) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, } func (c *Container) Checkpoint(criuOpts *CriuOpts) error { + const logFile = "dump.log" c.m.Lock() defer c.m.Unlock() @@ -1042,6 +1044,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error { return err } + logDir := criuOpts.ImagesDirectory imageDir, err := os.Open(criuOpts.ImagesDirectory) if err != nil { return err @@ -1051,7 +1054,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error { rpcOpts := criurpc.CriuOpts{ ImagesDirFd: proto.Int32(int32(imageDir.Fd())), LogLevel: proto.Int32(4), - LogFile: proto.String("dump.log"), + LogFile: proto.String(logFile), Root: proto.String(c.config.Rootfs), ManageCgroups: proto.Bool(true), NotifyScripts: proto.Bool(true), @@ -1078,6 +1081,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error { } defer workDir.Close() rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd())) + logDir = criuOpts.WorkDirectory } c.handleCriuConfigurationFile(&rpcOpts) @@ -1220,6 +1224,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error { err = c.criuSwrk(nil, req, criuOpts, nil) if err != nil { + logCriuErrors(logDir, logFile) return err } return nil @@ -1372,6 +1377,7 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error { // Restore restores the checkpointed container to a running state using the // criu(8) utility. func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error { + const logFile = "restore.log" c.m.Lock() defer c.m.Unlock() @@ -1389,6 +1395,7 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error { if criuOpts.ImagesDirectory == "" { return errors.New("invalid directory to restore checkpoint") } + logDir := criuOpts.ImagesDirectory imageDir, err := os.Open(criuOpts.ImagesDirectory) if err != nil { return err @@ -1420,7 +1427,7 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error { ImagesDirFd: proto.Int32(int32(imageDir.Fd())), EvasiveDevices: proto.Bool(true), LogLevel: proto.Int32(4), - LogFile: proto.String("restore.log"), + LogFile: proto.String(logFile), RstSibling: proto.Bool(true), Root: proto.String(root), ManageCgroups: proto.Bool(true), @@ -1463,6 +1470,7 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error { } defer workDir.Close() req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd())) + logDir = criuOpts.WorkDirectory } c.handleCriuConfigurationFile(req.Opts) @@ -1536,6 +1544,9 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error { } } err = c.criuSwrk(process, req, criuOpts, extraFiles) + if err != nil { + logCriuErrors(logDir, logFile) + } // Now that CRIU is done let's close all opened FDs CRIU needed. for _, fd := range extraFiles { @@ -1545,6 +1556,59 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error { return err } +// logCriuErrors tries to find and log errors from a criu log file. +// The output is similar to what "grep -B5 Error" does. +func logCriuErrors(dir, file string) { + lookFor := []byte("Error") // Print the line that contains this... + const before = 5 // ... and a few preceding lines. + + logFile := filepath.Join(dir, file) + f, err := os.Open(logFile) + if err != nil { + logrus.Warn(err) + return + } + defer f.Close() + + s := bufio.NewScanner(f) + var lines [][]byte + + lineNo := 0 + lastLine := -1 + didHeader := false + for s.Scan() { + lineNo++ + line := s.Bytes() + lines = append(lines, line) + if len(lines)-1 > before { + // Remove the oldest line. + lines = lines[1:] + } + if !bytes.Contains(line, lookFor) { + continue + } + // Found an error, print lines. + if !didHeader { + logrus.Warnf("--- Quoting %q", logFile) + didHeader = true + } else if lineNo-before-1 > lastLine { + logrus.Warn("...") + } + for _, line := range lines { + logrus.Warnf("%s", line) + } + lastLine = lineNo + // Clear lines so we won't print them again. + lines = nil + } + if didHeader { + logrus.Warn("---") + } + if err := s.Err(); err != nil { + logrus.Warn(err) + } +} + func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { // need to apply cgroups only on restore if req.GetType() != criurpc.CriuReqType_RESTORE { diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats index 0a8e58a2c9c..2d6a42e0deb 100644 --- a/tests/integration/checkpoint.bats +++ b/tests/integration/checkpoint.bats @@ -89,8 +89,6 @@ function runc_restore_with_pipes() { echo "__runc restore $name failed (status: $ret)" exec {err_w}>&- cat <&${err_r} - echo "CRIU log errors (if any):" - grep -B 5 Error "$workdir"/*.log ./image-dir/*.log || true fail "runc restore failed" fi @@ -110,7 +108,6 @@ function simple_cr() { for _ in $(seq 2); do # checkpoint the running container runc "$@" checkpoint --work-path ./work-dir test_busybox - grep -B 5 Error ./work-dir/dump.log || true [ "$status" -eq 0 ] # after checkpoint busybox is no longer running @@ -118,7 +115,6 @@ function simple_cr() { # restore from checkpoint runc "$@" restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox - grep -B 5 Error ./work-dir/restore.log || true [ "$status" -eq 0 ] # busybox should be back up and running @@ -188,7 +184,6 @@ function simple_cr() { mkdir image-dir mkdir work-dir runc checkpoint --parent-path ../parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox - grep -B 5 Error ./work-dir/dump.log || true [ "$status" -eq 0 ] # check parent path is valid @@ -241,7 +236,6 @@ function simple_cr() { # shellcheck disable=SC2116,SC2086 out=$(echo $out) # rm newlines # show errors if there are any before we fail - grep -B5 Error ./work-dir/dump.log || true # expecting \0 which od prints as [ "$out" = "0000000 000000 0000001" ] @@ -302,7 +296,6 @@ function simple_cr() { # checkpoint the running container; this automatically tells CRIU to # handle the network namespace defined in config.json as an external runc checkpoint --work-path ./work-dir test_busybox - grep -B 5 Error ./work-dir/dump.log || true [ "$status" -eq 0 ] # after checkpoint busybox is no longer running @@ -310,7 +303,6 @@ function simple_cr() { # restore from checkpoint; this should restore the container into the existing network namespace runc restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox - grep -B 5 Error ./work-dir/restore.log || true [ "$status" -eq 0 ] # busybox should be back up and running @@ -353,7 +345,6 @@ function simple_cr() { # checkpoint the running container runc checkpoint --work-path ./work-dir test_busybox - grep -B 5 Error ./work-dir/dump.log || true [ "$status" -eq 0 ] ! test -f ./work-dir/"$tmplog1" test -f ./work-dir/"$tmplog2" @@ -364,7 +355,6 @@ function simple_cr() { test -f ./work-dir/"$tmplog2" && unlink ./work-dir/"$tmplog2" # restore from checkpoint runc restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox - grep -B 5 Error ./work-dir/restore.log || true [ "$status" -eq 0 ] ! test -f ./work-dir/"$tmplog1" test -f ./work-dir/"$tmplog2" @@ -398,7 +388,6 @@ function simple_cr() { # checkpoint the running container runc checkpoint --work-path ./work-dir test_busybox - grep -B 5 Error ./work-dir/dump.log || true [ "$status" -eq 0 ] # after checkpoint busybox is no longer running @@ -410,7 +399,6 @@ function simple_cr() { # restore from checkpoint runc restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox - grep -B 5 Error ./work-dir/restore.log || true [ "$status" -eq 0 ] # busybox should be back up and running @@ -430,7 +418,6 @@ function simple_cr() { test -d "$orig_path" runc checkpoint --work-path ./work-dir --manage-cgroups-mode ignore test_busybox - grep -B 5 Error ./work-dir/dump.log || true [ "$status" -eq 0 ] testcontainer test_busybox checkpointed # Check that the cgroup is gone. @@ -440,7 +427,6 @@ function simple_cr() { set_cgroups_path # Changes the path. runc restore -d --manage-cgroups-mode ignore --pid-file pid \ --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox - grep -B 5 Error ./work-dir/restore.log || true [ "$status" -eq 0 ] testcontainer test_busybox running