Skip to content

Commit

Permalink
criu checkpoint/restore: print errors from criu log
Browse files Browse the repository at this point in the history
When criu fails, it does not give us much context to understand what
was the cause of an error -- for that, we need to take a look into its
log file.

This is somewhat complicated to do (as you can see in parts of
checkpoint.bats removed by this commit), and not very convenient.

Add a function to find and log errors from criu logs, in case either
checkpoint or restore has failed.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
  • Loading branch information
kolyshkin committed Apr 7, 2023
1 parent 941e592 commit 840df2a
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 16 deletions.
68 changes: 66 additions & 2 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package libcontainer

import (
"bufio"
"bytes"
"encoding/json"
"errors"
Expand Down Expand Up @@ -1018,6 +1019,7 @@ func (c *Container) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts,
}

func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
const logFile = "dump.log"
c.m.Lock()
defer c.m.Unlock()

Expand All @@ -1042,6 +1044,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
return err
}

logDir := criuOpts.ImagesDirectory
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
Expand All @@ -1051,7 +1054,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
rpcOpts := criurpc.CriuOpts{
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
LogLevel: proto.Int32(4),
LogFile: proto.String("dump.log"),
LogFile: proto.String(logFile),
Root: proto.String(c.config.Rootfs),
ManageCgroups: proto.Bool(true),
NotifyScripts: proto.Bool(true),
Expand All @@ -1078,6 +1081,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
}
defer workDir.Close()
rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
logDir = criuOpts.WorkDirectory
}

c.handleCriuConfigurationFile(&rpcOpts)
Expand Down Expand Up @@ -1220,6 +1224,7 @@ func (c *Container) Checkpoint(criuOpts *CriuOpts) error {

err = c.criuSwrk(nil, req, criuOpts, nil)
if err != nil {
logCriuErrors(logDir, logFile)
return err
}
return nil
Expand Down Expand Up @@ -1372,6 +1377,7 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
// Restore restores the checkpointed container to a running state using the
// criu(8) utility.
func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
const logFile = "restore.log"
c.m.Lock()
defer c.m.Unlock()

Expand All @@ -1389,6 +1395,7 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
if criuOpts.ImagesDirectory == "" {
return errors.New("invalid directory to restore checkpoint")
}
logDir := criuOpts.ImagesDirectory
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
Expand Down Expand Up @@ -1420,7 +1427,7 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
EvasiveDevices: proto.Bool(true),
LogLevel: proto.Int32(4),
LogFile: proto.String("restore.log"),
LogFile: proto.String(logFile),
RstSibling: proto.Bool(true),
Root: proto.String(root),
ManageCgroups: proto.Bool(true),
Expand Down Expand Up @@ -1463,6 +1470,7 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
}
defer workDir.Close()
req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
logDir = criuOpts.WorkDirectory
}
c.handleCriuConfigurationFile(req.Opts)

Expand Down Expand Up @@ -1536,6 +1544,9 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
}
}
err = c.criuSwrk(process, req, criuOpts, extraFiles)
if err != nil {
logCriuErrors(logDir, logFile)
}

// Now that CRIU is done let's close all opened FDs CRIU needed.
for _, fd := range extraFiles {
Expand All @@ -1545,6 +1556,59 @@ func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
return err
}

// logCriuErrors tries to find and log errors from a criu log file.
// The output is similar to what "grep -B5 Error" does.
func logCriuErrors(dir, file string) {
lookFor := []byte("Error") // Print the line that contains this...
const before = 5 // ... and a few preceding lines.

logFile := filepath.Join(dir, file)
f, err := os.Open(logFile)
if err != nil {
logrus.Warn(err)
return
}
defer f.Close()

s := bufio.NewScanner(f)
var lines [][]byte

lineNo := 0
lastLine := -1
didHeader := false
for s.Scan() {
lineNo++
line := s.Bytes()
lines = append(lines, line)
if len(lines)-1 > before {
// Remove the oldest line.
lines = lines[1:]
}
if !bytes.Contains(line, lookFor) {
continue
}
// Found an error, print lines.
if !didHeader {
logrus.Warnf("--- Quoting %q", logFile)
didHeader = true
} else if lineNo-before-1 > lastLine {
logrus.Warn("...")
}
for _, line := range lines {
logrus.Warnf("%s", line)
}
lastLine = lineNo
// Clear lines so we won't print them again.
lines = nil
}
if didHeader {
logrus.Warn("---")
}
if err := s.Err(); err != nil {
logrus.Warn(err)
}
}

func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
// need to apply cgroups only on restore
if req.GetType() != criurpc.CriuReqType_RESTORE {
Expand Down
14 changes: 0 additions & 14 deletions tests/integration/checkpoint.bats
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,6 @@ function runc_restore_with_pipes() {
echo "__runc restore $name failed (status: $ret)"
exec {err_w}>&-
cat <&${err_r}
echo "CRIU log errors (if any):"
grep -B 5 Error "$workdir"/*.log ./image-dir/*.log || true
fail "runc restore failed"
fi

Expand All @@ -110,15 +108,13 @@ function simple_cr() {
for _ in $(seq 2); do
# checkpoint the running container
runc "$@" checkpoint --work-path ./work-dir test_busybox
grep -B 5 Error ./work-dir/dump.log || true
[ "$status" -eq 0 ]

# after checkpoint busybox is no longer running
testcontainer test_busybox checkpointed

# restore from checkpoint
runc "$@" restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox
grep -B 5 Error ./work-dir/restore.log || true
[ "$status" -eq 0 ]

# busybox should be back up and running
Expand Down Expand Up @@ -188,7 +184,6 @@ function simple_cr() {
mkdir image-dir
mkdir work-dir
runc checkpoint --parent-path ../parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox
grep -B 5 Error ./work-dir/dump.log || true
[ "$status" -eq 0 ]

# check parent path is valid
Expand Down Expand Up @@ -241,7 +236,6 @@ function simple_cr() {
# shellcheck disable=SC2116,SC2086
out=$(echo $out) # rm newlines
# show errors if there are any before we fail
grep -B5 Error ./work-dir/dump.log || true
# expecting \0 which od prints as
[ "$out" = "0000000 000000 0000001" ]

Expand Down Expand Up @@ -302,15 +296,13 @@ function simple_cr() {
# checkpoint the running container; this automatically tells CRIU to
# handle the network namespace defined in config.json as an external
runc checkpoint --work-path ./work-dir test_busybox
grep -B 5 Error ./work-dir/dump.log || true
[ "$status" -eq 0 ]

# after checkpoint busybox is no longer running
testcontainer test_busybox checkpointed

# restore from checkpoint; this should restore the container into the existing network namespace
runc restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox
grep -B 5 Error ./work-dir/restore.log || true
[ "$status" -eq 0 ]

# busybox should be back up and running
Expand Down Expand Up @@ -353,7 +345,6 @@ function simple_cr() {

# checkpoint the running container
runc checkpoint --work-path ./work-dir test_busybox
grep -B 5 Error ./work-dir/dump.log || true
[ "$status" -eq 0 ]
! test -f ./work-dir/"$tmplog1"
test -f ./work-dir/"$tmplog2"
Expand All @@ -364,7 +355,6 @@ function simple_cr() {
test -f ./work-dir/"$tmplog2" && unlink ./work-dir/"$tmplog2"
# restore from checkpoint
runc restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox
grep -B 5 Error ./work-dir/restore.log || true
[ "$status" -eq 0 ]
! test -f ./work-dir/"$tmplog1"
test -f ./work-dir/"$tmplog2"
Expand Down Expand Up @@ -398,7 +388,6 @@ function simple_cr() {

# checkpoint the running container
runc checkpoint --work-path ./work-dir test_busybox
grep -B 5 Error ./work-dir/dump.log || true
[ "$status" -eq 0 ]

# after checkpoint busybox is no longer running
Expand All @@ -410,7 +399,6 @@ function simple_cr() {

# restore from checkpoint
runc restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox
grep -B 5 Error ./work-dir/restore.log || true
[ "$status" -eq 0 ]

# busybox should be back up and running
Expand All @@ -430,7 +418,6 @@ function simple_cr() {
test -d "$orig_path"

runc checkpoint --work-path ./work-dir --manage-cgroups-mode ignore test_busybox
grep -B 5 Error ./work-dir/dump.log || true
[ "$status" -eq 0 ]
testcontainer test_busybox checkpointed
# Check that the cgroup is gone.
Expand All @@ -440,7 +427,6 @@ function simple_cr() {
set_cgroups_path # Changes the path.
runc restore -d --manage-cgroups-mode ignore --pid-file pid \
--work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox
grep -B 5 Error ./work-dir/restore.log || true
[ "$status" -eq 0 ]
testcontainer test_busybox running

Expand Down

0 comments on commit 840df2a

Please sign in to comment.