safeJoin previously did textual cleaning + dest-prefix check only. That's enough to catch `../escape`, but not the symlink-ancestor attack: a malicious OCI layer plants `etc -> /tmp/probe`, a later layer writes/deletes/hardlinks against `etc/anything`, and the kernel silently dereferences the symlink so the operation lands at `/tmp/probe/anything` on the host. The daemon runs flatten as the owner UID, so anywhere that UID can write becomes a write target; anywhere it can delete (e.g. its own home) becomes a delete target. Whiteouts and hardlinks make this worse — a whiteout for `etc/.wh.victim` would `RemoveAll` the host file `/tmp/probe/victim`, and a TypeLink would expose host files inside the extracted rootfs. safeJoin now Lstat-walks every intermediate component of the joined path against the already-extracted tree, refusing if any ancestor is a symlink. Walking is race-free against the extraction loop because we process tar entries serially. Leaf components stay caller-owned (TypeSymlink writes legitimately want a symlink leaf; TypeReg RemoveAll's any prior leaf before opening; etc.). Three new tests pin the protection: write through a symlinked ancestor, whiteout through a symlinked ancestor, and hardlink target through a symlinked ancestor — each must fail and leave the host probe path untouched. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
340 lines
10 KiB
Go
340 lines
10 KiB
Go
package imagepull
|
|
|
|
import (
|
|
"archive/tar"
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
const (
	// whiteoutPrefix is the basename prefix of a per-path whiteout entry:
	// an entry named ".wh.<name>" asks for <name> in the same directory to
	// be removed (see applyEntry).
	whiteoutPrefix = ".wh."
	// whiteoutOpaque marks the parent directory as opaque: every entry
	// from previous layers should be removed, but entries from the
	// current layer (siblings of this marker) are preserved.
	//
	// NOTE(review): applyEntry clears whatever is on disk under the parent
	// at the moment the marker is read, so siblings from the current layer
	// only survive if the marker precedes them in the layer's tar stream.
	whiteoutOpaque = ".wh..wh..opq"
)
|
|
|
|
// FileMeta captures the per-file metadata we need to reconstruct after
// mkfs.ext4 has placed the bytes on disk. Uid/Gid/Mode come straight
// from the tar header; mode carries the full set of permission bits
// including setuid/setgid/sticky.
type FileMeta struct {
	Uid  int   // owner user ID, copied from the tar header
	Gid  int   // owner group ID, copied from the tar header
	Mode int64 // tar header mode (perm + setuid/sgid/sticky)
	Type byte  // tar typeflag (TypeReg, TypeDir, TypeSymlink, …)
}
|
|
|
|
// Metadata records ownership/mode for the paths that made it into
// destDir. Keys are relative to destDir, never starting with "/".
// When several layers provide the same path, the entry written by the
// latest layer replaces the earlier one — later layers shadow earlier
// ones. Entries is a plain map, so iteration order is unspecified.
type Metadata struct {
	// Entries maps a cleaned, destDir-relative path to the metadata taken
	// from the tar header that last produced that path.
	Entries map[string]FileMeta
}
|
|
|
|
func newMetadata() Metadata {
|
|
return Metadata{Entries: make(map[string]FileMeta)}
|
|
}
|
|
|
|
// FlattenTar reads a single flat tar stream (e.g. the output of
|
|
// `docker export`) into destDir, returning per-file metadata. Unlike
|
|
// Flatten this does NOT treat the input as OCI-layered — there are no
|
|
// whiteouts, no previous layers. Whiteout markers, if they somehow
|
|
// appear, are still handled by applyEntry but should never be present
|
|
// in a docker-export stream.
|
|
//
|
|
// destDir must exist. Path-traversal members and symlink targets that
|
|
// escape destDir are rejected.
|
|
func FlattenTar(ctx context.Context, r io.Reader, destDir string) (Metadata, error) {
|
|
meta := newMetadata()
|
|
absDest, err := filepath.Abs(destDir)
|
|
if err != nil {
|
|
return meta, err
|
|
}
|
|
if err := ctx.Err(); err != nil {
|
|
return meta, err
|
|
}
|
|
tr := tar.NewReader(r)
|
|
for {
|
|
if err := ctx.Err(); err != nil {
|
|
return meta, err
|
|
}
|
|
hdr, err := tr.Next()
|
|
if err == io.EOF {
|
|
return meta, nil
|
|
}
|
|
if err != nil {
|
|
return meta, fmt.Errorf("read tar entry: %w", err)
|
|
}
|
|
if err := applyEntry(tr, hdr, absDest, &meta); err != nil {
|
|
return meta, err
|
|
}
|
|
}
|
|
}
|
|
|
|
// Flatten replays the image's layers in oldest-first order into destDir
|
|
// and returns a Metadata record of each surviving file's tar-header
|
|
// ownership/mode. destDir must exist and ideally be empty. Path-traversal
|
|
// members and symlink targets that escape destDir are rejected.
|
|
//
|
|
// The returned Metadata feeds ApplyOwnership: Go's unprivileged
|
|
// extraction can't set real uids/gids on disk, but a debugfs pass over
|
|
// the final ext4 can.
|
|
func Flatten(ctx context.Context, img PulledImage, destDir string) (Metadata, error) {
|
|
meta := newMetadata()
|
|
absDest, err := filepath.Abs(destDir)
|
|
if err != nil {
|
|
return meta, err
|
|
}
|
|
layers, err := img.Image.Layers()
|
|
if err != nil {
|
|
return meta, fmt.Errorf("read layers: %w", err)
|
|
}
|
|
for i, layer := range layers {
|
|
if err := ctx.Err(); err != nil {
|
|
return meta, err
|
|
}
|
|
if err := applyLayer(layer, absDest, &meta); err != nil {
|
|
return meta, fmt.Errorf("apply layer %d/%d: %w", i+1, len(layers), err)
|
|
}
|
|
}
|
|
return meta, nil
|
|
}
|
|
|
|
func applyLayer(layer interface {
|
|
Uncompressed() (io.ReadCloser, error)
|
|
}, dest string, meta *Metadata) error {
|
|
rc, err := layer.Uncompressed()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer rc.Close()
|
|
|
|
tr := tar.NewReader(rc)
|
|
for {
|
|
hdr, err := tr.Next()
|
|
if err == io.EOF {
|
|
return nil
|
|
}
|
|
if err != nil {
|
|
return fmt.Errorf("read tar entry: %w", err)
|
|
}
|
|
if err := applyEntry(tr, hdr, dest, meta); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string, meta *Metadata) error {
|
|
rel := filepath.Clean(hdr.Name)
|
|
if rel == "." || rel == string(filepath.Separator) {
|
|
return nil
|
|
}
|
|
if filepath.IsAbs(rel) || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
|
|
return fmt.Errorf("unsafe path in layer: %q", hdr.Name)
|
|
}
|
|
if err := validateDebugFSPath(rel); err != nil {
|
|
return err
|
|
}
|
|
|
|
base := filepath.Base(rel)
|
|
parent := filepath.Dir(rel)
|
|
|
|
// Whiteouts come in two flavors: opaque-dir markers and per-file
|
|
// deletes. Both are resolved relative to the parent directory.
|
|
// Whiteouts erase metadata for the victim path(s).
|
|
if base == whiteoutOpaque {
|
|
parentAbs, err := safeJoin(dest, parent)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Drop metadata entries whose path is under parent.
|
|
prefix := parent + "/"
|
|
for k := range meta.Entries {
|
|
if parent == "." || parent == "" || strings.HasPrefix(k, prefix) {
|
|
delete(meta.Entries, k)
|
|
}
|
|
}
|
|
return clearDirContents(parentAbs)
|
|
}
|
|
if strings.HasPrefix(base, whiteoutPrefix) {
|
|
target := strings.TrimPrefix(base, whiteoutPrefix)
|
|
victim, err := safeJoin(dest, filepath.Join(parent, target))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
victimKey := filepath.Clean(filepath.Join(parent, target))
|
|
delete(meta.Entries, victimKey)
|
|
victimPrefix := victimKey + "/"
|
|
for k := range meta.Entries {
|
|
if strings.HasPrefix(k, victimPrefix) {
|
|
delete(meta.Entries, k)
|
|
}
|
|
}
|
|
if err := os.RemoveAll(victim); err != nil && !errors.Is(err, os.ErrNotExist) {
|
|
return fmt.Errorf("apply whiteout %s: %w", hdr.Name, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
abs, err := safeJoin(dest, rel)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
switch hdr.Typeflag {
|
|
case tar.TypeDir:
|
|
if err := os.MkdirAll(abs, 0o755); err != nil {
|
|
return err
|
|
}
|
|
meta.Entries[rel] = FileMeta{Uid: hdr.Uid, Gid: hdr.Gid, Mode: hdr.Mode, Type: tar.TypeDir}
|
|
return nil
|
|
case tar.TypeReg:
|
|
if err := os.MkdirAll(filepath.Dir(abs), 0o755); err != nil {
|
|
return err
|
|
}
|
|
// Replace any prior file/dir in this slot — later layers
|
|
// shadow earlier ones.
|
|
if err := os.RemoveAll(abs); err != nil && !errors.Is(err, os.ErrNotExist) {
|
|
return err
|
|
}
|
|
f, err := os.OpenFile(abs, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(hdr.Mode)|0o600)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if _, err := io.Copy(f, tr); err != nil {
|
|
_ = f.Close()
|
|
return err
|
|
}
|
|
if err := f.Close(); err != nil {
|
|
return err
|
|
}
|
|
meta.Entries[rel] = FileMeta{Uid: hdr.Uid, Gid: hdr.Gid, Mode: hdr.Mode, Type: tar.TypeReg}
|
|
return nil
|
|
case tar.TypeSymlink:
|
|
if err := os.MkdirAll(filepath.Dir(abs), 0o755); err != nil {
|
|
return err
|
|
}
|
|
// Container layers commonly use absolute symlink targets like
|
|
// "/usr/bin/mawk" — these are interpreted relative to the
|
|
// rootfs (`/` inside the eventual VM), so they're rooted at
|
|
// dest by construction and need no escape check.
|
|
// Relative targets, however, can escape with "../"s and must
|
|
// be checked against dest at write time (we never follow them
|
|
// during extraction, but a future caller might).
|
|
if !filepath.IsAbs(hdr.Linkname) {
|
|
resolved := filepath.Clean(filepath.Join(filepath.Dir(abs), hdr.Linkname))
|
|
if resolved != dest && !strings.HasPrefix(resolved, dest+string(filepath.Separator)) {
|
|
return fmt.Errorf("unsafe symlink in layer: %q -> %q", hdr.Name, hdr.Linkname)
|
|
}
|
|
}
|
|
if err := os.RemoveAll(abs); err != nil && !errors.Is(err, os.ErrNotExist) {
|
|
return err
|
|
}
|
|
if err := os.Symlink(hdr.Linkname, abs); err != nil {
|
|
return err
|
|
}
|
|
meta.Entries[rel] = FileMeta{Uid: hdr.Uid, Gid: hdr.Gid, Mode: hdr.Mode, Type: tar.TypeSymlink}
|
|
return nil
|
|
case tar.TypeLink:
|
|
// Hardlink: target must already exist inside dest from this or
|
|
// a previous layer, and must not escape.
|
|
linkTarget, err := safeJoin(dest, filepath.Clean(hdr.Linkname))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if _, err := os.Lstat(linkTarget); err != nil {
|
|
return fmt.Errorf("hardlink target %q missing: %w", hdr.Linkname, err)
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(abs), 0o755); err != nil {
|
|
return err
|
|
}
|
|
if err := os.RemoveAll(abs); err != nil && !errors.Is(err, os.ErrNotExist) {
|
|
return err
|
|
}
|
|
return os.Link(linkTarget, abs)
|
|
default:
|
|
// TypeChar / TypeBlock / TypeFifo / TypeXGlobalHeader / etc.
|
|
// Container layers occasionally include /dev nodes — they need
|
|
// privilege we don't have. Skip silently; udev/devtmpfs in the
|
|
// guest will create them at boot.
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// safeJoin returns dest joined with rel (both cleaned) after verifying:
//
//  1. The cleaned result lies textually under dest (catches "../escape").
//  2. No INTERMEDIATE component of the result is a symlink (catches the
//     OCI extraction-escape attack: a layer plants `etc -> /etc`, then a
//     later layer writes `etc/passwd` — without this walk the kernel
//     would dereference the symlink and the operation would land at
//     /etc/passwd on the host, not at <dest>/etc/passwd).
//
// dest is cleaned before the comparison, so a trailing separator or a
// dest of "/" does not cause every path to be reported as escaping.
// When rel resolves to dest itself, dest is returned without any walk.
//
// The leaf component is intentionally NOT Lstat'd here: it may legitimately
// be a symlink (TypeSymlink entries), a missing file (TypeReg about to be
// created), or an existing entry that the caller will RemoveAll before
// re-creating. Leaf type is the caller's contract — in particular, a
// caller that goes on to treat the leaf as a directory must check it.
//
// Walking against the already-extracted tree is race-free in practice:
// the only mutator is this same extraction loop, and we're processing
// entries serially.
//
// Errors: a path that leaves dest or crosses a symlinked ancestor yields
// an "unsafe path" error; any Lstat failure other than "does not exist"
// is returned unchanged.
func safeJoin(dest, rel string) (string, error) {
	sep := string(filepath.Separator)
	root := filepath.Clean(dest)
	joined := filepath.Join(root, rel)
	if joined == root {
		return joined, nil
	}

	// The filesystem root already ends in a separator; appending another
	// would produce a prefix ("//") that nothing matches.
	prefix := root
	if !strings.HasSuffix(prefix, sep) {
		prefix += sep
	}
	if !strings.HasPrefix(joined, prefix) {
		return "", fmt.Errorf("unsafe path: %q escapes %q", rel, dest)
	}

	// joined is clean and strictly below root, so the split yields at
	// least one non-empty segment; the last one is the caller-owned leaf.
	segs := strings.Split(strings.TrimPrefix(joined, prefix), sep)
	cur := root
	for _, seg := range segs[:len(segs)-1] {
		cur = filepath.Join(cur, seg)
		info, err := os.Lstat(cur)
		if err != nil {
			if errors.Is(err, os.ErrNotExist) {
				// Ancestor not yet materialised, so nothing deeper can
				// exist either. Once an extraction op creates it (via
				// this same routed code), it can't be a symlink —
				// TypeSymlink writes go through this validator too.
				return joined, nil
			}
			return "", err
		}
		if info.Mode()&os.ModeSymlink != 0 {
			return "", fmt.Errorf("unsafe path: ancestor %q of %q is a symlink", cur, rel)
		}
	}
	return joined, nil
}
|
|
|
|
// clearDirContents empties dir without removing dir itself: every child
// is deleted recursively. If dir does not exist it is created (with any
// missing parents) so that callers end up with an empty directory either
// way. Used for opaque-whiteout markers.
//
// NOTE(review): dir is listed and its children are addressed by path, so
// a symlink sitting at dir would be followed; callers are expected to
// have ruled that out.
func clearDirContents(dir string) error {
	children, err := os.ReadDir(dir)
	switch {
	case err == nil:
		// Directory listed; fall through to the removal loop.
	case errors.Is(err, os.ErrNotExist):
		return os.MkdirAll(dir, 0o755)
	default:
		return err
	}

	for i := range children {
		child := filepath.Join(dir, children[i].Name())
		if rmErr := os.RemoveAll(child); rmErr != nil {
			return rmErr
		}
	}
	return nil
}
|