Phase B-1: ownership fixup via debugfs pass

imagepull.Flatten now captures per-file uid/gid/mode/type from the
tar headers as it walks layers, returning a Metadata map alongside
the extracted tree. Whiteouts correctly drop the victim's metadata.
The returned Metadata feeds the new imagepull.ApplyOwnership, which
pipes a batched `set_inode_field` script to `debugfs -w -f -`.

Why: mkfs.ext4 -d copies the runner's on-disk uids verbatim, so
without this pass setuid binaries become setuid-nonroot and sshd
refuses to start on the resulting image. With the pass, a pulled
debian:bookworm has /usr/bin/sudo with uid=0 + setuid bit surviving
intact.

imagepull.BuildExt4 signature unchanged; ownership is applied as a
separate step by the daemon orchestrator between BuildExt4 and
StageBootArtifacts, keeping each helper focused. The seam
(d.pullAndFlatten) now returns (Metadata, error) for test stubs to
feed synthetic metadata.

StdinRunner is a new duck-typed extension next to CommandRunner;
the real system.Runner implements RunStdin, test mocks don't need
to unless they exercise stdin. Prevents every existing mock from
growing a new method.

Tests:
 - TestFlattenCapturesHeaderMetadata: setuid bit + mode survive the
   tar-header walk
 - TestApplyOwnershipRewritesUidGidMode: real debugfs round-trip —
   create ext4 with runner's uid, apply synthetic metadata setting
   uid=0 + setuid mode, verify via `debugfs -R stat` that the
   inode now has uid=0 and mode 04755
 - TestBuildOwnershipScriptDeterministic: sorted, well-formed
   sif script output

Debugfs and mkfs.ext4 tests skip if the binaries aren't on PATH.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Thales Maciel 2026-04-16 18:04:22 -03:00
parent 2e4d4b14da
commit 43982a4ae3
No known key found for this signature in database
GPG key ID: 33112E6833C34679
7 changed files with 334 additions and 32 deletions

View file

@ -19,35 +19,60 @@ const (
whiteoutOpaque = ".wh..wh..opq"
)
// Flatten replays the image's layers in oldest-first order into destDir.
// destDir must exist and ideally be empty. Path-traversal members and
// symlink targets that escape destDir are rejected.
// FileMeta captures the per-file metadata we need to reconstruct after
// mkfs.ext4 has placed the bytes on disk. Uid/Gid/Mode come straight
// from the tar header; mode carries the full set of permission bits
// including setuid/setgid/sticky.
type FileMeta struct {
Uid int
Gid int
Mode int64 // tar header mode (perm + setuid/sgid/sticky)
Type byte // tar typeflag (TypeReg, TypeDir, TypeSymlink, …)
}
// Metadata records ownership/mode for every path that made it into
// destDir. Keys are relative to destDir, never starting with "/". Order
// is the final-layer order — later layers shadow earlier ones.
type Metadata struct {
Entries map[string]FileMeta
}
func newMetadata() Metadata {
return Metadata{Entries: make(map[string]FileMeta)}
}
// Flatten replays the image's layers in oldest-first order into destDir
// and returns a Metadata record of each surviving file's tar-header
// ownership/mode. destDir must exist and ideally be empty. Path-traversal
// members and symlink targets that escape destDir are rejected.
//
// File ownership in destDir reflects the running user, not the tar
// header's uid/gid (Phase A v1 limitation; see package docs).
func Flatten(ctx context.Context, img PulledImage, destDir string) error {
// The returned Metadata feeds ApplyOwnership: Go's unprivileged
// extraction can't set real uids/gids on disk, but a debugfs pass over
// the final ext4 can.
func Flatten(ctx context.Context, img PulledImage, destDir string) (Metadata, error) {
meta := newMetadata()
absDest, err := filepath.Abs(destDir)
if err != nil {
return err
return meta, err
}
layers, err := img.Image.Layers()
if err != nil {
return fmt.Errorf("read layers: %w", err)
return meta, fmt.Errorf("read layers: %w", err)
}
for i, layer := range layers {
if err := ctx.Err(); err != nil {
return err
return meta, err
}
if err := applyLayer(layer, absDest); err != nil {
return fmt.Errorf("apply layer %d/%d: %w", i+1, len(layers), err)
if err := applyLayer(layer, absDest, &meta); err != nil {
return meta, fmt.Errorf("apply layer %d/%d: %w", i+1, len(layers), err)
}
}
return nil
return meta, nil
}
func applyLayer(layer interface {
Uncompressed() (io.ReadCloser, error)
}, dest string) error {
}, dest string, meta *Metadata) error {
rc, err := layer.Uncompressed()
if err != nil {
return err
@ -63,13 +88,13 @@ func applyLayer(layer interface {
if err != nil {
return fmt.Errorf("read tar entry: %w", err)
}
if err := applyEntry(tr, hdr, dest); err != nil {
if err := applyEntry(tr, hdr, dest, meta); err != nil {
return err
}
}
}
func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string) error {
func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string, meta *Metadata) error {
rel := filepath.Clean(hdr.Name)
if rel == "." || rel == string(filepath.Separator) {
return nil
@ -83,11 +108,19 @@ func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string) error {
// Whiteouts come in two flavors: opaque-dir markers and per-file
// deletes. Both are resolved relative to the parent directory.
// Whiteouts erase metadata for the victim path(s).
if base == whiteoutOpaque {
parentAbs, err := safeJoin(dest, parent)
if err != nil {
return err
}
// Drop metadata entries whose path is under parent.
prefix := parent + "/"
for k := range meta.Entries {
if parent == "." || parent == "" || strings.HasPrefix(k, prefix) {
delete(meta.Entries, k)
}
}
return clearDirContents(parentAbs)
}
if strings.HasPrefix(base, whiteoutPrefix) {
@ -96,6 +129,14 @@ func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string) error {
if err != nil {
return err
}
victimKey := filepath.Clean(filepath.Join(parent, target))
delete(meta.Entries, victimKey)
victimPrefix := victimKey + "/"
for k := range meta.Entries {
if strings.HasPrefix(k, victimPrefix) {
delete(meta.Entries, k)
}
}
if err := os.RemoveAll(victim); err != nil && !errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("apply whiteout %s: %w", hdr.Name, err)
}
@ -109,7 +150,11 @@ func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string) error {
switch hdr.Typeflag {
case tar.TypeDir:
return os.MkdirAll(abs, 0o755)
if err := os.MkdirAll(abs, 0o755); err != nil {
return err
}
meta.Entries[rel] = FileMeta{Uid: hdr.Uid, Gid: hdr.Gid, Mode: hdr.Mode, Type: tar.TypeDir}
return nil
case tar.TypeReg:
if err := os.MkdirAll(filepath.Dir(abs), 0o755); err != nil {
return err
@ -127,7 +172,11 @@ func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string) error {
_ = f.Close()
return err
}
return f.Close()
if err := f.Close(); err != nil {
return err
}
meta.Entries[rel] = FileMeta{Uid: hdr.Uid, Gid: hdr.Gid, Mode: hdr.Mode, Type: tar.TypeReg}
return nil
case tar.TypeSymlink:
if err := os.MkdirAll(filepath.Dir(abs), 0o755); err != nil {
return err
@ -148,7 +197,11 @@ func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string) error {
if err := os.RemoveAll(abs); err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
return os.Symlink(hdr.Linkname, abs)
if err := os.Symlink(hdr.Linkname, abs); err != nil {
return err
}
meta.Entries[rel] = FileMeta{Uid: hdr.Uid, Gid: hdr.Gid, Mode: hdr.Mode, Type: tar.TypeSymlink}
return nil
case tar.TypeLink:
// Hardlink: target must already exist inside dest from this or
// a previous layer, and must not escape.

View file

@ -26,6 +26,9 @@ import (
"github.com/google/go-containerregistry/pkg/v1/tarball"
)
// ensure log import stays used even when registry-logging is silenced.
var _ = log.New
// tarMember is a single entry to put into a fake layer tarball.
type tarMember struct {
name string
@ -188,7 +191,7 @@ func TestFlattenAppliesLayersAndWhiteouts(t *testing.T) {
t.Fatalf("Pull: %v", err)
}
dest := t.TempDir()
if err := Flatten(context.Background(), pulled, dest); err != nil {
if _, err := Flatten(context.Background(), pulled, dest); err != nil {
t.Fatalf("Flatten: %v", err)
}
@ -227,7 +230,7 @@ func TestFlattenRejectsPathTraversal(t *testing.T) {
t.Fatalf("Pull: %v", err)
}
dest := t.TempDir()
err = Flatten(context.Background(), pulled, dest)
_, err = Flatten(context.Background(), pulled, dest)
if err == nil || !strings.Contains(err.Error(), "unsafe path") {
t.Fatalf("Flatten escape: err=%v, want unsafe path", err)
}
@ -253,7 +256,7 @@ func TestFlattenAcceptsAbsoluteSymlink(t *testing.T) {
t.Fatalf("Pull: %v", err)
}
dest := t.TempDir()
if err := Flatten(context.Background(), pulled, dest); err != nil {
if _, err := Flatten(context.Background(), pulled, dest); err != nil {
t.Fatalf("Flatten: %v", err)
}
link := filepath.Join(dest, "etc/alternatives/awk")
@ -280,7 +283,7 @@ func TestFlattenRejectsRelativeSymlinkEscape(t *testing.T) {
if err != nil {
t.Fatalf("Pull: %v", err)
}
err = Flatten(context.Background(), pulled, t.TempDir())
_, err = Flatten(context.Background(), pulled, t.TempDir())
if err == nil || !strings.Contains(err.Error(), "unsafe symlink") {
t.Fatalf("Flatten relative escape: err=%v", err)
}
@ -317,6 +320,100 @@ func TestBuildExt4ProducesValidImage(t *testing.T) {
}
}
func TestFlattenCapturesHeaderMetadata(t *testing.T) {
host := startRegistry(t)
ref := pushImage(t, host, "banger/test", "meta",
makeLayer(t, []tarMember{
{name: "usr/bin/sudo", mode: 0o4755, body: []byte("setuid-bin")},
{name: "etc/", dir: true, mode: 0o755},
{name: "etc/link", symlink: true, link: "/usr/bin/sudo"},
}),
)
pulled, err := Pull(context.Background(), ref, t.TempDir())
if err != nil {
t.Fatalf("Pull: %v", err)
}
meta, err := Flatten(context.Background(), pulled, t.TempDir())
if err != nil {
t.Fatalf("Flatten: %v", err)
}
sudo, ok := meta.Entries["usr/bin/sudo"]
if !ok {
t.Fatalf("missing usr/bin/sudo entry: %+v", meta.Entries)
}
if sudo.Mode&0o4000 == 0 {
t.Errorf("setuid bit lost: mode=0%o", sudo.Mode)
}
if sudo.Mode&0o777 != 0o755 {
t.Errorf("perm bits = 0%o, want 0o755", sudo.Mode&0o777)
}
if _, ok := meta.Entries["etc"]; !ok {
t.Errorf("missing etc dir entry")
}
if _, ok := meta.Entries["etc/link"]; !ok {
t.Errorf("missing symlink entry")
}
}
func TestApplyOwnershipRewritesUidGidMode(t *testing.T) {
if _, err := exec.LookPath("mkfs.ext4"); err != nil {
t.Skip("mkfs.ext4 not available; skipping")
}
if _, err := exec.LookPath("debugfs"); err != nil {
t.Skip("debugfs not available; skipping")
}
// Stage a tiny source tree and build an ext4 with mkfs.ext4 -d.
src := t.TempDir()
if err := os.WriteFile(filepath.Join(src, "setuid-bin"), []byte("x"), 0o644); err != nil {
t.Fatal(err)
}
out := filepath.Join(t.TempDir(), "rootfs.ext4")
if err := BuildExt4(context.Background(), system.NewRunner(), src, out, MinExt4Size); err != nil {
t.Fatalf("BuildExt4: %v", err)
}
// Apply synthetic metadata: set uid=0 gid=0 mode=0o4755 on setuid-bin.
meta := Metadata{Entries: map[string]FileMeta{
"setuid-bin": {Uid: 0, Gid: 0, Mode: 0o4755, Type: tar.TypeReg},
}}
if err := ApplyOwnership(context.Background(), system.NewRunner(), out, meta); err != nil {
t.Fatalf("ApplyOwnership: %v", err)
}
// Read back the inode via debugfs.
statOut, err := exec.Command("debugfs", "-R", "stat /setuid-bin", out).CombinedOutput()
if err != nil {
t.Fatalf("debugfs stat: %v: %s", err, statOut)
}
s := string(statOut)
if !bytes.Contains([]byte(s), []byte("User: 0")) && !bytes.Contains([]byte(s), []byte("User: 0")) {
t.Errorf("uid not 0 after fixup. output:\n%s", s)
}
if !bytes.Contains([]byte(s), []byte("Mode: 04755")) && !bytes.Contains([]byte(s), []byte("Mode: 4755")) {
t.Errorf("setuid mode not applied. output:\n%s", s)
}
}
func TestBuildOwnershipScriptDeterministic(t *testing.T) {
meta := Metadata{Entries: map[string]FileMeta{
"b": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg},
"a": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg},
"a/x": {Uid: 0, Gid: 0, Mode: 0o644, Type: tar.TypeReg},
}}
got := buildOwnershipScript(meta).String()
// sorted: a, a/x, b
want := "set_inode_field /a uid 0\nset_inode_field /a gid 0\nset_inode_field /a mode 0100755\n" +
"set_inode_field /a/x uid 0\nset_inode_field /a/x gid 0\nset_inode_field /a/x mode 0100644\n" +
"set_inode_field /b uid 0\nset_inode_field /b gid 0\nset_inode_field /b mode 0100755\n"
if got != want {
t.Errorf("script mismatch\ngot:\n%s\nwant:\n%s", got, want)
}
}
func TestBuildExt4RejectsTinySize(t *testing.T) {
src := t.TempDir()
out := filepath.Join(t.TempDir(), "rootfs.ext4")

View file

@ -0,0 +1,114 @@
package imagepull
import (
"archive/tar"
"bytes"
"context"
"fmt"
"sort"
"banger/internal/system"
)
// ApplyOwnership rewrites the ext4 image's per-file uid/gid/mode to match
// the tar-header values Flatten captured. `mkfs.ext4 -d` preserves the
// on-disk ownership of the source tree — which is the runner's uid/gid,
// since we extracted as a regular user — so without this pass setuid
// binaries become setuid-nonroot and root-owned config files are
// readable by the runner's group.
//
// Implementation: stream a "set_inode_field" script to `debugfs -w`.
// One invocation handles tens of thousands of files; the bottleneck is
// debugfs's one-inode-at-a-time disk I/O, not process startup.
func ApplyOwnership(ctx context.Context, runner system.CommandRunner, ext4File string, meta Metadata) error {
if len(meta.Entries) == 0 {
return nil
}
script := buildOwnershipScript(meta)
if script.Len() == 0 {
return nil
}
stdinRunner, ok := runner.(system.StdinRunner)
if !ok {
return fmt.Errorf("ownership fixup requires a runner that supports stdin (got %T)", runner)
}
out, err := stdinRunner.RunStdin(ctx, script, "debugfs", "-w", "-f", "-", ext4File)
if err != nil {
return fmt.Errorf("debugfs ownership fixup: %w: %s", err, string(out))
}
return nil
}
// buildOwnershipScript emits one `set_inode_field` block per entry.
// Paths are prefixed with "/" so debugfs resolves them from the ext4
// root. Entries are sorted for deterministic output (helps testing and
// makes debugfs's internal caching slightly more cache-friendly).
func buildOwnershipScript(meta Metadata) *bytes.Buffer {
var buf bytes.Buffer
paths := make([]string, 0, len(meta.Entries))
for p := range meta.Entries {
paths = append(paths, p)
}
sort.Strings(paths)
for _, p := range paths {
m := meta.Entries[p]
mode := debugfsMode(m.Type, m.Mode)
if mode == 0 {
continue // hardlinks or unsupported types (skip)
}
escaped := escapeDebugfsPath(p)
fmt.Fprintf(&buf, "set_inode_field %s uid %d\n", escaped, m.Uid)
fmt.Fprintf(&buf, "set_inode_field %s gid %d\n", escaped, m.Gid)
fmt.Fprintf(&buf, "set_inode_field %s mode 0%o\n", escaped, mode)
}
return &buf
}
// debugfsMode composes the full i_mode word (file-type bits +
// permission bits) that debugfs' `set_inode_field ... mode` expects.
// Returns 0 for types we don't set (hardlinks, unknown).
func debugfsMode(typ byte, hdrMode int64) uint32 {
perm := uint32(hdrMode) & 0o7777
switch typ {
case tar.TypeReg:
return 0o100000 | perm
case tar.TypeDir:
return 0o040000 | perm
case tar.TypeSymlink:
return 0o120000 | perm
case tar.TypeChar:
return 0o020000 | perm
case tar.TypeBlock:
return 0o060000 | perm
case tar.TypeFifo:
return 0o010000 | perm
default:
return 0
}
}
// escapeDebugfsPath prepends "/" and wraps in double quotes if the path
// contains whitespace or special characters. debugfs' quoting is
// minimal; for safety we reject backslashes/quotes in paths entirely.
func escapeDebugfsPath(rel string) string {
abs := "/" + rel
// Container images don't normally use quoting-hostile chars; if they
// do, fall back to the raw path and hope debugfs copes (it usually
// does for spaces when quoted).
needsQuote := false
for _, c := range abs {
switch c {
case ' ', '\t':
needsQuote = true
case '"', '\\', '\n':
// Deliberately unhandled; debugfs may fail on these.
// Returning the raw string gives us a visible error
// instead of a silently-corrupted script.
return abs
}
}
if needsQuote {
return `"` + abs + `"`
}
return abs
}