daemon: persist teardown fallbacks and reject unsafe import paths

Preserve cleanup after daemon restarts and harden OCI and tar imports
against filenames that debugfs cannot encode safely.

Mirror tap, loop, and dm teardown identity onto VM.Runtime, teach
cleanup and reconcile to fall back to those persisted fields when
handles.json is missing or corrupt, and clear the recovery state on
stop, error, and delete paths.

Reject debugfs-hostile entry names during flattening and in
ApplyOwnership itself, then add regression coverage for corrupt
handles.json recovery and unsafe import paths.

Verified with targeted go tests, make lint-go, make lint-shell, and
make build.
This commit is contained in:
Thales Maciel 2026-04-23 16:21:59 -03:00
parent 86a56fedb3
commit d743a8ba4b
No known key found for this signature in database
GPG key ID: 33112E6833C34679
15 changed files with 272 additions and 81 deletions

View file

@ -28,7 +28,7 @@ banger image pull ghcr.io/myorg/devimg:v2 --kernel-ref generic-6.12
- Any public OCI image that exposes a `linux/amd64` manifest. - Any public OCI image that exposes a `linux/amd64` manifest.
- Correct layer replay with whiteout semantics (`.wh.*` deletes, - Correct layer replay with whiteout semantics (`.wh.*` deletes,
`.wh..wh..opq` opaque-dir markers). `.wh..wh..opq` opaque-dir markers).
- Path-traversal and relative-symlink-escape protection. - Path-traversal, debugfs-hostile filename, and relative-symlink-escape protection.
- Content-aware default sizing (`content × 1.5`, floor 1 GiB). - Content-aware default sizing (`content × 1.5`, floor 1 GiB).
- Layer caching on disk, keyed by blob sha256. - Layer caching on disk, keyed by blob sha256.
- **Ownership preservation** — tar-header uid/gid/mode captured - **Ownership preservation** — tar-header uid/gid/mode captured
@ -67,8 +67,9 @@ banger image pull ghcr.io/myorg/devimg:v2 --kernel-ref generic-6.12
`linux/amd64` platform pinned. Layer blobs cache under `linux/amd64` platform pinned. Layer blobs cache under
`~/.cache/banger/oci/blobs/` and populate lazily during flatten. `~/.cache/banger/oci/blobs/` and populate lazily during flatten.
- **`Flatten`** replays layers oldest-first into a staging directory, - **`Flatten`** replays layers oldest-first into a staging directory,
applies whiteouts, rejects unsafe paths. Returns a `Metadata` map applies whiteouts, rejects unsafe paths plus filenames that banger's
of per-file uid/gid/mode from tar headers. debugfs ownership fixup cannot encode safely. Returns a `Metadata`
map of per-file uid/gid/mode from tar headers.
- **`BuildExt4`** runs `mkfs.ext4 -F -d <staging> -E root_owner=0:0` - **`BuildExt4`** runs `mkfs.ext4 -F -d <staging> -E root_owner=0:0`
at the size of the pre-truncated file — no mount, no sudo, no at the size of the pre-truncated file — no mount, no sudo, no
loopback. Requires `e2fsprogs ≥ 1.43`. loopback. Requires `e2fsprogs ≥ 1.43`.

View file

@ -323,7 +323,7 @@ func (d *Daemon) reconcile(ctx context.Context) error {
_ = d.vm.cleanupRuntime(ctx, vm, true) _ = d.vm.cleanupRuntime(ctx, vm, true)
vm.State = model.VMStateStopped vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped
vm.Runtime.TapDevice = "" clearRuntimeTeardownState(&vm)
d.vm.clearVMHandles(vm) d.vm.clearVMHandles(vm)
vm.UpdatedAt = model.Now() vm.UpdatedAt = model.Now()
return d.store.UpsertVM(ctx, vm) return d.store.UpsertVM(ctx, vm)

View file

@ -216,7 +216,7 @@ func (s *StatsService) stopStaleVMs(ctx context.Context) (err error) {
_ = s.cleanupRuntime(ctx, vm, true) _ = s.cleanupRuntime(ctx, vm, true)
vm.State = model.VMStateStopped vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped
vm.Runtime.TapDevice = "" clearRuntimeTeardownState(&vm)
vm.UpdatedAt = model.Now() vm.UpdatedAt = model.Now()
return s.store.UpsertVM(ctx, vm) return s.store.UpsertVM(ctx, vm)
}); err != nil { }); err != nil {

View file

@ -52,6 +52,48 @@ func (s *VMService) rebuildDNS(ctx context.Context) error {
return s.net.replaceDNS(records) return s.net.replaceDNS(records)
} }
// persistRuntimeTeardownState mirrors the teardown-critical handle
// fields (tap, loop devices, dm name/dev) onto vm.Runtime so cleanup
// can recover them after a daemon restart when the handles.json
// scratch file is missing or corrupt. A nil vm is a no-op.
func persistRuntimeTeardownState(vm *model.VMRecord, h model.VMHandles) {
	if vm == nil {
		return
	}
	rt := &vm.Runtime
	rt.TapDevice = h.TapDevice
	rt.BaseLoop = h.BaseLoop
	rt.COWLoop = h.COWLoop
	rt.DMName = h.DMName
	rt.DMDev = h.DMDev
}
// clearRuntimeTeardownState drops the teardown fallback fields from
// vm.Runtime once host-side cleanup has released the underlying
// resources, so a later pass does not tear down devices the VM no
// longer owns. A nil vm is a no-op.
func clearRuntimeTeardownState(vm *model.VMRecord) {
	// Clearing is just mirroring an empty handle set.
	persistRuntimeTeardownState(vm, model.VMHandles{})
}
// teardownHandlesForCleanup merges the live handle-cache view with the
// teardown fields persisted on vm.Runtime: any handle that is blank in
// the cache falls back to the (whitespace-trimmed) persisted value, so
// cleanup still works when handles.json is missing or corrupt. A
// non-blank cached value is returned untouched.
func teardownHandlesForCleanup(vm model.VMRecord, live model.VMHandles) model.VMHandles {
	// orFallback keeps the cached value when it is non-blank and
	// otherwise substitutes the trimmed persisted mirror.
	orFallback := func(cached, persisted string) string {
		if strings.TrimSpace(cached) != "" {
			return cached
		}
		return strings.TrimSpace(persisted)
	}
	recovered := live
	recovered.TapDevice = orFallback(live.TapDevice, vm.Runtime.TapDevice)
	recovered.BaseLoop = orFallback(live.BaseLoop, vm.Runtime.BaseLoop)
	recovered.COWLoop = orFallback(live.COWLoop, vm.Runtime.COWLoop)
	recovered.DMName = orFallback(live.DMName, vm.Runtime.DMName)
	recovered.DMDev = orFallback(live.DMDev, vm.Runtime.DMDev)
	return recovered
}
// cleanupRuntime tears down the host-side state for a VM: firecracker // cleanupRuntime tears down the host-side state for a VM: firecracker
// process, DM snapshot, capabilities, tap, sockets. Lives on VMService // process, DM snapshot, capabilities, tap, sockets. Lives on VMService
// because it reaches into handles (VMService-owned); the capability // because it reaches into handles (VMService-owned); the capability
@ -74,22 +116,19 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, prese
return err return err
} }
} }
handles := teardownHandlesForCleanup(vm, h)
snapshotErr := s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{ snapshotErr := s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
BaseLoop: h.BaseLoop, BaseLoop: handles.BaseLoop,
COWLoop: h.COWLoop, COWLoop: handles.COWLoop,
DMName: h.DMName, DMName: handles.DMName,
DMDev: h.DMDev, DMDev: handles.DMDev,
}) })
featureErr := s.capHooks.cleanupState(ctx, vm) featureErr := s.capHooks.cleanupState(ctx, vm)
var tapErr error var tapErr error
// Prefer the handle cache (fresh from startVMLocked), but fall // Prefer the handle cache (fresh from startVMLocked), but fall
// back to Runtime.TapDevice — persisted to the DB in the same // back to the VMRuntime mirrors so restart-time cleanup still works
// stage — so a daemon restart or corrupt handles.json doesn't // when handles.json is missing or corrupt.
// leak the tap (or the NAT FORWARD rules keyed off it). tap := handles.TapDevice
tap := h.TapDevice
if tap == "" {
tap = vm.Runtime.TapDevice
}
if tap != "" { if tap != "" {
tapErr = s.net.releaseTap(ctx, tap) tapErr = s.net.releaseTap(ctx, tap)
} }

View file

@ -124,7 +124,8 @@ func (s *VMService) setVMHandlesInMemory(vmID string, h model.VMHandles) {
// vmHandles returns the cached handles for vm (zero-value if no // vmHandles returns the cached handles for vm (zero-value if no
// entry). The in-process handle cache is the authoritative source // entry). The in-process handle cache is the authoritative source
// for PID / loops / dm-name — VMRecord.Runtime holds only paths. // for PID and live kernel/network handles; VMRecord.Runtime only
// mirrors teardown-critical fields for restart recovery.
func (s *VMService) vmHandles(vmID string) model.VMHandles { func (s *VMService) vmHandles(vmID string) model.VMHandles {
if s == nil { if s == nil {
return model.VMHandles{} return model.VMHandles{}
@ -134,13 +135,15 @@ func (s *VMService) vmHandles(vmID string) model.VMHandles {
return h return h
} }
// setVMHandles updates the in-memory cache AND the per-VM scratch // setVMHandles updates the in-memory cache, mirrors teardown-critical
// file. Scratch-file errors are logged but not returned; the cache // fields onto VMRuntime, and writes the per-VM scratch file.
// write is authoritative while the daemon is alive. // Scratch-file errors are logged but not returned; the cache remains
func (s *VMService) setVMHandles(vm model.VMRecord, h model.VMHandles) { // authoritative while the daemon is alive.
if s == nil { func (s *VMService) setVMHandles(vm *model.VMRecord, h model.VMHandles) {
if s == nil || vm == nil {
return return
} }
persistRuntimeTeardownState(vm, h)
s.ensureHandleCache() s.ensureHandleCache()
s.handles.set(vm.ID, h) s.handles.set(vm.ID, h)
if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && s.logger != nil { if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && s.logger != nil {

View file

@ -36,6 +36,30 @@ func TestHandlesFileRoundtrip(t *testing.T) {
} }
} }
// TestSetVMHandlesMirrorsRuntimeTeardownState verifies that
// setVMHandles copies every teardown-critical handle field onto
// vm.Runtime so restart-time cleanup can fall back to the VM row.
func TestSetVMHandlesMirrorsRuntimeTeardownState(t *testing.T) {
	t.Parallel()
	d := &Daemon{}
	wireServices(d)
	vm := testVM("mirror", "image-mirror", "172.16.0.77")
	vm.Runtime.VMDir = t.TempDir()
	want := model.VMHandles{
		TapDevice: "tap-fc-0077",
		BaseLoop:  "/dev/loop17",
		COWLoop:   "/dev/loop18",
		DMName:    "fc-rootfs-0077",
		DMDev:     "/dev/mapper/fc-rootfs-0077",
	}
	d.vm.setVMHandles(&vm, want)
	mirrored := []struct{ field, got, want string }{
		{"TapDevice", vm.Runtime.TapDevice, want.TapDevice},
		{"BaseLoop", vm.Runtime.BaseLoop, want.BaseLoop},
		{"COWLoop", vm.Runtime.COWLoop, want.COWLoop},
		{"DMName", vm.Runtime.DMName, want.DMName},
		{"DMDev", vm.Runtime.DMDev, want.DMDev},
	}
	for _, m := range mirrored {
		if m.got != m.want {
			t.Errorf("Runtime.%s = %q, want %q", m.field, m.got, m.want)
		}
	}
}
func TestHandlesFileMissingReturnsZero(t *testing.T) { func TestHandlesFileMissingReturnsZero(t *testing.T) {
t.Parallel() t.Parallel()
h, present, err := readHandlesFile(t.TempDir()) h, present, err := readHandlesFile(t.TempDir())

View file

@ -80,7 +80,7 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
vm.State = model.VMStateError vm.State = model.VMStateError
vm.Runtime.State = model.VMStateError vm.Runtime.State = model.VMStateError
vm.Runtime.LastError = runErr.Error() vm.Runtime.LastError = runErr.Error()
vm.Runtime.TapDevice = "" clearRuntimeTeardownState(&vm)
s.clearVMHandles(vm) s.clearVMHandles(vm)
if s.store != nil { if s.store != nil {
_ = s.store.UpsertVM(context.Background(), vm) _ = s.store.UpsertVM(context.Background(), vm)
@ -113,7 +113,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
} }
vm.State = model.VMStateStopped vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped
vm.Runtime.TapDevice = "" clearRuntimeTeardownState(&vm)
s.clearVMHandles(vm) s.clearVMHandles(vm)
if err := s.store.UpsertVM(ctx, vm); err != nil { if err := s.store.UpsertVM(ctx, vm); err != nil {
return model.VMRecord{}, err return model.VMRecord{}, err
@ -138,7 +138,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
} }
vm.State = model.VMStateStopped vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped
vm.Runtime.TapDevice = "" clearRuntimeTeardownState(&vm)
s.clearVMHandles(vm) s.clearVMHandles(vm)
system.TouchNow(&vm) system.TouchNow(&vm)
if err := s.store.UpsertVM(ctx, vm); err != nil { if err := s.store.UpsertVM(ctx, vm); err != nil {
@ -170,7 +170,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
} }
vm.State = model.VMStateStopped vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped
vm.Runtime.TapDevice = "" clearRuntimeTeardownState(&vm)
s.clearVMHandles(vm) s.clearVMHandles(vm)
if err := s.store.UpsertVM(ctx, vm); err != nil { if err := s.store.UpsertVM(ctx, vm); err != nil {
return model.VMRecord{}, err return model.VMRecord{}, err
@ -200,7 +200,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
} }
vm.State = model.VMStateStopped vm.State = model.VMStateStopped
vm.Runtime.State = model.VMStateStopped vm.Runtime.State = model.VMStateStopped
vm.Runtime.TapDevice = "" clearRuntimeTeardownState(&vm)
s.clearVMHandles(vm) s.clearVMHandles(vm)
system.TouchNow(&vm) system.TouchNow(&vm)
if err := s.store.UpsertVM(ctx, vm); err != nil { if err := s.store.UpsertVM(ctx, vm); err != nil {
@ -262,6 +262,7 @@ func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord)
if err := s.cleanupRuntime(ctx, vm, false); err != nil { if err := s.cleanupRuntime(ctx, vm, false); err != nil {
return model.VMRecord{}, err return model.VMRecord{}, err
} }
clearRuntimeTeardownState(&vm)
op.stage("delete_store_record") op.stage("delete_store_record")
if err := s.store.DeleteVM(ctx, vm.ID); err != nil { if err := s.store.DeleteVM(ctx, vm.ID); err != nil {
return model.VMRecord{}, err return model.VMRecord{}, err

View file

@ -213,7 +213,7 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS
sc.live.COWLoop = snapHandles.COWLoop sc.live.COWLoop = snapHandles.COWLoop
sc.live.DMName = snapHandles.DMName sc.live.DMName = snapHandles.DMName
sc.live.DMDev = snapHandles.DMDev sc.live.DMDev = snapHandles.DMDev
s.setVMHandles(*sc.vm, *sc.live) s.setVMHandles(sc.vm, *sc.live)
// Fields that used to land next to the (now-deleted) // Fields that used to land next to the (now-deleted)
// cleanupOnErr closure. They belong with the DM // cleanupOnErr closure. They belong with the DM
// snapshot because that's the first step producing // snapshot because that's the first step producing
@ -282,10 +282,7 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS
return err return err
} }
sc.live.TapDevice = tap sc.live.TapDevice = tap
s.setVMHandles(*sc.vm, *sc.live) s.setVMHandles(sc.vm, *sc.live)
// Mirror onto VM.Runtime for NAT teardown resilience
// across daemon crashes — see vm.Runtime.TapDevice docs.
sc.vm.Runtime.TapDevice = tap
return nil return nil
}, },
undo: func(ctx context.Context, sc *startContext) error { undo: func(ctx context.Context, sc *startContext) error {
@ -360,11 +357,11 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS
// PID so the undo can kill it; use a fresh ctx since // PID so the undo can kill it; use a fresh ctx since
// the request ctx may be cancelled by now. // the request ctx may be cancelled by now.
sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock) sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
s.setVMHandles(*sc.vm, *sc.live) s.setVMHandles(sc.vm, *sc.live)
return err return err
} }
sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock) sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
s.setVMHandles(*sc.vm, *sc.live) s.setVMHandles(sc.vm, *sc.live)
op.debugStage("firecracker_started", "pid", sc.live.PID) op.debugStage("firecracker_started", "pid", sc.live.PID)
return nil return nil
}, },

View file

@ -175,6 +175,66 @@ func TestReconcileStopsStaleRunningVMAndClearsRuntimeHandles(t *testing.T) {
} }
} }
// TestReconcileWithCorruptHandlesFileFallsBackToPersistedRuntimeTeardownState
// verifies that reconcile can still tear down a VM's host-side state
// (dm snapshot, loop devices, tap) when the per-VM handles.json scratch
// file is unreadable, by falling back to the teardown fields mirrored
// onto vm.Runtime. It also checks that the persisted fallback fields
// are cleared and the corrupt scratch file is removed afterwards.
func TestReconcileWithCorruptHandlesFileFallsBackToPersistedRuntimeTeardownState(t *testing.T) {
	t.Parallel()
	ctx := context.Background()
	db := openDaemonStore(t)
	// An existing API socket path makes the VM look previously running;
	// the scripted pgrep failure below then marks the process dead.
	apiSock := filepath.Join(t.TempDir(), "fc.sock")
	if err := os.WriteFile(apiSock, []byte{}, 0o644); err != nil {
		t.Fatalf("WriteFile(api sock): %v", err)
	}
	vmDir := t.TempDir()
	vm := testVM("corrupt", "image-corrupt", "172.16.0.10")
	vm.State = model.VMStateRunning
	vm.Runtime.State = model.VMStateRunning
	vm.Runtime.APISockPath = apiSock
	vm.Runtime.VMDir = vmDir
	vm.Runtime.DNSName = ""
	// Teardown identity persisted on the VM row — the only recovery
	// source once handles.json is unreadable.
	vm.Runtime.TapDevice = "tap-fc-corrupt"
	vm.Runtime.BaseLoop = "/dev/loop20"
	vm.Runtime.COWLoop = "/dev/loop21"
	vm.Runtime.DMName = "fc-rootfs-corrupt"
	vm.Runtime.DMDev = "/dev/mapper/fc-rootfs-corrupt"
	upsertDaemonVM(t, ctx, db, vm)
	// Corrupt the scratch file so the handle cache cannot be rebuilt
	// from it.
	if err := os.WriteFile(handlesFilePath(vmDir), []byte("{not json"), 0o600); err != nil {
		t.Fatalf("WriteFile(handles.json): %v", err)
	}
	// Expected command sequence, in order: liveness probe fails, then
	// the dm/loop/tap teardown commands run with exactly the persisted
	// identifiers — proving the Runtime fallback was used.
	runner := &scriptedRunner{
		t: t,
		steps: []runnerStep{
			{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")},
			sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-corrupt"),
			sudoStep("", nil, "losetup", "-d", "/dev/loop21"),
			sudoStep("", nil, "losetup", "-d", "/dev/loop20"),
			sudoStep("", nil, "ip", "link", "del", "tap-fc-corrupt"),
		},
	}
	d := &Daemon{store: db, runner: runner}
	wireServices(d)
	if err := d.reconcile(ctx); err != nil {
		t.Fatalf("reconcile: %v", err)
	}
	runner.assertExhausted()
	got, err := db.GetVM(ctx, vm.ID)
	if err != nil {
		t.Fatalf("GetVM: %v", err)
	}
	if got.State != model.VMStateStopped || got.Runtime.State != model.VMStateStopped {
		t.Fatalf("vm state after reconcile = %s/%s, want stopped", got.State, got.Runtime.State)
	}
	// Fallback fields must be cleared so a later cycle doesn't tear
	// down devices this VM no longer owns.
	if got.Runtime.TapDevice != "" || got.Runtime.BaseLoop != "" || got.Runtime.COWLoop != "" || got.Runtime.DMName != "" || got.Runtime.DMDev != "" {
		t.Fatalf("runtime teardown state not cleared after reconcile: %+v", got.Runtime)
	}
	// The corrupt scratch file should be gone after cleanup.
	if _, err := os.Stat(handlesFilePath(vmDir)); !os.IsNotExist(err) {
		t.Fatalf("handles.json still present after reconcile: %v", err)
	}
}
func TestRebuildDNSIncludesOnlyLiveRunningVMs(t *testing.T) { func TestRebuildDNSIncludesOnlyLiveRunningVMs(t *testing.T) {
t.Parallel() t.Parallel()

View file

@ -138,6 +138,9 @@ func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string, meta *Metadata) er
if filepath.IsAbs(rel) || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) { if filepath.IsAbs(rel) || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
return fmt.Errorf("unsafe path in layer: %q", hdr.Name) return fmt.Errorf("unsafe path in layer: %q", hdr.Name)
} }
if err := validateDebugFSPath(rel); err != nil {
return err
}
base := filepath.Base(rel) base := filepath.Base(rel)
parent := filepath.Dir(rel) parent := filepath.Dir(rel)

View file

@ -10,9 +10,9 @@
// and returns a v1.Image whose layer blobs are cached on disk under // and returns a v1.Image whose layer blobs are cached on disk under
// cacheDir/blobs/sha256/<hex> so re-pulls are local. // cacheDir/blobs/sha256/<hex> so re-pulls are local.
// - Flatten replays the layers in order into a staging directory, // - Flatten replays the layers in order into a staging directory,
// applies whiteouts, rejects unsafe paths/symlinks, and returns // applies whiteouts, rejects unsafe paths/symlinks plus filenames
// Metadata capturing the original tar-header uid/gid/mode for // that debugfs can't represent safely, and returns Metadata
// every entry. // capturing the original tar-header uid/gid/mode for every entry.
// - BuildExt4 turns the staging directory into an ext4 file via // - BuildExt4 turns the staging directory into an ext4 file via
// `mkfs.ext4 -F -d` (no mount, no sudo). Root-owns the filesystem // `mkfs.ext4 -F -d` (no mount, no sudo). Root-owns the filesystem
// via `-E root_owner=0:0`. // via `-E root_owner=0:0`.

View file

@ -254,6 +254,30 @@ func TestFlattenRejectsPathTraversal(t *testing.T) {
} }
} }
func TestFlattenRejectsDebugFSHostilePath(t *testing.T) {
img, err := mutate.AppendLayers(empty.Image,
makeLayer(t, []tarMember{
{name: `etc/bad"name`, body: []byte("bad")},
}),
)
if err != nil {
t.Fatalf("AppendLayers: %v", err)
}
pulled := PulledImage{
Reference: "test/debugfs-hostile",
Digest: "sha256:test",
Platform: "linux/amd64",
Image: img,
}
_, err = Flatten(context.Background(), pulled, t.TempDir())
if !errors.Is(err, errUnsafeDebugFSPath) {
t.Fatalf("Flatten hostile path: err=%v, want %v", err, errUnsafeDebugFSPath)
}
if !strings.Contains(err.Error(), `etc/bad\"name`) {
t.Fatalf("Flatten hostile path: err=%v, want offending path", err)
}
}
func TestFlattenAcceptsAbsoluteSymlink(t *testing.T) { func TestFlattenAcceptsAbsoluteSymlink(t *testing.T) {
// Container layers regularly contain absolute symlinks like // Container layers regularly contain absolute symlinks like
// /usr/bin/mawk — they're interpreted relative to the rootfs at // /usr/bin/mawk — they're interpreted relative to the rootfs at
@ -303,6 +327,19 @@ func TestFlattenRejectsRelativeSymlinkEscape(t *testing.T) {
} }
} }
func TestFlattenTarRejectsDebugFSHostilePath(t *testing.T) {
tarData := buildTar(t, []tarMember{
{name: "etc/bad\tname", body: []byte("bad")},
})
_, err := FlattenTar(context.Background(), bytes.NewReader(tarData), t.TempDir())
if !errors.Is(err, errUnsafeDebugFSPath) {
t.Fatalf("FlattenTar hostile path: err=%v, want %v", err, errUnsafeDebugFSPath)
}
if !strings.Contains(err.Error(), `etc/bad\tname`) {
t.Fatalf("FlattenTar hostile path: err=%v, want offending path", err)
}
}
func TestBuildExt4ProducesValidImage(t *testing.T) { func TestBuildExt4ProducesValidImage(t *testing.T) {
if _, err := exec.LookPath("mkfs.ext4"); err != nil { if _, err := exec.LookPath("mkfs.ext4"); err != nil {
t.Skip("mkfs.ext4 not available; skipping") t.Skip("mkfs.ext4 not available; skipping")
@ -412,13 +449,30 @@ func TestApplyOwnershipRewritesUidGidMode(t *testing.T) {
} }
} }
// TestApplyOwnershipRejectsUnsafeMetadataPath verifies that
// ApplyOwnership refuses to build a debugfs ownership script for a
// metadata path containing a control byte (a newline here), returning
// errUnsafeDebugFSPath with the offending path in the message.
func TestApplyOwnershipRejectsUnsafeMetadataPath(t *testing.T) {
	meta := Metadata{Entries: map[string]FileMeta{
		"bad\nname": {Uid: 0, Gid: 0, Mode: 0o644, Type: tar.TypeReg},
	}}
	ext4Path := filepath.Join(t.TempDir(), "rootfs.ext4")
	err := ApplyOwnership(context.Background(), system.NewRunner(), ext4Path, meta)
	if !errors.Is(err, errUnsafeDebugFSPath) {
		t.Fatalf("ApplyOwnership hostile path: err=%v, want %v", err, errUnsafeDebugFSPath)
	}
	// %q-quoted form of the hostile name, as produced by the error.
	if want := `bad\nname`; !strings.Contains(err.Error(), want) {
		t.Fatalf("ApplyOwnership hostile path: err=%v, want %q in message", err, want)
	}
}
func TestBuildOwnershipScriptDeterministic(t *testing.T) { func TestBuildOwnershipScriptDeterministic(t *testing.T) {
meta := Metadata{Entries: map[string]FileMeta{ meta := Metadata{Entries: map[string]FileMeta{
"b": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg}, "b": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg},
"a": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg}, "a": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg},
"a/x": {Uid: 0, Gid: 0, Mode: 0o644, Type: tar.TypeReg}, "a/x": {Uid: 0, Gid: 0, Mode: 0o644, Type: tar.TypeReg},
}} }}
got := buildOwnershipScript(meta).String() gotBuf, err := buildOwnershipScript(meta)
if err != nil {
t.Fatalf("buildOwnershipScript: %v", err)
}
got := gotBuf.String()
// sorted: a, a/x, b // sorted: a, a/x, b
want := "set_inode_field /a uid 0\nset_inode_field /a gid 0\nset_inode_field /a mode 0100755\n" + want := "set_inode_field /a uid 0\nset_inode_field /a gid 0\nset_inode_field /a mode 0100755\n" +
"set_inode_field /a/x uid 0\nset_inode_field /a/x gid 0\nset_inode_field /a/x mode 0100644\n" + "set_inode_field /a/x uid 0\nset_inode_field /a/x gid 0\nset_inode_field /a/x mode 0100644\n" +

View file

@ -4,8 +4,10 @@ import (
"archive/tar" "archive/tar"
"bytes" "bytes"
"context" "context"
"errors"
"fmt" "fmt"
"sort" "sort"
"strings"
"banger/internal/system" "banger/internal/system"
) )
@ -24,7 +26,10 @@ func ApplyOwnership(ctx context.Context, runner system.CommandRunner, ext4File s
if len(meta.Entries) == 0 { if len(meta.Entries) == 0 {
return nil return nil
} }
script := buildOwnershipScript(meta) script, err := buildOwnershipScript(meta)
if err != nil {
return err
}
if script.Len() == 0 { if script.Len() == 0 {
return nil return nil
} }
@ -43,7 +48,7 @@ func ApplyOwnership(ctx context.Context, runner system.CommandRunner, ext4File s
// Paths are prefixed with "/" so debugfs resolves them from the ext4 // Paths are prefixed with "/" so debugfs resolves them from the ext4
// root. Entries are sorted for deterministic output (helps testing and // root. Entries are sorted for deterministic output (helps testing and
// makes debugfs's internal caching slightly more cache-friendly). // makes debugfs's internal caching slightly more cache-friendly).
func buildOwnershipScript(meta Metadata) *bytes.Buffer { func buildOwnershipScript(meta Metadata) (*bytes.Buffer, error) {
var buf bytes.Buffer var buf bytes.Buffer
paths := make([]string, 0, len(meta.Entries)) paths := make([]string, 0, len(meta.Entries))
for p := range meta.Entries { for p := range meta.Entries {
@ -56,12 +61,15 @@ func buildOwnershipScript(meta Metadata) *bytes.Buffer {
if mode == 0 { if mode == 0 {
continue // hardlinks or unsupported types (skip) continue // hardlinks or unsupported types (skip)
} }
if err := validateDebugFSPath(p); err != nil {
return nil, err
}
escaped := escapeDebugfsPath(p) escaped := escapeDebugfsPath(p)
fmt.Fprintf(&buf, "set_inode_field %s uid %d\n", escaped, m.Uid) fmt.Fprintf(&buf, "set_inode_field %s uid %d\n", escaped, m.Uid)
fmt.Fprintf(&buf, "set_inode_field %s gid %d\n", escaped, m.Gid) fmt.Fprintf(&buf, "set_inode_field %s gid %d\n", escaped, m.Gid)
fmt.Fprintf(&buf, "set_inode_field %s mode 0%o\n", escaped, mode) fmt.Fprintf(&buf, "set_inode_field %s mode 0%o\n", escaped, mode)
} }
return &buf return &buf, nil
} }
// debugfsMode composes the full i_mode word (file-type bits + // debugfsMode composes the full i_mode word (file-type bits +
@ -87,27 +95,29 @@ func debugfsMode(typ byte, hdrMode int64) uint32 {
} }
} }
// escapeDebugfsPath prepends "/" and wraps in double quotes if the path var errUnsafeDebugFSPath = errors.New("unsafe path for debugfs ownership script")
// contains whitespace or special characters. debugfs' quoting is
// minimal; for safety we reject backslashes/quotes in paths entirely. func validateDebugFSPath(rel string) error {
func escapeDebugfsPath(rel string) string { for i := 0; i < len(rel); i++ {
abs := "/" + rel switch c := rel[i]; {
// Container images don't normally use quoting-hostile chars; if they case c == '"':
// do, fall back to the raw path and hope debugfs copes (it usually return fmt.Errorf("%w: %q contains '\"'", errUnsafeDebugFSPath, rel)
// does for spaces when quoted). case c == '\\':
needsQuote := false return fmt.Errorf("%w: %q contains '\\\\'", errUnsafeDebugFSPath, rel)
for _, c := range abs { case c < 0x20 || c == 0x7f:
switch c { return fmt.Errorf("%w: %q contains control byte 0x%02x", errUnsafeDebugFSPath, rel, c)
case ' ', '\t':
needsQuote = true
case '"', '\\', '\n':
// Deliberately unhandled; debugfs may fail on these.
// Returning the raw string gives us a visible error
// instead of a silently-corrupted script.
return abs
} }
} }
if needsQuote { return nil
}
// escapeDebugfsPath prepends "/" and wraps in double quotes if the path
// contains spaces. validateDebugFSPath rejects debugfs-hostile bytes
// before this runs, so the only quoting we need is the simple
// whitespace case debugfs already handles.
func escapeDebugfsPath(rel string) string {
abs := "/" + rel
if strings.ContainsRune(abs, ' ') {
return `"` + abs + `"` return `"` + abs + `"`
} }
return abs return abs

View file

@ -89,11 +89,10 @@ type VMSpec struct {
// VMRuntime holds the durable runtime state that the daemon needs // VMRuntime holds the durable runtime state that the daemon needs
// to reach a VM: identity, declared state, and deterministic derived // to reach a VM: identity, declared state, and deterministic derived
// paths. Transient kernel/process handles (PID, tap, loop devices, // paths. The authoritative live handle set still lives on VMHandles,
// dm-snapshot names) live on VMHandles, NOT here — the daemon keeps // but teardown-critical storage/network identifiers are mirrored here
// them in an in-memory cache backed by a per-VM handles.json scratch // as recovery fallbacks so restart-time cleanup still works when
// file, so a daemon restart rebuilds them from OS state rather than // handles.json is missing or corrupt.
// trusting whatever was last written into a SQLite column.
// //
// Everything in VMRuntime is safe to persist: the paths are // Everything in VMRuntime is safe to persist: the paths are
// deterministic from (VM ID, layout) and survive restart unchanged; // deterministic from (VM ID, layout) and survive restart unchanged;
@ -110,14 +109,15 @@ type VMRuntime struct {
MetricsPath string `json:"metrics_path,omitempty"` MetricsPath string `json:"metrics_path,omitempty"`
DNSName string `json:"dns_name,omitempty"` DNSName string `json:"dns_name,omitempty"`
VMDir string `json:"vm_dir"` VMDir string `json:"vm_dir"`
// TapDevice mirrors VMHandles.TapDevice but persists across // Teardown fallback fields mirror the handle cache onto the VM row.
// daemon restarts / handle-cache loss. NAT teardown needs the // They are recovery-only: while the daemon is alive, VMHandles stays
// exact tap name to delete the FORWARD rules; if we only had // authoritative. On restart, cleanup can fall back to these values if
// the handle cache, a crash between tap acquire and handles.json // handles.json is missing or corrupt.
// write — or a corrupt handles.json on the next daemon start —
// would silently leak the rules. Storing it on the VM record
// makes cleanup correct as long as the VM row exists.
TapDevice string `json:"tap_device,omitempty"` TapDevice string `json:"tap_device,omitempty"`
BaseLoop string `json:"base_loop,omitempty"`
COWLoop string `json:"cow_loop,omitempty"`
DMName string `json:"dm_name,omitempty"`
DMDev string `json:"dm_dev,omitempty"`
SystemOverlay string `json:"system_overlay_path"` SystemOverlay string `json:"system_overlay_path"`
WorkDiskPath string `json:"work_disk_path"` WorkDiskPath string `json:"work_disk_path"`
LastError string `json:"last_error,omitempty"` LastError string `json:"last_error,omitempty"`

View file

@ -3,11 +3,11 @@ package model
// VMHandles captures the transient, per-boot kernel/process handles // VMHandles captures the transient, per-boot kernel/process handles
// that banger obtains while starting a VM and releases when stopping // that banger obtains while starting a VM and releases when stopping
// it. Unlike VMRuntime (durable spec + identity + derived paths), // it. Unlike VMRuntime (durable spec + identity + derived paths),
// nothing in VMHandles survives a daemon restart in authoritative // VMHandles is the authoritative live-handle view while the daemon is
// form: each value is either rediscovered from the OS (PID from the // up. On restart, the daemon rebuilds it from the OS plus the per-VM
// firecracker api socket, DM name deterministically from the VM ID) // scratch file; teardown-critical fields are also mirrored onto
// or read from a per-VM scratch file that the daemon rebuilds at // VMRuntime so cleanup can still proceed if that scratch file is
// every start. // missing or corrupt.
// //
// The daemon keeps an in-memory cache keyed by VM ID. Lifecycle // The daemon keeps an in-memory cache keyed by VM ID. Lifecycle
// transitions update the cache and a small `handles.json` scratch // transitions update the cache and a small `handles.json` scratch
@ -16,10 +16,9 @@ package model
// OS state. If anything is stale the VM is marked stopped and the // OS state. If anything is stale the VM is marked stopped and the
// cache entry is dropped. // cache entry is dropped.
// //
// VMHandles never appears in the `vms` SQLite rows. Keeping it off // VMHandles itself never appears in the `vms` SQLite rows. Some fields
// the durable schema was the whole point of the split — persistent // are mirrored onto VMRuntime as crash-recovery fallback state, but the
// records describe what a VM SHOULD be; handles describe what is // cache + scratch file remain the canonical live source.
// currently true about it.
type VMHandles struct { type VMHandles struct {
// PID is the firecracker process PID. Zero means "not running // PID is the firecracker process PID. Zero means "not running
// (from our perspective)". Always verifiable via // (from our perspective)". Always verifiable via