daemon: persist teardown fallbacks and reject unsafe import paths
Preserve cleanup after daemon restarts and harden OCI and tar imports against filenames that debugfs cannot encode safely. Mirror tap, loop, and dm teardown identity onto VM.Runtime, teach cleanup and reconcile to fall back to those persisted fields when handles.json is missing or corrupt, and clear the recovery state on stop, error, and delete paths. Reject debugfs-hostile entry names during flattening and in ApplyOwnership itself, then add regression coverage for corrupt handles.json recovery and unsafe import paths. Verified with targeted go tests, make lint-go, make lint-shell, and make build.
This commit is contained in:
parent
86a56fedb3
commit
d743a8ba4b
15 changed files with 272 additions and 81 deletions
|
|
@ -28,7 +28,7 @@ banger image pull ghcr.io/myorg/devimg:v2 --kernel-ref generic-6.12
|
||||||
- Any public OCI image that exposes a `linux/amd64` manifest.
|
- Any public OCI image that exposes a `linux/amd64` manifest.
|
||||||
- Correct layer replay with whiteout semantics (`.wh.*` deletes,
|
- Correct layer replay with whiteout semantics (`.wh.*` deletes,
|
||||||
`.wh..wh..opq` opaque-dir markers).
|
`.wh..wh..opq` opaque-dir markers).
|
||||||
- Path-traversal and relative-symlink-escape protection.
|
- Path-traversal, debugfs-hostile filename, and relative-symlink-escape protection.
|
||||||
- Content-aware default sizing (`content × 1.5`, floor 1 GiB).
|
- Content-aware default sizing (`content × 1.5`, floor 1 GiB).
|
||||||
- Layer caching on disk, keyed by blob sha256.
|
- Layer caching on disk, keyed by blob sha256.
|
||||||
- **Ownership preservation** — tar-header uid/gid/mode captured
|
- **Ownership preservation** — tar-header uid/gid/mode captured
|
||||||
|
|
@ -67,8 +67,9 @@ banger image pull ghcr.io/myorg/devimg:v2 --kernel-ref generic-6.12
|
||||||
`linux/amd64` platform pinned. Layer blobs cache under
|
`linux/amd64` platform pinned. Layer blobs cache under
|
||||||
`~/.cache/banger/oci/blobs/` and populate lazily during flatten.
|
`~/.cache/banger/oci/blobs/` and populate lazily during flatten.
|
||||||
- **`Flatten`** replays layers oldest-first into a staging directory,
|
- **`Flatten`** replays layers oldest-first into a staging directory,
|
||||||
applies whiteouts, rejects unsafe paths. Returns a `Metadata` map
|
applies whiteouts, rejects unsafe paths plus filenames that banger's
|
||||||
of per-file uid/gid/mode from tar headers.
|
debugfs ownership fixup cannot encode safely. Returns a `Metadata`
|
||||||
|
map of per-file uid/gid/mode from tar headers.
|
||||||
- **`BuildExt4`** runs `mkfs.ext4 -F -d <staging> -E root_owner=0:0`
|
- **`BuildExt4`** runs `mkfs.ext4 -F -d <staging> -E root_owner=0:0`
|
||||||
at the size of the pre-truncated file — no mount, no sudo, no
|
at the size of the pre-truncated file — no mount, no sudo, no
|
||||||
loopback. Requires `e2fsprogs ≥ 1.43`.
|
loopback. Requires `e2fsprogs ≥ 1.43`.
|
||||||
|
|
|
||||||
|
|
@ -323,7 +323,7 @@ func (d *Daemon) reconcile(ctx context.Context) error {
|
||||||
_ = d.vm.cleanupRuntime(ctx, vm, true)
|
_ = d.vm.cleanupRuntime(ctx, vm, true)
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
vm.Runtime.TapDevice = ""
|
clearRuntimeTeardownState(&vm)
|
||||||
d.vm.clearVMHandles(vm)
|
d.vm.clearVMHandles(vm)
|
||||||
vm.UpdatedAt = model.Now()
|
vm.UpdatedAt = model.Now()
|
||||||
return d.store.UpsertVM(ctx, vm)
|
return d.store.UpsertVM(ctx, vm)
|
||||||
|
|
|
||||||
|
|
@ -216,7 +216,7 @@ func (s *StatsService) stopStaleVMs(ctx context.Context) (err error) {
|
||||||
_ = s.cleanupRuntime(ctx, vm, true)
|
_ = s.cleanupRuntime(ctx, vm, true)
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
vm.Runtime.TapDevice = ""
|
clearRuntimeTeardownState(&vm)
|
||||||
vm.UpdatedAt = model.Now()
|
vm.UpdatedAt = model.Now()
|
||||||
return s.store.UpsertVM(ctx, vm)
|
return s.store.UpsertVM(ctx, vm)
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
|
|
|
||||||
|
|
@ -52,6 +52,48 @@ func (s *VMService) rebuildDNS(ctx context.Context) error {
|
||||||
return s.net.replaceDNS(records)
|
return s.net.replaceDNS(records)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func persistRuntimeTeardownState(vm *model.VMRecord, h model.VMHandles) {
|
||||||
|
if vm == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
vm.Runtime.TapDevice = h.TapDevice
|
||||||
|
vm.Runtime.BaseLoop = h.BaseLoop
|
||||||
|
vm.Runtime.COWLoop = h.COWLoop
|
||||||
|
vm.Runtime.DMName = h.DMName
|
||||||
|
vm.Runtime.DMDev = h.DMDev
|
||||||
|
}
|
||||||
|
|
||||||
|
func clearRuntimeTeardownState(vm *model.VMRecord) {
|
||||||
|
if vm == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
vm.Runtime.TapDevice = ""
|
||||||
|
vm.Runtime.BaseLoop = ""
|
||||||
|
vm.Runtime.COWLoop = ""
|
||||||
|
vm.Runtime.DMName = ""
|
||||||
|
vm.Runtime.DMDev = ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func teardownHandlesForCleanup(vm model.VMRecord, live model.VMHandles) model.VMHandles {
|
||||||
|
recovered := live
|
||||||
|
if strings.TrimSpace(recovered.TapDevice) == "" {
|
||||||
|
recovered.TapDevice = strings.TrimSpace(vm.Runtime.TapDevice)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(recovered.BaseLoop) == "" {
|
||||||
|
recovered.BaseLoop = strings.TrimSpace(vm.Runtime.BaseLoop)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(recovered.COWLoop) == "" {
|
||||||
|
recovered.COWLoop = strings.TrimSpace(vm.Runtime.COWLoop)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(recovered.DMName) == "" {
|
||||||
|
recovered.DMName = strings.TrimSpace(vm.Runtime.DMName)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(recovered.DMDev) == "" {
|
||||||
|
recovered.DMDev = strings.TrimSpace(vm.Runtime.DMDev)
|
||||||
|
}
|
||||||
|
return recovered
|
||||||
|
}
|
||||||
|
|
||||||
// cleanupRuntime tears down the host-side state for a VM: firecracker
|
// cleanupRuntime tears down the host-side state for a VM: firecracker
|
||||||
// process, DM snapshot, capabilities, tap, sockets. Lives on VMService
|
// process, DM snapshot, capabilities, tap, sockets. Lives on VMService
|
||||||
// because it reaches into handles (VMService-owned); the capability
|
// because it reaches into handles (VMService-owned); the capability
|
||||||
|
|
@ -74,22 +116,19 @@ func (s *VMService) cleanupRuntime(ctx context.Context, vm model.VMRecord, prese
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
handles := teardownHandlesForCleanup(vm, h)
|
||||||
snapshotErr := s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
|
snapshotErr := s.net.cleanupDMSnapshot(ctx, dmSnapshotHandles{
|
||||||
BaseLoop: h.BaseLoop,
|
BaseLoop: handles.BaseLoop,
|
||||||
COWLoop: h.COWLoop,
|
COWLoop: handles.COWLoop,
|
||||||
DMName: h.DMName,
|
DMName: handles.DMName,
|
||||||
DMDev: h.DMDev,
|
DMDev: handles.DMDev,
|
||||||
})
|
})
|
||||||
featureErr := s.capHooks.cleanupState(ctx, vm)
|
featureErr := s.capHooks.cleanupState(ctx, vm)
|
||||||
var tapErr error
|
var tapErr error
|
||||||
// Prefer the handle cache (fresh from startVMLocked), but fall
|
// Prefer the handle cache (fresh from startVMLocked), but fall
|
||||||
// back to Runtime.TapDevice — persisted to the DB in the same
|
// back to the VMRuntime mirrors so restart-time cleanup still works
|
||||||
// stage — so a daemon restart or corrupt handles.json doesn't
|
// when handles.json is missing or corrupt.
|
||||||
// leak the tap (or the NAT FORWARD rules keyed off it).
|
tap := handles.TapDevice
|
||||||
tap := h.TapDevice
|
|
||||||
if tap == "" {
|
|
||||||
tap = vm.Runtime.TapDevice
|
|
||||||
}
|
|
||||||
if tap != "" {
|
if tap != "" {
|
||||||
tapErr = s.net.releaseTap(ctx, tap)
|
tapErr = s.net.releaseTap(ctx, tap)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -124,7 +124,8 @@ func (s *VMService) setVMHandlesInMemory(vmID string, h model.VMHandles) {
|
||||||
|
|
||||||
// vmHandles returns the cached handles for vm (zero-value if no
|
// vmHandles returns the cached handles for vm (zero-value if no
|
||||||
// entry). The in-process handle cache is the authoritative source
|
// entry). The in-process handle cache is the authoritative source
|
||||||
// for PID / loops / dm-name — VMRecord.Runtime holds only paths.
|
// for PID and live kernel/network handles; VMRecord.Runtime only
|
||||||
|
// mirrors teardown-critical fields for restart recovery.
|
||||||
func (s *VMService) vmHandles(vmID string) model.VMHandles {
|
func (s *VMService) vmHandles(vmID string) model.VMHandles {
|
||||||
if s == nil {
|
if s == nil {
|
||||||
return model.VMHandles{}
|
return model.VMHandles{}
|
||||||
|
|
@ -134,13 +135,15 @@ func (s *VMService) vmHandles(vmID string) model.VMHandles {
|
||||||
return h
|
return h
|
||||||
}
|
}
|
||||||
|
|
||||||
// setVMHandles updates the in-memory cache AND the per-VM scratch
|
// setVMHandles updates the in-memory cache, mirrors teardown-critical
|
||||||
// file. Scratch-file errors are logged but not returned; the cache
|
// fields onto VMRuntime, and writes the per-VM scratch file.
|
||||||
// write is authoritative while the daemon is alive.
|
// Scratch-file errors are logged but not returned; the cache remains
|
||||||
func (s *VMService) setVMHandles(vm model.VMRecord, h model.VMHandles) {
|
// authoritative while the daemon is alive.
|
||||||
if s == nil {
|
func (s *VMService) setVMHandles(vm *model.VMRecord, h model.VMHandles) {
|
||||||
|
if s == nil || vm == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
persistRuntimeTeardownState(vm, h)
|
||||||
s.ensureHandleCache()
|
s.ensureHandleCache()
|
||||||
s.handles.set(vm.ID, h)
|
s.handles.set(vm.ID, h)
|
||||||
if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && s.logger != nil {
|
if err := writeHandlesFile(vm.Runtime.VMDir, h); err != nil && s.logger != nil {
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,30 @@ func TestHandlesFileRoundtrip(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSetVMHandlesMirrorsRuntimeTeardownState(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
d := &Daemon{}
|
||||||
|
wireServices(d)
|
||||||
|
|
||||||
|
vmDir := t.TempDir()
|
||||||
|
vm := testVM("mirror", "image-mirror", "172.16.0.77")
|
||||||
|
vm.Runtime.VMDir = vmDir
|
||||||
|
|
||||||
|
want := model.VMHandles{
|
||||||
|
TapDevice: "tap-fc-0077",
|
||||||
|
BaseLoop: "/dev/loop17",
|
||||||
|
COWLoop: "/dev/loop18",
|
||||||
|
DMName: "fc-rootfs-0077",
|
||||||
|
DMDev: "/dev/mapper/fc-rootfs-0077",
|
||||||
|
}
|
||||||
|
d.vm.setVMHandles(&vm, want)
|
||||||
|
|
||||||
|
if vm.Runtime.TapDevice != want.TapDevice || vm.Runtime.BaseLoop != want.BaseLoop || vm.Runtime.COWLoop != want.COWLoop || vm.Runtime.DMName != want.DMName || vm.Runtime.DMDev != want.DMDev {
|
||||||
|
t.Fatalf("runtime teardown state not mirrored: got %+v want %+v", vm.Runtime, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandlesFileMissingReturnsZero(t *testing.T) {
|
func TestHandlesFileMissingReturnsZero(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
h, present, err := readHandlesFile(t.TempDir())
|
h, present, err := readHandlesFile(t.TempDir())
|
||||||
|
|
|
||||||
|
|
@ -80,7 +80,7 @@ func (s *VMService) startVMLocked(ctx context.Context, vm model.VMRecord, image
|
||||||
vm.State = model.VMStateError
|
vm.State = model.VMStateError
|
||||||
vm.Runtime.State = model.VMStateError
|
vm.Runtime.State = model.VMStateError
|
||||||
vm.Runtime.LastError = runErr.Error()
|
vm.Runtime.LastError = runErr.Error()
|
||||||
vm.Runtime.TapDevice = ""
|
clearRuntimeTeardownState(&vm)
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
if s.store != nil {
|
if s.store != nil {
|
||||||
_ = s.store.UpsertVM(context.Background(), vm)
|
_ = s.store.UpsertVM(context.Background(), vm)
|
||||||
|
|
@ -113,7 +113,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
vm.Runtime.TapDevice = ""
|
clearRuntimeTeardownState(&vm)
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
@ -138,7 +138,7 @@ func (s *VMService) stopVMLocked(ctx context.Context, current model.VMRecord) (v
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
vm.Runtime.TapDevice = ""
|
clearRuntimeTeardownState(&vm)
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
system.TouchNow(&vm)
|
system.TouchNow(&vm)
|
||||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||||
|
|
@ -170,7 +170,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
vm.Runtime.TapDevice = ""
|
clearRuntimeTeardownState(&vm)
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
@ -200,7 +200,7 @@ func (s *VMService) killVMLocked(ctx context.Context, current model.VMRecord, si
|
||||||
}
|
}
|
||||||
vm.State = model.VMStateStopped
|
vm.State = model.VMStateStopped
|
||||||
vm.Runtime.State = model.VMStateStopped
|
vm.Runtime.State = model.VMStateStopped
|
||||||
vm.Runtime.TapDevice = ""
|
clearRuntimeTeardownState(&vm)
|
||||||
s.clearVMHandles(vm)
|
s.clearVMHandles(vm)
|
||||||
system.TouchNow(&vm)
|
system.TouchNow(&vm)
|
||||||
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
if err := s.store.UpsertVM(ctx, vm); err != nil {
|
||||||
|
|
@ -262,6 +262,7 @@ func (s *VMService) deleteVMLocked(ctx context.Context, current model.VMRecord)
|
||||||
if err := s.cleanupRuntime(ctx, vm, false); err != nil {
|
if err := s.cleanupRuntime(ctx, vm, false); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
}
|
}
|
||||||
|
clearRuntimeTeardownState(&vm)
|
||||||
op.stage("delete_store_record")
|
op.stage("delete_store_record")
|
||||||
if err := s.store.DeleteVM(ctx, vm.ID); err != nil {
|
if err := s.store.DeleteVM(ctx, vm.ID); err != nil {
|
||||||
return model.VMRecord{}, err
|
return model.VMRecord{}, err
|
||||||
|
|
|
||||||
|
|
@ -213,7 +213,7 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS
|
||||||
sc.live.COWLoop = snapHandles.COWLoop
|
sc.live.COWLoop = snapHandles.COWLoop
|
||||||
sc.live.DMName = snapHandles.DMName
|
sc.live.DMName = snapHandles.DMName
|
||||||
sc.live.DMDev = snapHandles.DMDev
|
sc.live.DMDev = snapHandles.DMDev
|
||||||
s.setVMHandles(*sc.vm, *sc.live)
|
s.setVMHandles(sc.vm, *sc.live)
|
||||||
// Fields that used to land next to the (now-deleted)
|
// Fields that used to land next to the (now-deleted)
|
||||||
// cleanupOnErr closure. They belong with the DM
|
// cleanupOnErr closure. They belong with the DM
|
||||||
// snapshot because that's the first step producing
|
// snapshot because that's the first step producing
|
||||||
|
|
@ -282,10 +282,7 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
sc.live.TapDevice = tap
|
sc.live.TapDevice = tap
|
||||||
s.setVMHandles(*sc.vm, *sc.live)
|
s.setVMHandles(sc.vm, *sc.live)
|
||||||
// Mirror onto VM.Runtime for NAT teardown resilience
|
|
||||||
// across daemon crashes — see vm.Runtime.TapDevice docs.
|
|
||||||
sc.vm.Runtime.TapDevice = tap
|
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
undo: func(ctx context.Context, sc *startContext) error {
|
undo: func(ctx context.Context, sc *startContext) error {
|
||||||
|
|
@ -360,11 +357,11 @@ func (s *VMService) buildStartSteps(op *operationLog, sc *startContext) []startS
|
||||||
// PID so the undo can kill it; use a fresh ctx since
|
// PID so the undo can kill it; use a fresh ctx since
|
||||||
// the request ctx may be cancelled by now.
|
// the request ctx may be cancelled by now.
|
||||||
sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
|
sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
|
||||||
s.setVMHandles(*sc.vm, *sc.live)
|
s.setVMHandles(sc.vm, *sc.live)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
|
sc.live.PID = s.net.resolveFirecrackerPID(context.Background(), machine, sc.apiSock)
|
||||||
s.setVMHandles(*sc.vm, *sc.live)
|
s.setVMHandles(sc.vm, *sc.live)
|
||||||
op.debugStage("firecracker_started", "pid", sc.live.PID)
|
op.debugStage("firecracker_started", "pid", sc.live.PID)
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -175,6 +175,66 @@ func TestReconcileStopsStaleRunningVMAndClearsRuntimeHandles(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestReconcileWithCorruptHandlesFileFallsBackToPersistedRuntimeTeardownState(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
ctx := context.Background()
|
||||||
|
db := openDaemonStore(t)
|
||||||
|
apiSock := filepath.Join(t.TempDir(), "fc.sock")
|
||||||
|
if err := os.WriteFile(apiSock, []byte{}, 0o644); err != nil {
|
||||||
|
t.Fatalf("WriteFile(api sock): %v", err)
|
||||||
|
}
|
||||||
|
vmDir := t.TempDir()
|
||||||
|
vm := testVM("corrupt", "image-corrupt", "172.16.0.10")
|
||||||
|
vm.State = model.VMStateRunning
|
||||||
|
vm.Runtime.State = model.VMStateRunning
|
||||||
|
vm.Runtime.APISockPath = apiSock
|
||||||
|
vm.Runtime.VMDir = vmDir
|
||||||
|
vm.Runtime.DNSName = ""
|
||||||
|
vm.Runtime.TapDevice = "tap-fc-corrupt"
|
||||||
|
vm.Runtime.BaseLoop = "/dev/loop20"
|
||||||
|
vm.Runtime.COWLoop = "/dev/loop21"
|
||||||
|
vm.Runtime.DMName = "fc-rootfs-corrupt"
|
||||||
|
vm.Runtime.DMDev = "/dev/mapper/fc-rootfs-corrupt"
|
||||||
|
upsertDaemonVM(t, ctx, db, vm)
|
||||||
|
|
||||||
|
if err := os.WriteFile(handlesFilePath(vmDir), []byte("{not json"), 0o600); err != nil {
|
||||||
|
t.Fatalf("WriteFile(handles.json): %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
runner := &scriptedRunner{
|
||||||
|
t: t,
|
||||||
|
steps: []runnerStep{
|
||||||
|
{call: runnerCall{name: "pgrep", args: []string{"-n", "-f", apiSock}}, err: errors.New("exit status 1")},
|
||||||
|
sudoStep("", nil, "dmsetup", "remove", "fc-rootfs-corrupt"),
|
||||||
|
sudoStep("", nil, "losetup", "-d", "/dev/loop21"),
|
||||||
|
sudoStep("", nil, "losetup", "-d", "/dev/loop20"),
|
||||||
|
sudoStep("", nil, "ip", "link", "del", "tap-fc-corrupt"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
d := &Daemon{store: db, runner: runner}
|
||||||
|
wireServices(d)
|
||||||
|
|
||||||
|
if err := d.reconcile(ctx); err != nil {
|
||||||
|
t.Fatalf("reconcile: %v", err)
|
||||||
|
}
|
||||||
|
runner.assertExhausted()
|
||||||
|
|
||||||
|
got, err := db.GetVM(ctx, vm.ID)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("GetVM: %v", err)
|
||||||
|
}
|
||||||
|
if got.State != model.VMStateStopped || got.Runtime.State != model.VMStateStopped {
|
||||||
|
t.Fatalf("vm state after reconcile = %s/%s, want stopped", got.State, got.Runtime.State)
|
||||||
|
}
|
||||||
|
if got.Runtime.TapDevice != "" || got.Runtime.BaseLoop != "" || got.Runtime.COWLoop != "" || got.Runtime.DMName != "" || got.Runtime.DMDev != "" {
|
||||||
|
t.Fatalf("runtime teardown state not cleared after reconcile: %+v", got.Runtime)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(handlesFilePath(vmDir)); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("handles.json still present after reconcile: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRebuildDNSIncludesOnlyLiveRunningVMs(t *testing.T) {
|
func TestRebuildDNSIncludesOnlyLiveRunningVMs(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -138,6 +138,9 @@ func applyEntry(tr *tar.Reader, hdr *tar.Header, dest string, meta *Metadata) er
|
||||||
if filepath.IsAbs(rel) || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
|
if filepath.IsAbs(rel) || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
|
||||||
return fmt.Errorf("unsafe path in layer: %q", hdr.Name)
|
return fmt.Errorf("unsafe path in layer: %q", hdr.Name)
|
||||||
}
|
}
|
||||||
|
if err := validateDebugFSPath(rel); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
base := filepath.Base(rel)
|
base := filepath.Base(rel)
|
||||||
parent := filepath.Dir(rel)
|
parent := filepath.Dir(rel)
|
||||||
|
|
|
||||||
|
|
@ -10,9 +10,9 @@
|
||||||
// and returns a v1.Image whose layer blobs are cached on disk under
|
// and returns a v1.Image whose layer blobs are cached on disk under
|
||||||
// cacheDir/blobs/sha256/<hex> so re-pulls are local.
|
// cacheDir/blobs/sha256/<hex> so re-pulls are local.
|
||||||
// - Flatten replays the layers in order into a staging directory,
|
// - Flatten replays the layers in order into a staging directory,
|
||||||
// applies whiteouts, rejects unsafe paths/symlinks, and returns
|
// applies whiteouts, rejects unsafe paths/symlinks plus filenames
|
||||||
// Metadata capturing the original tar-header uid/gid/mode for
|
// that debugfs can't represent safely, and returns Metadata
|
||||||
// every entry.
|
// capturing the original tar-header uid/gid/mode for every entry.
|
||||||
// - BuildExt4 turns the staging directory into an ext4 file via
|
// - BuildExt4 turns the staging directory into an ext4 file via
|
||||||
// `mkfs.ext4 -F -d` (no mount, no sudo). Root-owns the filesystem
|
// `mkfs.ext4 -F -d` (no mount, no sudo). Root-owns the filesystem
|
||||||
// via `-E root_owner=0:0`.
|
// via `-E root_owner=0:0`.
|
||||||
|
|
|
||||||
|
|
@ -254,6 +254,30 @@ func TestFlattenRejectsPathTraversal(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFlattenRejectsDebugFSHostilePath(t *testing.T) {
|
||||||
|
img, err := mutate.AppendLayers(empty.Image,
|
||||||
|
makeLayer(t, []tarMember{
|
||||||
|
{name: `etc/bad"name`, body: []byte("bad")},
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("AppendLayers: %v", err)
|
||||||
|
}
|
||||||
|
pulled := PulledImage{
|
||||||
|
Reference: "test/debugfs-hostile",
|
||||||
|
Digest: "sha256:test",
|
||||||
|
Platform: "linux/amd64",
|
||||||
|
Image: img,
|
||||||
|
}
|
||||||
|
_, err = Flatten(context.Background(), pulled, t.TempDir())
|
||||||
|
if !errors.Is(err, errUnsafeDebugFSPath) {
|
||||||
|
t.Fatalf("Flatten hostile path: err=%v, want %v", err, errUnsafeDebugFSPath)
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), `etc/bad\"name`) {
|
||||||
|
t.Fatalf("Flatten hostile path: err=%v, want offending path", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestFlattenAcceptsAbsoluteSymlink(t *testing.T) {
|
func TestFlattenAcceptsAbsoluteSymlink(t *testing.T) {
|
||||||
// Container layers regularly contain absolute symlinks like
|
// Container layers regularly contain absolute symlinks like
|
||||||
// /usr/bin/mawk — they're interpreted relative to the rootfs at
|
// /usr/bin/mawk — they're interpreted relative to the rootfs at
|
||||||
|
|
@ -303,6 +327,19 @@ func TestFlattenRejectsRelativeSymlinkEscape(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFlattenTarRejectsDebugFSHostilePath(t *testing.T) {
|
||||||
|
tarData := buildTar(t, []tarMember{
|
||||||
|
{name: "etc/bad\tname", body: []byte("bad")},
|
||||||
|
})
|
||||||
|
_, err := FlattenTar(context.Background(), bytes.NewReader(tarData), t.TempDir())
|
||||||
|
if !errors.Is(err, errUnsafeDebugFSPath) {
|
||||||
|
t.Fatalf("FlattenTar hostile path: err=%v, want %v", err, errUnsafeDebugFSPath)
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), `etc/bad\tname`) {
|
||||||
|
t.Fatalf("FlattenTar hostile path: err=%v, want offending path", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuildExt4ProducesValidImage(t *testing.T) {
|
func TestBuildExt4ProducesValidImage(t *testing.T) {
|
||||||
if _, err := exec.LookPath("mkfs.ext4"); err != nil {
|
if _, err := exec.LookPath("mkfs.ext4"); err != nil {
|
||||||
t.Skip("mkfs.ext4 not available; skipping")
|
t.Skip("mkfs.ext4 not available; skipping")
|
||||||
|
|
@ -412,13 +449,30 @@ func TestApplyOwnershipRewritesUidGidMode(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestApplyOwnershipRejectsUnsafeMetadataPath(t *testing.T) {
|
||||||
|
meta := Metadata{Entries: map[string]FileMeta{
|
||||||
|
"bad\nname": {Uid: 0, Gid: 0, Mode: 0o644, Type: tar.TypeReg},
|
||||||
|
}}
|
||||||
|
err := ApplyOwnership(context.Background(), system.NewRunner(), filepath.Join(t.TempDir(), "rootfs.ext4"), meta)
|
||||||
|
if !errors.Is(err, errUnsafeDebugFSPath) {
|
||||||
|
t.Fatalf("ApplyOwnership hostile path: err=%v, want %v", err, errUnsafeDebugFSPath)
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), `bad\nname`) {
|
||||||
|
t.Fatalf("ApplyOwnership hostile path: err=%v, want offending path", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuildOwnershipScriptDeterministic(t *testing.T) {
|
func TestBuildOwnershipScriptDeterministic(t *testing.T) {
|
||||||
meta := Metadata{Entries: map[string]FileMeta{
|
meta := Metadata{Entries: map[string]FileMeta{
|
||||||
"b": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg},
|
"b": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg},
|
||||||
"a": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg},
|
"a": {Uid: 0, Gid: 0, Mode: 0o755, Type: tar.TypeReg},
|
||||||
"a/x": {Uid: 0, Gid: 0, Mode: 0o644, Type: tar.TypeReg},
|
"a/x": {Uid: 0, Gid: 0, Mode: 0o644, Type: tar.TypeReg},
|
||||||
}}
|
}}
|
||||||
got := buildOwnershipScript(meta).String()
|
gotBuf, err := buildOwnershipScript(meta)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("buildOwnershipScript: %v", err)
|
||||||
|
}
|
||||||
|
got := gotBuf.String()
|
||||||
// sorted: a, a/x, b
|
// sorted: a, a/x, b
|
||||||
want := "set_inode_field /a uid 0\nset_inode_field /a gid 0\nset_inode_field /a mode 0100755\n" +
|
want := "set_inode_field /a uid 0\nset_inode_field /a gid 0\nset_inode_field /a mode 0100755\n" +
|
||||||
"set_inode_field /a/x uid 0\nset_inode_field /a/x gid 0\nset_inode_field /a/x mode 0100644\n" +
|
"set_inode_field /a/x uid 0\nset_inode_field /a/x gid 0\nset_inode_field /a/x mode 0100644\n" +
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,10 @@ import (
|
||||||
"archive/tar"
|
"archive/tar"
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"banger/internal/system"
|
"banger/internal/system"
|
||||||
)
|
)
|
||||||
|
|
@ -24,7 +26,10 @@ func ApplyOwnership(ctx context.Context, runner system.CommandRunner, ext4File s
|
||||||
if len(meta.Entries) == 0 {
|
if len(meta.Entries) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
script := buildOwnershipScript(meta)
|
script, err := buildOwnershipScript(meta)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
if script.Len() == 0 {
|
if script.Len() == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
@ -43,7 +48,7 @@ func ApplyOwnership(ctx context.Context, runner system.CommandRunner, ext4File s
|
||||||
// Paths are prefixed with "/" so debugfs resolves them from the ext4
|
// Paths are prefixed with "/" so debugfs resolves them from the ext4
|
||||||
// root. Entries are sorted for deterministic output (helps testing and
|
// root. Entries are sorted for deterministic output (helps testing and
|
||||||
// makes debugfs's internal caching slightly more cache-friendly).
|
// makes debugfs's internal caching slightly more cache-friendly).
|
||||||
func buildOwnershipScript(meta Metadata) *bytes.Buffer {
|
func buildOwnershipScript(meta Metadata) (*bytes.Buffer, error) {
|
||||||
var buf bytes.Buffer
|
var buf bytes.Buffer
|
||||||
paths := make([]string, 0, len(meta.Entries))
|
paths := make([]string, 0, len(meta.Entries))
|
||||||
for p := range meta.Entries {
|
for p := range meta.Entries {
|
||||||
|
|
@ -56,12 +61,15 @@ func buildOwnershipScript(meta Metadata) *bytes.Buffer {
|
||||||
if mode == 0 {
|
if mode == 0 {
|
||||||
continue // hardlinks or unsupported types (skip)
|
continue // hardlinks or unsupported types (skip)
|
||||||
}
|
}
|
||||||
|
if err := validateDebugFSPath(p); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
escaped := escapeDebugfsPath(p)
|
escaped := escapeDebugfsPath(p)
|
||||||
fmt.Fprintf(&buf, "set_inode_field %s uid %d\n", escaped, m.Uid)
|
fmt.Fprintf(&buf, "set_inode_field %s uid %d\n", escaped, m.Uid)
|
||||||
fmt.Fprintf(&buf, "set_inode_field %s gid %d\n", escaped, m.Gid)
|
fmt.Fprintf(&buf, "set_inode_field %s gid %d\n", escaped, m.Gid)
|
||||||
fmt.Fprintf(&buf, "set_inode_field %s mode 0%o\n", escaped, mode)
|
fmt.Fprintf(&buf, "set_inode_field %s mode 0%o\n", escaped, mode)
|
||||||
}
|
}
|
||||||
return &buf
|
return &buf, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// debugfsMode composes the full i_mode word (file-type bits +
|
// debugfsMode composes the full i_mode word (file-type bits +
|
||||||
|
|
@ -87,27 +95,29 @@ func debugfsMode(typ byte, hdrMode int64) uint32 {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// escapeDebugfsPath prepends "/" and wraps in double quotes if the path
|
var errUnsafeDebugFSPath = errors.New("unsafe path for debugfs ownership script")
|
||||||
// contains whitespace or special characters. debugfs' quoting is
|
|
||||||
// minimal; for safety we reject backslashes/quotes in paths entirely.
|
func validateDebugFSPath(rel string) error {
|
||||||
func escapeDebugfsPath(rel string) string {
|
for i := 0; i < len(rel); i++ {
|
||||||
abs := "/" + rel
|
switch c := rel[i]; {
|
||||||
// Container images don't normally use quoting-hostile chars; if they
|
case c == '"':
|
||||||
// do, fall back to the raw path and hope debugfs copes (it usually
|
return fmt.Errorf("%w: %q contains '\"'", errUnsafeDebugFSPath, rel)
|
||||||
// does for spaces when quoted).
|
case c == '\\':
|
||||||
needsQuote := false
|
return fmt.Errorf("%w: %q contains '\\\\'", errUnsafeDebugFSPath, rel)
|
||||||
for _, c := range abs {
|
case c < 0x20 || c == 0x7f:
|
||||||
switch c {
|
return fmt.Errorf("%w: %q contains control byte 0x%02x", errUnsafeDebugFSPath, rel, c)
|
||||||
case ' ', '\t':
|
|
||||||
needsQuote = true
|
|
||||||
case '"', '\\', '\n':
|
|
||||||
// Deliberately unhandled; debugfs may fail on these.
|
|
||||||
// Returning the raw string gives us a visible error
|
|
||||||
// instead of a silently-corrupted script.
|
|
||||||
return abs
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if needsQuote {
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// escapeDebugfsPath prepends "/" and wraps in double quotes if the path
|
||||||
|
// contains spaces. validateDebugFSPath rejects debugfs-hostile bytes
|
||||||
|
// before this runs, so the only quoting we need is the simple
|
||||||
|
// whitespace case debugfs already handles.
|
||||||
|
func escapeDebugfsPath(rel string) string {
|
||||||
|
abs := "/" + rel
|
||||||
|
if strings.ContainsRune(abs, ' ') {
|
||||||
return `"` + abs + `"`
|
return `"` + abs + `"`
|
||||||
}
|
}
|
||||||
return abs
|
return abs
|
||||||
|
|
|
||||||
|
|
@ -89,11 +89,10 @@ type VMSpec struct {
|
||||||
|
|
||||||
// VMRuntime holds the durable runtime state that the daemon needs
|
// VMRuntime holds the durable runtime state that the daemon needs
|
||||||
// to reach a VM: identity, declared state, and deterministic derived
|
// to reach a VM: identity, declared state, and deterministic derived
|
||||||
// paths. Transient kernel/process handles (PID, tap, loop devices,
|
// paths. The authoritative live handle set still lives on VMHandles,
|
||||||
// dm-snapshot names) live on VMHandles, NOT here — the daemon keeps
|
// but teardown-critical storage/network identifiers are mirrored here
|
||||||
// them in an in-memory cache backed by a per-VM handles.json scratch
|
// as recovery fallbacks so restart-time cleanup still works when
|
||||||
// file, so a daemon restart rebuilds them from OS state rather than
|
// handles.json is missing or corrupt.
|
||||||
// trusting whatever was last written into a SQLite column.
|
|
||||||
//
|
//
|
||||||
// Everything in VMRuntime is safe to persist: the paths are
|
// Everything in VMRuntime is safe to persist: the paths are
|
||||||
// deterministic from (VM ID, layout) and survive restart unchanged;
|
// deterministic from (VM ID, layout) and survive restart unchanged;
|
||||||
|
|
@ -110,14 +109,15 @@ type VMRuntime struct {
|
||||||
MetricsPath string `json:"metrics_path,omitempty"`
|
MetricsPath string `json:"metrics_path,omitempty"`
|
||||||
DNSName string `json:"dns_name,omitempty"`
|
DNSName string `json:"dns_name,omitempty"`
|
||||||
VMDir string `json:"vm_dir"`
|
VMDir string `json:"vm_dir"`
|
||||||
// TapDevice mirrors VMHandles.TapDevice but persists across
|
// Teardown fallback fields mirror the handle cache onto the VM row.
|
||||||
// daemon restarts / handle-cache loss. NAT teardown needs the
|
// They are recovery-only: while the daemon is alive, VMHandles stays
|
||||||
// exact tap name to delete the FORWARD rules; if we only had
|
// authoritative. On restart, cleanup can fall back to these values if
|
||||||
// the handle cache, a crash between tap acquire and handles.json
|
// handles.json is missing or corrupt.
|
||||||
// write — or a corrupt handles.json on the next daemon start —
|
|
||||||
// would silently leak the rules. Storing it on the VM record
|
|
||||||
// makes cleanup correct as long as the VM row exists.
|
|
||||||
TapDevice string `json:"tap_device,omitempty"`
|
TapDevice string `json:"tap_device,omitempty"`
|
||||||
|
BaseLoop string `json:"base_loop,omitempty"`
|
||||||
|
COWLoop string `json:"cow_loop,omitempty"`
|
||||||
|
DMName string `json:"dm_name,omitempty"`
|
||||||
|
DMDev string `json:"dm_dev,omitempty"`
|
||||||
SystemOverlay string `json:"system_overlay_path"`
|
SystemOverlay string `json:"system_overlay_path"`
|
||||||
WorkDiskPath string `json:"work_disk_path"`
|
WorkDiskPath string `json:"work_disk_path"`
|
||||||
LastError string `json:"last_error,omitempty"`
|
LastError string `json:"last_error,omitempty"`
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,11 @@ package model
|
||||||
// VMHandles captures the transient, per-boot kernel/process handles
|
// VMHandles captures the transient, per-boot kernel/process handles
|
||||||
// that banger obtains while starting a VM and releases when stopping
|
// that banger obtains while starting a VM and releases when stopping
|
||||||
// it. Unlike VMRuntime (durable spec + identity + derived paths),
|
// it. Unlike VMRuntime (durable spec + identity + derived paths),
|
||||||
// nothing in VMHandles survives a daemon restart in authoritative
|
// VMHandles is the authoritative live-handle view while the daemon is
|
||||||
// form: each value is either rediscovered from the OS (PID from the
|
// up. On restart, the daemon rebuilds it from the OS plus the per-VM
|
||||||
// firecracker api socket, DM name deterministically from the VM ID)
|
// scratch file; teardown-critical fields are also mirrored onto
|
||||||
// or read from a per-VM scratch file that the daemon rebuilds at
|
// VMRuntime so cleanup can still proceed if that scratch file is
|
||||||
// every start.
|
// missing or corrupt.
|
||||||
//
|
//
|
||||||
// The daemon keeps an in-memory cache keyed by VM ID. Lifecycle
|
// The daemon keeps an in-memory cache keyed by VM ID. Lifecycle
|
||||||
// transitions update the cache and a small `handles.json` scratch
|
// transitions update the cache and a small `handles.json` scratch
|
||||||
|
|
@ -16,10 +16,9 @@ package model
|
||||||
// OS state. If anything is stale the VM is marked stopped and the
|
// OS state. If anything is stale the VM is marked stopped and the
|
||||||
// cache entry is dropped.
|
// cache entry is dropped.
|
||||||
//
|
//
|
||||||
// VMHandles never appears in the `vms` SQLite rows. Keeping it off
|
// VMHandles itself never appears in the `vms` SQLite rows. Some fields
|
||||||
// the durable schema was the whole point of the split — persistent
|
// are mirrored onto VMRuntime as crash-recovery fallback state, but the
|
||||||
// records describe what a VM SHOULD be; handles describe what is
|
// cache + scratch file remain the canonical live source.
|
||||||
// currently true about it.
|
|
||||||
type VMHandles struct {
|
type VMHandles struct {
|
||||||
// PID is the firecracker process PID. Zero means "not running
|
// PID is the firecracker process PID. Zero means "not running
|
||||||
// (from our perspective)". Always verifiable via
|
// (from our perspective)". Always verifiable via
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue