diff --git a/internal/daemon/tap_pool.go b/internal/daemon/tap_pool.go
index c0e5f60..d91debf 100644
--- a/internal/daemon/tap_pool.go
+++ b/internal/daemon/tap_pool.go
@@ -6,6 +6,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"sync/atomic"
 )
 
 const tapPoolPrefix = "tap-pool-"
@@ -16,8 +17,16 @@ type tapPool struct {
 	mu      sync.Mutex
 	entries []string
 	next    int
+	warming bool
 }
 
+// maxConcurrentTapWarmup caps the number of `priv.create_tap` RPCs the
+// warmup loop runs in parallel. Each tap creation is ~4 root-helper
+// shell-outs serialized within one RPC handler; running too many at
+// once just contends on netlink. 8 is the production sweet spot for
+// SMOKE_JOBS=8.
+const maxConcurrentTapWarmup = 8
+
 // initializeTapPool seeds the monotonic pool index from the set of
 // tap names already in use by running/stopped VMs, so newly warmed
 // pool entries don't collide with existing ones. Callers (Daemon.Open)
@@ -41,6 +50,23 @@ func (n *HostNetwork) ensureTapPool(ctx context.Context) {
 	if n.config.TapPoolSize <= 0 {
 		return
 	}
+
+	// Dedupe concurrent warmup invocations. Releases trigger a fresh
+	// ensureTapPool in a goroutine; without this, N parallel releases
+	// would each spin up their own warmup loop racing on n.tapPool.next.
+	n.tapPool.mu.Lock()
+	if n.tapPool.warming {
+		n.tapPool.mu.Unlock()
+		return
+	}
+	n.tapPool.warming = true
+	n.tapPool.mu.Unlock()
+	defer func() {
+		n.tapPool.mu.Lock()
+		n.tapPool.warming = false
+		n.tapPool.mu.Unlock()
+	}()
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -51,27 +77,53 @@ func (n *HostNetwork) ensureTapPool(ctx context.Context) {
 		}
 
 		n.tapPool.mu.Lock()
-		if len(n.tapPool.entries) >= n.config.TapPoolSize {
+		deficit := n.config.TapPoolSize - len(n.tapPool.entries)
+		if deficit <= 0 {
 			n.tapPool.mu.Unlock()
 			return
 		}
-		tapName := fmt.Sprintf("%s%d", tapPoolPrefix, n.tapPool.next)
-		n.tapPool.next++
-		n.tapPool.mu.Unlock()
-
-		if err := n.createTap(ctx, tapName); err != nil {
-			if n.logger != nil {
-				n.logger.Warn("tap pool warmup failed", "tap_device", tapName, "error", err.Error())
-			}
-			return
+		batch := deficit
+		if batch > maxConcurrentTapWarmup {
+			batch = maxConcurrentTapWarmup
+		}
+		// Reserve names up front so concurrent goroutines can't collide
+		// on n.tapPool.next.
+		names := make([]string, batch)
+		for i := range names {
+			names[i] = fmt.Sprintf("%s%d", tapPoolPrefix, n.tapPool.next)
+			n.tapPool.next++
 		}
-
-		n.tapPool.mu.Lock()
-		n.tapPool.entries = append(n.tapPool.entries, tapName)
 		n.tapPool.mu.Unlock()
 
-		if n.logger != nil {
-			n.logger.Debug("tap added to idle pool", "tap_device", tapName)
+		var (
+			wg       sync.WaitGroup
+			progress atomic.Int32
+		)
+		for _, tapName := range names {
+			wg.Add(1)
+			go func(tapName string) {
+				defer wg.Done()
+				if err := n.createTap(ctx, tapName); err != nil {
+					if n.logger != nil {
+						n.logger.Warn("tap pool warmup failed", "tap_device", tapName, "error", err.Error())
+					}
+					return
+				}
+				n.tapPool.mu.Lock()
+				n.tapPool.entries = append(n.tapPool.entries, tapName)
+				n.tapPool.mu.Unlock()
+				progress.Add(1)
+				if n.logger != nil {
+					n.logger.Debug("tap added to idle pool", "tap_device", tapName)
+				}
+			}(tapName)
+		}
+		wg.Wait()
+
+		// Whole batch failed → bail rather than burn names indefinitely
+		// (the original sequential loop bailed on first error too).
+		if progress.Load() == 0 {
+			return
 		}
 	}
 }