// Package switcher implements the account-switcher state machine.
// It is the only component allowed to flip the active Claude account.
package switcher

import (
	"context"
	"errors"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"sync/atomic"
	"time"

	"forge.secuaas.ovh/olivier/claude-failover/internal/config"
	"forge.secuaas.ovh/olivier/claude-failover/internal/notify"
	"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
	"forge.secuaas.ovh/olivier/claude-failover/internal/safety"
	"forge.secuaas.ovh/olivier/claude-failover/internal/state"
	"forge.secuaas.ovh/olivier/claude-failover/internal/symlinks"
	"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
)

// ErrPartialSwap is returned (and wrapped) when the switcher flipped
// ~/.claude to the target home, EnsureForAccount failed on the target,
// and the rollback flip back to the previous home ALSO failed. It is also
// returned when EnsureForAccount fails and no previous account home is known
// to roll back to. The daemon is in a documented degraded state: the
// active-account setter was NOT called, but the filesystem symlink may point
// at an account whose shared state is inconsistent. Operator intervention is
// required. Callers can interrogate AccountSwitcher.IsPartialSwap() to expose
// the flag to health-checks / watchdogs, and can match the error with
// errors.Is(err, ErrPartialSwap).
var ErrPartialSwap = errors.New("switcher: partial swap — flip succeeded but ensure + rollback both failed")

// SwitchState represents the current phase of a failover operation.
type SwitchState string

// Phases of a swap, in the order executeSwitchE walks through them:
// StateSaving while resume UUIDs are captured, StateSwitching while the
// symlink is flipped and sessions are restarted, StateResuming once the
// active account has been recorded, and StateNormal when idle (including
// after any aborted swap).
const (
	StateNormal    SwitchState = "normal"
	StateSaving    SwitchState = "saving"
	StateSwitching SwitchState = "switching"
	StateResuming  SwitchState = "resuming"
)

// resumeRe matches `claude --resume <uuid>` in pane capture output and
// captures the 36-character id (lowercase hex digits and dashes) as group 1.
var resumeRe = regexp.MustCompile(`claude\s+--resume\s+([a-f0-9-]{36})`)

// reMinutes matches "in N minutes" in a reset-time string and captures N.
// The pattern is lowercase; timeUntilReset lower-cases its input first.
var reMinutes = regexp.MustCompile(`in\s+(\d+)\s+minute`)

// reHours matches "in N hours" in a reset-time string.
var reHours = regexp.MustCompile(`in\s+(\d+)\s+hour`)

// AccountSwitcher consumes SwitchRequests and orchestrates account failover:
// save session context → flip ~/.claude symlink → restart sessions.
//
// currentState tracks the phase of the swap in progress and is written with
// plain assignments by executeSwitchE.
// NOTE(review): scheduleReturn calls executeSwitch from its own goroutine, so
// two swaps may overlap with the Run loop and write currentState without
// synchronization — confirm whether that is acceptable.
type AccountSwitcher struct {
	tmux         tmux.Client
	state        *state.State
	config       *config.Config
	switchCh     <-chan quota.SwitchRequest
	notifier     *notify.Notifier
	currentState SwitchState
	logger       *log.Logger

	// homeDir is the directory containing the .claude symlink. Overridable for tests.
	// When empty, os.UserHomeDir() is used.
	homeDir string

	// sharedSymlinks is the list of shared-state links reconciled on the
	// target account home after every flip. Overridable for tests so the
	// suite never touches the operator's real /home/ubuntu/.claude-*
	// shared directories. When nil, symlinks.RequiredShared is used.
	sharedSymlinks []symlinks.SharedSymlink

	// partialSwap is set to true when a flip+ensure+rollback sequence left the
	// daemon in an inconsistent state (symlink possibly flipped, but active
	// account NOT updated, and rollback flip ALSO failed). Health-checks /
	// watchdogs read this flag via IsPartialSwap(). It is sticky: once set,
	// it stays set until the operator restarts the daemon after fixing the
	// filesystem state. We use atomic access so watchdog goroutines can read
	// it without blocking the switcher.
	partialSwap atomic.Bool
}

// New creates an AccountSwitcher.
// notifier may be nil; notifications are skipped when absent.
// The switcher starts in StateNormal and logs through log.Default().
// homeDir and sharedSymlinks are left at their zero values, so the real user
// home and symlinks.RequiredShared are used unless a test overrides them.
func New(
	tc tmux.Client,
	s *state.State,
	cfg *config.Config,
	switchCh <-chan quota.SwitchRequest,
	notifier *notify.Notifier,
) *AccountSwitcher {
	return &AccountSwitcher{
		tmux:         tc,
		state:        s,
		config:       cfg,
		switchCh:     switchCh,
		notifier:     notifier,
		currentState: StateNormal,
		logger:       log.Default(),
	}
}

// Run starts the switcher event loop until ctx is cancelled.
func (a *AccountSwitcher) Run(ctx context.Context) { for { select { case <-ctx.Done(): return case req := <-a.switchCh: a.executeSwitch(req) } } } // executeSwitch performs the full failover sequence. func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { if err := a.executeSwitchE(req); err != nil { // executeSwitchE already logs the detail; we swallow the error here // because the public Run loop has no return channel. The partialSwap // flag (if set) remains visible via IsPartialSwap(). a.logger.Printf("[switcher] SWAP aborted: %v", err) } } // executeSwitchE runs the swap and returns an error describing any abort or // partial-swap condition. Split out from executeSwitch so tests can assert // on the error value without routing through a channel. func (a *AccountSwitcher) executeSwitchE(req quota.SwitchRequest) error { a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime) // Refuse to proceed if a previous swap left the daemon in an // inconsistent state. The operator must intervene (fix the filesystem, // restart the daemon) before any further failover can be attempted — // otherwise we'd stack symlink flips on top of a broken state. if a.partialSwap.Load() { err := fmt.Errorf("refusing swap: daemon is in partial-swap degraded state (operator intervention required)") a.logger.Printf("[switcher] %v", err) return err } // 1. SAVING — capture resume UUIDs from all working sessions plus // every dedicated session unconditionally (dedicated sessions are // user-driven and may not be tracked as "working" in state, but their // UUIDs are the most valuable to preserve across a swap). a.currentState = StateSaving a.saveAllSessions() a.saveDedicatedUUIDs() // 2. SWITCHING — find target, flip symlink, restart sessions. 
a.currentState = StateSwitching target := a.findTargetAccount(req.From) if target == nil { a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From) a.currentState = StateNormal return nil } previous := a.findAccountByName(req.From) if err := a.flipSymlink(target.Home); err != nil { a.logger.Printf("[switcher] flipSymlink error: %v", err) } // Ensure the target account home exposes the three shared-state // symlinks (session-env, file-history, projects). If this fails we // MUST NOT proceed with SetActiveAccount — the daemon would otherwise // declare the target "active" while its shared state is divergent, // silently writing transcripts into private /projects directories and // breaking `claude --resume` across sessions. Instead we attempt to // roll back the ~/.claude flip to the previous account. If the // rollback also fails, the daemon is in a documented degraded state // (ErrPartialSwap) and the operator must intervene. if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil { a.logger.Printf("[switcher] ensure shared symlinks for %q failed: %v — attempting rollback", target.Home, err) if previous == nil || previous.Home == "" { // No known previous home to roll back to — set the degraded // flag and bail out. This is equivalent to a rollback failure // because the filesystem is pointed at a broken target. a.partialSwap.Store(true) a.currentState = StateNormal return fmt.Errorf("%w: ensure failed (%v) and no previous account home is known for rollback", ErrPartialSwap, err) } if rbErr := a.flipSymlink(previous.Home); rbErr != nil { // Both the ensure AND the rollback failed. The daemon is now // in a documented inconsistent state: ~/.claude may point at // target whose shared-state is divergent, but SetActiveAccount // has NOT been called so state.ActiveAccount is still the // previous account. No further failover can be attempted // until the operator intervenes. 
a.partialSwap.Store(true) a.logger.Printf("[switcher] CRITICAL partial swap: ensure=%v rollback=%v — daemon in degraded state, operator intervention required", err, rbErr) a.currentState = StateNormal return fmt.Errorf("%w: ensure=%v rollback=%v", ErrPartialSwap, err, rbErr) } // Rollback succeeded — symlink is back on the previous account, // SetActiveAccount was NEVER called, state is consistent with // "no swap happened". Return an explicit error so the caller // knows the swap was cancelled. a.logger.Printf("[switcher] rollback successful: ~/.claude → %s (swap cancelled)", previous.Home) a.currentState = StateNormal return fmt.Errorf("swap cancelled: ensure shared symlinks failed on target %q: %w", target.Home, err) } a.killAllPoolSessions() a.recreatePoolSessions() a.relaunchDedicatedSessions(target.Home) // Update active account and record the swap timestamp so the quota // monitor can enforce a cooldown before requesting another one. a.state.SetActiveAccount(target.Name) a.state.RecordSwap(req.From, target.Name) // 3. RESUMING — sessions are alive, dispatcher will fill them. a.currentState = StateResuming // 4. Notify. msg := fmt.Sprintf("Switch %s → %s (reset: %s)", req.From, target.Name, req.ResetTime) a.logger.Printf("[switcher] SWAP complete: %s", msg) if a.notifier != nil { a.notifier.Telegram("🔄 " + msg) //nolint:errcheck } // 5. Schedule return to primary account if reset time is known. if req.ResetTime != "" { go a.scheduleReturn(req.From, req.ResetTime) } a.currentState = StateNormal return nil } // IsPartialSwap reports whether the switcher is in a degraded state after a // flip+ensure+rollback sequence all failed. Health-checks and watchdogs use // this signal to surface an operator-actionable alert. The flag is sticky // for the lifetime of the process: once set, it remains set until the daemon // is restarted (after the operator has fixed the filesystem). 
func (a *AccountSwitcher) IsPartialSwap() bool {
	degraded := a.partialSwap.Load()
	return degraded
}

// findAccountByName looks up the configured account whose Name equals name
// and returns a pointer into the config's account list, or nil when name is
// empty or unknown. findTargetAccount answers the opposite question (first
// account that is NOT the given one); this lookup exists so the rollback
// path can recover the previous account's home.
func (a *AccountSwitcher) findAccountByName(name string) *config.AccountConfig {
	if name == "" {
		return nil
	}
	for idx := range a.config.Accounts {
		acct := &a.config.Accounts[idx]
		if acct.Name != name {
			continue
		}
		return acct
	}
	return nil
}

// saveDedicatedUUIDs walks every configured dedicated session and, for each
// one that currently exists in tmux, scans the last 200 pane lines for a
// resume UUID and writes it to <context dir>/<session>-resume-id.txt.
// The tracked session state is not consulted: dedicated sessions are usually
// user-driven rather than "working", yet their UUIDs matter most across a
// swap. Sessions that are missing, cannot be captured, or show no UUID are
// skipped silently; filesystem failures are logged and skipped.
func (a *AccountSwitcher) saveDedicatedUUIDs() {
	for _, dedicated := range a.config.Pool.Dedicated {
		session := dedicated.Name
		if !a.tmux.HasSession(session) {
			continue
		}

		captured, captureErr := a.tmux.CapturePaneTail(session, 200)
		if captureErr != nil {
			continue
		}

		resumeID := extractResumeUUID(captured)
		if resumeID == "" {
			continue
		}

		contextDir := a.resumeContextDir()
		if mkErr := os.MkdirAll(contextDir, 0700); mkErr != nil {
			a.logger.Printf("[switcher] mkdir %s: %v", contextDir, mkErr)
			continue
		}

		idFile := filepath.Join(contextDir, session+"-resume-id.txt")
		writeErr := os.WriteFile(idFile, []byte(resumeID), 0600)
		if writeErr == nil {
			a.logger.Printf("[switcher] saved dedicated resume UUID for %q: %s", session, resumeID)
			continue
		}
		a.logger.Printf("[switcher] write %s: %v", idFile, writeErr)
	}
}

// relaunchDedicatedSessions sends a `claude --resume <uuid>` command to each
// dedicated session after recreation, using the target account's home via
// CLAUDE_CONFIG_DIR so the session follows the active account. If no UUID was
// captured for a session, it is left at the bash prompt for manual restart.
func (a *AccountSwitcher) relaunchDedicatedSessions(targetHome string) { for _, ds := range a.config.Pool.Dedicated { path := filepath.Join(a.resumeContextDir(), ds.Name+"-resume-id.txt") data, err := os.ReadFile(path) if err != nil { a.logger.Printf("[switcher] no saved resume UUID for %q (%v) — leaving at shell", ds.Name, err) continue } uuid := strings.TrimSpace(string(data)) if !isValidResumeUUID(uuid) { a.logger.Printf("[switcher] invalid UUID for %q: %q", ds.Name, uuid) continue } // FNDG-04b: redeploy the PreToolUse safety hook into the dedicated // session's project before --resume. Failure is logged, not fatal: // we'd rather relaunch the session without a refreshed hook than // leave the user stuck after a quota failover. The existing hook // (deployed at initial launch) remains in place. if ds.Project != "" { if err := safety.EnsureHookDeployed(ds.Project); err != nil { a.logger.Printf("[switcher] safety hook redeploy %q: %v (continuing)", ds.Name, err) } } // targetHome is operator-controlled (config file); uuid is regex-validated. // Neither is user-supplied runtime input, so shell interpolation is safe. cmd := fmt.Sprintf("CLAUDE_CONFIG_DIR=%s claude --dangerously-skip-permissions --resume %s", targetHome, uuid) if err := a.tmux.SendKeys(ds.Name, cmd); err != nil { a.logger.Printf("[switcher] relaunch %q: %v", ds.Name, err) continue } a.logger.Printf("[switcher] relaunched %q on %s (resume=%s)", ds.Name, targetHome, uuid) } } // isValidResumeUUID defends against corrupted resume-id files by requiring // the canonical 36-char lowercase hex+dash UUID format. func isValidResumeUUID(s string) bool { if len(s) != 36 { return false } return resumeRe.MatchString("claude --resume " + s) } // saveAllSessions captures the resume UUID for every working session. 
func (a *AccountSwitcher) saveAllSessions() { a.state.ForEachWorking(func(name string, _ *state.SessionState) { tail, err := a.tmux.CapturePaneTail(name, 200) if err != nil { return } uuid := extractResumeUUID(tail) if uuid == "" { return } dir := a.resumeContextDir() os.MkdirAll(dir, 0700) path := filepath.Join(dir, name+"-resume-id.txt") os.WriteFile(path, []byte(uuid), 0600) a.logger.Printf("[switcher] saved resume UUID for %q", name) }) } // requiredShared returns the shared-symlink list used when reconciling the // target account home after a flip. Tests may set a.sharedSymlinks to a // tmpdir-scoped list so they never touch /home/ubuntu/.claude-*-shared. func (a *AccountSwitcher) requiredShared() []symlinks.SharedSymlink { if a.sharedSymlinks != nil { return a.sharedSymlinks } return symlinks.RequiredShared } // resolveHomeDir returns the configured homeDir (test override) or the real // user home. Tests MUST set a.homeDir to a tmpdir to avoid clobbering the // production ~/.claude symlink. func (a *AccountSwitcher) resolveHomeDir() (string, error) { if a.homeDir != "" { return a.homeDir, nil } home, err := os.UserHomeDir() if err != nil { return "", fmt.Errorf("UserHomeDir: %w", err) } return home, nil } // flipSymlink replaces ~/.claude with a symlink to targetHome. // All paths come from config — no hardcoded values. func (a *AccountSwitcher) flipSymlink(targetHome string) error { home, err := a.resolveHomeDir() if err != nil { return err } claudeLink := filepath.Join(home, ".claude") os.Remove(claudeLink) if err := os.Symlink(targetHome, claudeLink); err != nil { return fmt.Errorf("symlink %s → %s: %w", claudeLink, targetHome, err) } a.logger.Printf("[switcher] ~/.claude → %s", targetHome) return nil } // killAllPoolSessions kills all autonomous and dedicated pool sessions // managed by this daemon. Sessions outside the configured // StartIndex..StartIndex+Max range (e.g. manual operator sessions // `ccl-0..ccl-9`) are left untouched. 
func (a *AccountSwitcher) killAllPoolSessions() { prefix := a.config.Pool.Autonomous.Prefix if prefix == "" { prefix = "ccl-auto-" } start := a.config.Pool.Autonomous.StartIndex for i := start; i < start+a.config.Pool.Autonomous.Max; i++ { a.tmux.KillSession(sessionName(prefix, i)) //nolint:errcheck } for _, ds := range a.config.Pool.Dedicated { a.tmux.KillSession(ds.Name) //nolint:errcheck } } // recreatePoolSessions creates fresh pool sessions after a switch, // respecting StartIndex so the pool stays within its configured range. func (a *AccountSwitcher) recreatePoolSessions() { prefix := a.config.Pool.Autonomous.Prefix if prefix == "" { prefix = "ccl-auto-" } start := a.config.Pool.Autonomous.StartIndex for i := start; i < start+a.config.Pool.Autonomous.Min; i++ { name := sessionName(prefix, i) if err := a.tmux.CreateSession(name, ""); err != nil { a.logger.Printf("[switcher] recreate autonomous %q: %v", name, err) } } for _, ds := range a.config.Pool.Dedicated { if err := a.tmux.CreateSession(ds.Name, ds.Project); err != nil { a.logger.Printf("[switcher] recreate dedicated %q: %v", ds.Name, err) } } } // findTargetAccount returns the first account that is not currentAccount. func (a *AccountSwitcher) findTargetAccount(currentAccount string) *config.AccountConfig { for i := range a.config.Accounts { if a.config.Accounts[i].Name != currentAccount { return &a.config.Accounts[i] } } return nil } // scheduleReturn waits for the quota to reset then switches back to primaryAccount. func (a *AccountSwitcher) scheduleReturn(primaryAccount, resetTime string) { dur := timeUntilReset(resetTime) + 5*time.Minute a.logger.Printf("[switcher] return to %q scheduled in %v", primaryAccount, dur.Round(time.Minute)) time.Sleep(dur) a.executeSwitch(quota.SwitchRequest{ From: a.state.ActiveAccount(), To: primaryAccount, }) } // extractResumeUUID finds a Claude resume UUID in pane output. 
func extractResumeUUID(content string) string { m := resumeRe.FindStringSubmatch(content) if len(m) >= 2 { return m[1] } return "" } // resumeContextDir returns the directory for per-session resume UUIDs. // Honours a.homeDir override so tests never write to the real ~/.claude-context. func (a *AccountSwitcher) resumeContextDir() string { home, _ := a.resolveHomeDir() return filepath.Join(home, ".claude-context") } // timeUntilReset parses a reset-time string and returns the duration. // Returns a 2-hour fallback when parsing fails. func timeUntilReset(resetTime string) time.Duration { lower := strings.ToLower(strings.TrimSpace(resetTime)) if m := reMinutes.FindStringSubmatch(lower); len(m) >= 2 { n, _ := strconv.Atoi(m[1]) return time.Duration(n) * time.Minute } if m := reHours.FindStringSubmatch(lower); len(m) >= 2 { n, _ := strconv.Atoi(m[1]) return time.Duration(n) * time.Hour } return 2 * time.Hour } func sessionName(prefix string, i int) string { return prefix + itoa(i) } func itoa(n int) string { if n == 0 { return "0" } b := make([]byte, 0, 10) for n > 0 { b = append([]byte{byte('0' + n%10)}, b...) n /= 10 } return string(b) }