claude-failover/internal/switcher/account_switcher.go
Ubuntu 8eaf0bbd35 feat(switcher): ensure shared symlinks on target home after flip (A3)
Wire symlinks.EnsureForAccount into executeSwitch, called immediately
after the ~/.claude flip. Guarantees the three shared-state links
(session-env, file-history, projects) exist on the target account home
even for freshly-provisioned accounts, preventing silent transcript
duplication and undo-history divergence on first resume.

Best-effort: errors are logged as WARN but never abort the swap. If we
returned here the daemon would be left inconsistent (symlink flipped,
SetActiveAccount never called). Operator sees the warning in logs and
resolves divergent links manually.

Tests:
- TestFlipReconcilesSharedSymlinksOnTargetHome: empty target home gets
  all three links pointing at canonical targets after the flip.
- TestFlipEnsureSymlinksFailureDoesNotAbortSwap: a planted divergent
  link triggers the symlinks-package error; the swap completes anyway
  and the active account is updated.

Hermetic: added AccountSwitcher.sharedSymlinks override so tests scope
the reconcile inside t.TempDir() and never touch
/home/ubuntu/.claude-*-shared. Existing tests migrated to this pattern
and hardcoded /tmp/claude-*-xxxx paths replaced with tmpdirs.

Phase 1 / Chantier A — task A3.
2026-04-16 19:34:03 +00:00

389 lines
13 KiB
Go

// Package switcher implements the account-switcher state machine.
// It is the only component allowed to flip the active Claude account.
package switcher
import (
"context"
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"time"
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
"forge.secuaas.ovh/olivier/claude-failover/internal/notify"
"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
"forge.secuaas.ovh/olivier/claude-failover/internal/symlinks"
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
)
// SwitchState represents the current phase of a failover operation.
// AccountSwitcher.currentState holds one of the values declared below.
type SwitchState string
// The phases are listed in the order executeSwitch assigns them during a
// successful swap.
const (
// StateNormal: no swap in progress. Set by New, and restored by
// executeSwitch when a swap finishes or is aborted.
StateNormal SwitchState = "normal"
// StateSaving: resume UUIDs are being captured from tmux panes.
StateSaving SwitchState = "saving"
// StateSwitching: target selection, ~/.claude flip and session restart.
StateSwitching SwitchState = "switching"
// StateResuming: set once sessions have been recreated and the new
// active account has been recorded.
StateResuming SwitchState = "resuming"
)
// resumeRe matches `claude --resume <uuid>` in pane capture output.
// Its single capture group is the 36-character run of lowercase hex digits
// and dashes following --resume; the pattern does not check where the dashes
// fall.
var resumeRe = regexp.MustCompile(`claude\s+--resume\s+([a-f0-9-]{36})`)
// reMinutes matches "in N minutes" in a reset-time string; the capture group
// is N. The pattern has no trailing "s", so the singular "minute" matches too.
var reMinutes = regexp.MustCompile(`in\s+(\d+)\s+minute`)
// reHours matches "in N hours" in a reset-time string; the capture group is
// N. The pattern has no trailing "s", so the singular "hour" matches too.
var reHours = regexp.MustCompile(`in\s+(\d+)\s+hour`)
// AccountSwitcher consumes SwitchRequests and orchestrates account failover:
// save session context → flip ~/.claude symlink → restart sessions.
// Instances are normally built with New.
type AccountSwitcher struct {
// tmux is the client used to probe, capture, kill, create and send keys to
// tmux sessions.
tmux tmux.Client
// state is queried for working sessions and the active account, and is
// updated with the new active account and swap record after a flip.
state *state.State
// config supplies the account list and the autonomous/dedicated pool
// settings.
config *config.Config
// switchCh delivers the swap requests consumed by Run.
switchCh <-chan quota.SwitchRequest
// notifier sends the swap notification; nil disables notifications.
notifier *notify.Notifier
// currentState is the phase of the swap in progress; executeSwitch is the
// only writer in this file.
currentState SwitchState
// logger receives all "[switcher]" log lines. New sets it to log.Default().
logger *log.Logger
// homeDir is the directory containing the .claude symlink. Overridable for tests.
// When empty, os.UserHomeDir() is used.
homeDir string
// sharedSymlinks is the list of shared-state links reconciled on the
// target account home after every flip. Overridable for tests so the
// suite never touches the operator's real /home/ubuntu/.claude-*
// shared directories. When nil, symlinks.RequiredShared is used.
sharedSymlinks []symlinks.SharedSymlink
}
// New builds an AccountSwitcher wired to the given tmux client, shared state,
// configuration and request channel. The returned switcher starts in
// StateNormal and logs through the standard library's default logger.
// notifier is optional: pass nil to skip notifications.
func New(
	tc tmux.Client,
	s *state.State,
	cfg *config.Config,
	switchCh <-chan quota.SwitchRequest,
	notifier *notify.Notifier,
) *AccountSwitcher {
	sw := new(AccountSwitcher)
	sw.tmux = tc
	sw.state = s
	sw.config = cfg
	sw.switchCh = switchCh
	sw.notifier = notifier
	sw.currentState = StateNormal
	sw.logger = log.Default()
	return sw
}
// Run starts the switcher event loop. It blocks, executing one swap per
// received SwitchRequest, and returns when ctx is cancelled or when switchCh
// is closed.
func (a *AccountSwitcher) Run(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case req, ok := <-a.switchCh:
			if !ok {
				// A closed channel is always ready and yields zero-value
				// requests; without this guard the loop would spin, running
				// back-to-back swaps with an empty From account.
				a.logger.Printf("[switcher] switch channel closed — stopping event loop")
				return
			}
			a.executeSwitch(req)
		}
	}
}
// executeSwitch performs the full failover sequence for req:
//
//  1. save resume UUIDs (working sessions, then dedicated sessions);
//  2. pick a target account, flip ~/.claude to it, reconcile the shared
//     symlinks on the target home, then kill and recreate the sessions;
//  3. record the new active account and the swap;
//  4. notify;
//  5. schedule the return swap when req.ResetTime is non-empty.
//
// The swap is aborted, with currentState restored to StateNormal and no
// session touched, when no alternate account exists or when the ~/.claude
// flip fails. A failure to reconcile the shared symlinks is logged only.
//
// The target is chosen by findTargetAccount(req.From); req.To is not
// consulted here.
//
// NOTE(review): scheduleReturn calls this method from its own goroutine, so
// two swaps can overlap with one started by Run; no synchronization is
// visible in this file — confirm whether callers serialize elsewhere.
func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
	a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime)

	// 1. SAVING — capture resume UUIDs from all working sessions plus
	// every dedicated session unconditionally (dedicated sessions are
	// user-driven and may not be tracked as "working" in state, but their
	// UUIDs are the most valuable to preserve across a swap).
	a.currentState = StateSaving
	a.saveAllSessions()
	a.saveDedicatedUUIDs()

	// 2. SWITCHING — find target, flip symlink, restart sessions.
	a.currentState = StateSwitching
	target := a.findTargetAccount(req.From)
	if target == nil {
		a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From)
		a.currentState = StateNormal
		return
	}
	if err := a.flipSymlink(target.Home); err != nil {
		// Nothing destructive has happened yet. Continuing would kill the
		// sessions and record target as the active account although
		// ~/.claude was not repointed, so bail out instead.
		a.logger.Printf("[switcher] flipSymlink error: %v — aborting swap", err)
		a.currentState = StateNormal
		return
	}

	// Best-effort: make sure the target account home exposes the three
	// shared-state symlinks (session-env, file-history, projects). The main
	// ~/.claude flip is already done, so an error here must NOT abort the
	// swap — we just log it so the operator can investigate. Without this
	// call, a fresh target account with no shared links would silently
	// start writing into private /projects/session-env/file-history dirs
	// and diverge from the primary account's transcripts.
	if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil {
		a.logger.Printf("[switcher] WARN ensure shared symlinks for %q: %v", target.Home, err)
	}

	a.killAllPoolSessions()
	a.recreatePoolSessions()
	a.relaunchDedicatedSessions(target.Home)

	// Update active account and record the swap timestamp so the quota
	// monitor can enforce a cooldown before requesting another one.
	a.state.SetActiveAccount(target.Name)
	a.state.RecordSwap(req.From, target.Name)

	// 3. RESUMING — sessions are alive, dispatcher will fill them.
	a.currentState = StateResuming

	// 4. Notify.
	msg := fmt.Sprintf("Switch %s → %s (reset: %s)", req.From, target.Name, req.ResetTime)
	a.logger.Printf("[switcher] SWAP complete: %s", msg)
	if a.notifier != nil {
		a.notifier.Telegram("🔄 " + msg) //nolint:errcheck
	}

	// 5. Schedule return to primary account if reset time is known.
	if req.ResetTime != "" {
		go a.scheduleReturn(req.From, req.ResetTime)
	}
	a.currentState = StateNormal
}
// saveDedicatedUUIDs walks every dedicated session in the configuration and,
// for each one that currently exists in tmux and shows a `claude --resume`
// line in the last 200 pane lines, writes the UUID to
// <resumeContextDir>/<session>-resume-id.txt. The session's tracked state is
// not consulted. Sessions that are missing, cannot be captured, or show no
// UUID are skipped silently; filesystem failures are logged and skipped.
func (a *AccountSwitcher) saveDedicatedUUIDs() {
	for _, dedicated := range a.config.Pool.Dedicated {
		session := dedicated.Name
		if !a.tmux.HasSession(session) {
			continue
		}

		paneTail, captureErr := a.tmux.CapturePaneTail(session, 200)
		if captureErr != nil {
			continue
		}
		resumeID := extractResumeUUID(paneTail)
		if resumeID == "" {
			continue
		}

		ctxDir := a.resumeContextDir()
		if mkErr := os.MkdirAll(ctxDir, 0700); mkErr != nil {
			a.logger.Printf("[switcher] mkdir %s: %v", ctxDir, mkErr)
			continue
		}

		idFile := filepath.Join(ctxDir, session+"-resume-id.txt")
		switch writeErr := os.WriteFile(idFile, []byte(resumeID), 0600); {
		case writeErr != nil:
			a.logger.Printf("[switcher] write %s: %v", idFile, writeErr)
		default:
			a.logger.Printf("[switcher] saved dedicated resume UUID for %q: %s", session, resumeID)
		}
	}
}
// relaunchDedicatedSessions types a `claude --resume <uuid>` command into each
// configured dedicated session, prefixed with CLAUDE_CONFIG_DIR=targetHome so
// the session runs against the target account. The UUID is read from the
// session's resume-id file under resumeContextDir; when that file cannot be
// read or its content fails validation the session is skipped and left as it
// is for a manual restart.
func (a *AccountSwitcher) relaunchDedicatedSessions(targetHome string) {
	// targetHome is operator-controlled (config file); uuid is regex-validated.
	// Neither is user-supplied runtime input, so shell interpolation is safe.
	const launchFmt = "CLAUDE_CONFIG_DIR=%s claude --dangerously-skip-permissions --resume %s"

	for _, dedicated := range a.config.Pool.Dedicated {
		session := dedicated.Name
		idFile := filepath.Join(a.resumeContextDir(), session+"-resume-id.txt")

		raw, readErr := os.ReadFile(idFile)
		if readErr != nil {
			a.logger.Printf("[switcher] no saved resume UUID for %q (%v) — leaving at shell", session, readErr)
			continue
		}

		resumeID := strings.TrimSpace(string(raw))
		if !isValidResumeUUID(resumeID) {
			a.logger.Printf("[switcher] invalid UUID for %q: %q", session, resumeID)
			continue
		}

		launch := fmt.Sprintf(launchFmt, targetHome, resumeID)
		if sendErr := a.tmux.SendKeys(session, launch); sendErr != nil {
			a.logger.Printf("[switcher] relaunch %q: %v", session, sendErr)
			continue
		}
		a.logger.Printf("[switcher] relaunched %q on %s (resume=%s)", session, targetHome, resumeID)
	}
}
// isValidResumeUUID defends against corrupted resume-id files by requiring
// the canonical UUID text form: exactly 36 bytes laid out 8-4-4-4-12, with
// '-' at offsets 8, 13, 18 and 23 and a lowercase hex digit at every other
// offset. Anything else (wrong length, uppercase, misplaced dashes,
// whitespace, shell metacharacters) is rejected.
func isValidResumeUUID(s string) bool {
	if len(s) != 36 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch i {
		case 8, 13, 18, 23:
			if c != '-' {
				return false
			}
		default:
			isHex := (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')
			if !isHex {
				return false
			}
		}
	}
	return true
}
// saveAllSessions captures the resume UUID for every working session reported
// by state.ForEachWorking and writes it to
// <resumeContextDir>/<session>-resume-id.txt.
//
// Sessions whose pane cannot be captured, or whose last 200 pane lines show
// no `claude --resume` UUID, are skipped silently. Directory-creation and
// write failures are logged and the session is skipped, so the "saved" line
// is only emitted when the file was actually written.
func (a *AccountSwitcher) saveAllSessions() {
	a.state.ForEachWorking(func(name string, _ *state.SessionState) {
		tail, err := a.tmux.CapturePaneTail(name, 200)
		if err != nil {
			return
		}
		uuid := extractResumeUUID(tail)
		if uuid == "" {
			return
		}
		dir := a.resumeContextDir()
		if err := os.MkdirAll(dir, 0700); err != nil {
			a.logger.Printf("[switcher] mkdir %s: %v", dir, err)
			return
		}
		path := filepath.Join(dir, name+"-resume-id.txt")
		if err := os.WriteFile(path, []byte(uuid), 0600); err != nil {
			a.logger.Printf("[switcher] write %s: %v", path, err)
			return
		}
		a.logger.Printf("[switcher] saved resume UUID for %q", name)
	})
}
// requiredShared yields the shared-symlink list to reconcile on the target
// account home after a flip: the a.sharedSymlinks override when it is non-nil
// (tests use this to stay inside a tmpdir), symlinks.RequiredShared otherwise.
func (a *AccountSwitcher) requiredShared() []symlinks.SharedSymlink {
	if a.sharedSymlinks == nil {
		return symlinks.RequiredShared
	}
	return a.sharedSymlinks
}
// resolveHomeDir yields the directory that holds the .claude symlink: the
// a.homeDir override when it is non-empty, otherwise os.UserHomeDir(). A
// failure of the latter is returned wrapped, with an empty path.
// Tests MUST set a.homeDir to a tmpdir to avoid clobbering the production
// ~/.claude symlink.
func (a *AccountSwitcher) resolveHomeDir() (string, error) {
	if override := a.homeDir; override != "" {
		return override, nil
	}
	userHome, err := os.UserHomeDir()
	if err == nil {
		return userHome, nil
	}
	return "", fmt.Errorf("UserHomeDir: %w", err)
}
// flipSymlink replaces <home>/.claude with a symlink to targetHome, where
// <home> comes from resolveHomeDir.
//
// The existing entry is removed first; a missing entry is not an error. Any
// other removal failure (for example a non-empty real directory sitting at
// that path) is returned as-is instead of surfacing later as an opaque
// "file exists" from the symlink call. The replacement is remove-then-create,
// not atomic: if the symlink call fails after a successful removal the link
// is left absent.
//
// Returns a wrapped error naming the failing step, or nil after logging the
// new link target.
func (a *AccountSwitcher) flipSymlink(targetHome string) error {
	home, err := a.resolveHomeDir()
	if err != nil {
		return err
	}
	claudeLink := filepath.Join(home, ".claude")
	if err := os.Remove(claudeLink); err != nil && !os.IsNotExist(err) {
		return fmt.Errorf("remove %s: %w", claudeLink, err)
	}
	if err := os.Symlink(targetHome, claudeLink); err != nil {
		return fmt.Errorf("symlink %s → %s: %w", claudeLink, targetHome, err)
	}
	a.logger.Printf("[switcher] ~/.claude → %s", targetHome)
	return nil
}
// killAllPoolSessions asks tmux to kill every session this daemon manages:
// the autonomous slots StartIndex .. StartIndex+Max-1 (named with the
// configured prefix, "ccl-auto-" when none is set) followed by every
// dedicated session. Kill results are ignored. Sessions outside that index
// range (e.g. manual operator sessions `ccl-0..ccl-9`) are never addressed.
func (a *AccountSwitcher) killAllPoolSessions() {
	prefix := "ccl-auto-"
	if configured := a.config.Pool.Autonomous.Prefix; configured != "" {
		prefix = configured
	}

	first := a.config.Pool.Autonomous.StartIndex
	for offset := 0; offset < a.config.Pool.Autonomous.Max; offset++ {
		a.tmux.KillSession(sessionName(prefix, first+offset)) //nolint:errcheck
	}

	for _, dedicated := range a.config.Pool.Dedicated {
		a.tmux.KillSession(dedicated.Name) //nolint:errcheck
	}
}
// recreatePoolSessions brings the pool back after a switch: it creates the
// autonomous slots StartIndex .. StartIndex+Min-1 (configured prefix, or
// "ccl-auto-" when none is set) with an empty project, then one session per
// dedicated entry using that entry's project. Creation failures are logged
// and do not stop the remaining sessions from being created.
func (a *AccountSwitcher) recreatePoolSessions() {
	prefix := "ccl-auto-"
	if configured := a.config.Pool.Autonomous.Prefix; configured != "" {
		prefix = configured
	}

	first := a.config.Pool.Autonomous.StartIndex
	for offset := 0; offset < a.config.Pool.Autonomous.Min; offset++ {
		session := sessionName(prefix, first+offset)
		err := a.tmux.CreateSession(session, "")
		if err == nil {
			continue
		}
		a.logger.Printf("[switcher] recreate autonomous %q: %v", session, err)
	}

	for _, dedicated := range a.config.Pool.Dedicated {
		err := a.tmux.CreateSession(dedicated.Name, dedicated.Project)
		if err != nil {
			a.logger.Printf("[switcher] recreate dedicated %q: %v", dedicated.Name, err)
		}
	}
}
// findTargetAccount scans the configured accounts in order and returns a
// pointer to the first one whose name differs from currentAccount, or nil
// when every configured account carries that name (or none is configured).
func (a *AccountSwitcher) findTargetAccount(currentAccount string) *config.AccountConfig {
	for idx := 0; idx < len(a.config.Accounts); idx++ {
		candidate := &a.config.Accounts[idx]
		if candidate.Name == currentAccount {
			continue
		}
		return candidate
	}
	return nil
}
// scheduleReturn waits for the quota to reset, plus a 5-minute margin, then
// switches back to primaryAccount. The wait is derived from resetTime via
// timeUntilReset.
//
// If primaryAccount is already the active account when the wait ends, nothing
// is done: issuing a swap with From set to the primary would move the daemon
// AWAY from it.
//
// NOTE(review): the wait is a plain time.Sleep, so this goroutine is not
// stopped by the context passed to Run.
func (a *AccountSwitcher) scheduleReturn(primaryAccount, resetTime string) {
	dur := timeUntilReset(resetTime) + 5*time.Minute
	a.logger.Printf("[switcher] return to %q scheduled in %v", primaryAccount, dur.Round(time.Minute))
	time.Sleep(dur)

	active := a.state.ActiveAccount()
	if active == primaryAccount {
		a.logger.Printf("[switcher] %q is already active — skipping scheduled return", primaryAccount)
		return
	}
	a.executeSwitch(quota.SwitchRequest{
		From: active,
		To:   primaryAccount,
	})
}
// extractResumeUUID returns the UUID captured by the first resumeRe match in
// content, or "" when content contains no match.
func extractResumeUUID(content string) string {
	groups := resumeRe.FindStringSubmatch(content)
	if len(groups) < 2 {
		return ""
	}
	return groups[1]
}
// resumeContextDir yields the directory where per-session resume UUID files
// live: ".claude-context" under the home returned by resolveHomeDir, so the
// a.homeDir override keeps tests away from the real ~/.claude-context. A
// resolveHomeDir error is discarded, in which case the result is the bare
// relative path.
func (a *AccountSwitcher) resumeContextDir() string {
	const contextDirName = ".claude-context"
	base, _ := a.resolveHomeDir()
	return filepath.Join(base, contextDirName)
}
// timeUntilReset parses a reset-time string and returns the duration until
// the reset. The input is trimmed and lower-cased, then matched against
// "in N minute(s)" first and "in N hour(s)" second; the first match wins.
// Returns a 2-hour fallback when neither pattern matches, when N does not fit
// in an int, or when N×unit would overflow time.Duration.
func timeUntilReset(resetTime string) time.Duration {
	const fallback = 2 * time.Hour
	lower := strings.ToLower(strings.TrimSpace(resetTime))
	if m := reMinutes.FindStringSubmatch(lower); len(m) >= 2 {
		return resetDuration(m[1], time.Minute, fallback)
	}
	if m := reHours.FindStringSubmatch(lower); len(m) >= 2 {
		return resetDuration(m[1], time.Hour, fallback)
	}
	return fallback
}

// resetDuration converts a captured run of digits into count×unit, returning
// fallback when the count cannot be parsed as an int or the product overflows
// time.Duration (an overflowed value could be negative).
func resetDuration(digits string, unit, fallback time.Duration) time.Duration {
	n, err := strconv.Atoi(digits)
	if err != nil {
		return fallback
	}
	d := time.Duration(n) * unit
	if d/unit != time.Duration(n) {
		return fallback
	}
	return d
}
// sessionName returns the tmux session name for pool slot i: prefix followed
// by i in base 10 (a negative i keeps its minus sign).
func sessionName(prefix string, i int) string {
	return prefix + strconv.Itoa(i)
}
// itoa returns the base-10 text of n. It delegates to strconv.Itoa, so
// negative values are rendered with a leading minus sign.
func itoa(n int) string {
	return strconv.Itoa(n)
}