// Source path: claude-failover/internal/switcher/account_switcher.go
// (Go, 487 lines, 18 KiB — metadata captured from the repository web view.)

// Package switcher implements the account-switcher state machine.
// It is the only component allowed to flip the active Claude account.
package switcher
import (
"context"
// Blame annotation (2026-04-16 19:53:48 +00:00): fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10).
"errors"
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
// Blame annotation (2026-04-16 19:53:48 +00:00): fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10).
"sync/atomic"
"time"
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
"forge.secuaas.ovh/olivier/claude-failover/internal/notify"
"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
"forge.secuaas.ovh/olivier/claude-failover/internal/symlinks"
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
)
// Blame annotation (2026-04-16 19:53:48 +00:00), kept once as a comment:
//
// fix(switcher+symlinks): rollback on ensure failure (Bug #1) +
// requiredShared contract test (Bug #10)
//
// Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency
//   - Before: EnsureForAccount failure after flip was WARN-only,
//     SetActiveAccount still fired → daemon declared target active while
//     shared symlinks were absent/divergent → transcripts silently
//     duplicated, resume broken.
//   - After: ensure failure triggers rollback flip to previous account home;
//     if rollback succeeds → explicit error, ActiveAccount stays on previous.
//     If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all
//     further swaps refused until operator intervention (daemon restart).
//   - New public IsPartialSwap() for watchdog / health-check integration.
//
// Bug #10 (MOYENNE) — requiredShared contract never exercised
//   - All existing tests override a.sharedSymlinks with tmpdir-scoped lists,
//     so symlinks.RequiredShared itself was never tested. A rename or drop
//     would pass every test but silently break prod failover.
//   - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with
//     the exact required names, absolute targets, and a single shared parent
//     directory (invariant EnsureForAccount depends on).
//
// Tests:
//   - go test ./... PASS
//   - go test -race ./... PASS (no data race)
//   - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback,
//     TestFlipEnsureAndRollbackFailure
//   - 1 new symlinks test: TestRequiredSharedIsCoherent
//   - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap
//     (encoded the old buggy best-effort behaviour)

// ErrPartialSwap is the sentinel wrapped into the error returned when a swap
// leaves the daemon in a degraded state: ~/.claude was flipped to the target
// home, EnsureForAccount failed on that target, and the flip back to the
// previous home either failed too or could not be attempted because no
// previous account home is known. In that state the active-account setter
// has NOT been called, but the filesystem symlink may point at an account
// whose shared state is inconsistent, so operator intervention is required.
// Match it with errors.Is; AccountSwitcher.IsPartialSwap() exposes the same
// condition as a flag for health-checks / watchdogs.
var ErrPartialSwap = errors.New("switcher: partial swap — flip succeeded but ensure + rollback both failed")
// SwitchState represents the current phase of a failover operation.
type SwitchState string
// Phases the switcher moves through during a swap.
const (
// StateNormal is the idle phase: the initial value, restored after a swap.
StateNormal SwitchState = "normal"
// StateSaving is set while resume UUIDs are being captured.
StateSaving SwitchState = "saving"
// StateSwitching is set while the target is chosen, the symlink flipped
// and the sessions restarted.
StateSwitching SwitchState = "switching"
// StateResuming is set once the active account has been updated.
StateResuming SwitchState = "resuming"
)
// resumeRe matches `claude --resume <uuid>` in pane capture output. The
// single capture group is the 36-character run of [a-f0-9-] that follows.
var resumeRe = regexp.MustCompile(`claude\s+--resume\s+([a-f0-9-]{36})`)
// reMinutes matches "in N minutes" in a reset-time string; group 1 is N.
var reMinutes = regexp.MustCompile(`in\s+(\d+)\s+minute`)
// reHours matches "in N hours" in a reset-time string; group 1 is N.
var reHours = regexp.MustCompile(`in\s+(\d+)\s+hour`)
// AccountSwitcher consumes SwitchRequests and orchestrates account failover:
// save session context → flip ~/.claude symlink → restart sessions.
type AccountSwitcher struct {
tmux tmux.Client
state *state.State
config *config.Config
switchCh <-chan quota.SwitchRequest
notifier *notify.Notifier
currentState SwitchState
logger *log.Logger
// homeDir is the directory containing the .claude symlink. Overridable for tests.
// When empty, os.UserHomeDir() is used.
homeDir string
// sharedSymlinks is the list of shared-state links reconciled on the
// target account home after every flip. Overridable for tests so the
// suite never touches the operator's real /home/ubuntu/.claude-*
// shared directories. When nil, symlinks.RequiredShared is used.
sharedSymlinks []symlinks.SharedSymlink
fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
2026-04-16 19:53:48 +00:00
// partialSwap is set to 1 when a flip+ensure+rollback sequence left the
// daemon in an inconsistent state (symlink possibly flipped, but active
// account NOT updated, and rollback flip ALSO failed). Health-checks /
// watchdogs read this flag via IsPartialSwap(). It is sticky: once set,
// it stays set until the operator restarts the daemon after fixing the
// filesystem state. We use atomic access so watchdog goroutines can read
// it without blocking the switcher.
partialSwap atomic.Bool
}
// New builds an AccountSwitcher wired to the given tmux client, shared
// state, configuration and request channel. The switcher starts in
// StateNormal and logs through log.Default().
// notifier may be nil; notifications are skipped when absent.
func New(
	tc tmux.Client,
	s *state.State,
	cfg *config.Config,
	switchCh <-chan quota.SwitchRequest,
	notifier *notify.Notifier,
) *AccountSwitcher {
	sw := new(AccountSwitcher)
	sw.tmux = tc
	sw.state = s
	sw.config = cfg
	sw.switchCh = switchCh
	sw.notifier = notifier
	sw.currentState = StateNormal
	sw.logger = log.Default()
	return sw
}
// Run starts the switcher event loop and serves swap requests one at a time
// until ctx is cancelled.
//
// If the request channel is closed, Run stops reading from it and simply
// waits for ctx to be cancelled. Without this, a closed channel would
// deliver zero-value requests in a tight loop, each one triggering a swap
// with an empty From.
func (a *AccountSwitcher) Run(ctx context.Context) {
	requests := a.switchCh
	for {
		select {
		case <-ctx.Done():
			return
		case req, ok := <-requests:
			if !ok {
				a.logger.Printf("[switcher] switch channel closed — no further swap requests will be served")
				// A nil channel blocks forever, so only ctx.Done() can fire now.
				requests = nil
				continue
			}
			a.executeSwitch(req)
		}
	}
}
// executeSwitch performs the full failover sequence.
func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
2026-04-16 19:53:48 +00:00
if err := a.executeSwitchE(req); err != nil {
// executeSwitchE already logs the detail; we swallow the error here
// because the public Run loop has no return channel. The partialSwap
// flag (if set) remains visible via IsPartialSwap().
a.logger.Printf("[switcher] SWAP aborted: %v", err)
}
}
// executeSwitchE runs the swap and returns an error describing any abort or
// partial-swap condition. Split out from executeSwitch so tests can assert
// on the error value without routing through a channel.
func (a *AccountSwitcher) executeSwitchE(req quota.SwitchRequest) error {
a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime)
fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
2026-04-16 19:53:48 +00:00
// Refuse to proceed if a previous swap left the daemon in an
// inconsistent state. The operator must intervene (fix the filesystem,
// restart the daemon) before any further failover can be attempted —
// otherwise we'd stack symlink flips on top of a broken state.
if a.partialSwap.Load() {
err := fmt.Errorf("refusing swap: daemon is in partial-swap degraded state (operator intervention required)")
a.logger.Printf("[switcher] %v", err)
return err
}
// 1. SAVING — capture resume UUIDs from all working sessions plus
// every dedicated session unconditionally (dedicated sessions are
// user-driven and may not be tracked as "working" in state, but their
// UUIDs are the most valuable to preserve across a swap).
a.currentState = StateSaving
a.saveAllSessions()
a.saveDedicatedUUIDs()
// 2. SWITCHING — find target, flip symlink, restart sessions.
a.currentState = StateSwitching
target := a.findTargetAccount(req.From)
if target == nil {
a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From)
a.currentState = StateNormal
fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
2026-04-16 19:53:48 +00:00
return nil
}
fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
2026-04-16 19:53:48 +00:00
previous := a.findAccountByName(req.From)
if err := a.flipSymlink(target.Home); err != nil {
a.logger.Printf("[switcher] flipSymlink error: %v", err)
}
fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
2026-04-16 19:53:48 +00:00
// Ensure the target account home exposes the three shared-state
// symlinks (session-env, file-history, projects). If this fails we
// MUST NOT proceed with SetActiveAccount — the daemon would otherwise
// declare the target "active" while its shared state is divergent,
// silently writing transcripts into private /projects directories and
// breaking `claude --resume` across sessions. Instead we attempt to
// roll back the ~/.claude flip to the previous account. If the
// rollback also fails, the daemon is in a documented degraded state
// (ErrPartialSwap) and the operator must intervene.
if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil {
fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
2026-04-16 19:53:48 +00:00
a.logger.Printf("[switcher] ensure shared symlinks for %q failed: %v — attempting rollback", target.Home, err)
if previous == nil || previous.Home == "" {
// No known previous home to roll back to — set the degraded
// flag and bail out. This is equivalent to a rollback failure
// because the filesystem is pointed at a broken target.
a.partialSwap.Store(true)
a.currentState = StateNormal
return fmt.Errorf("%w: ensure failed (%v) and no previous account home is known for rollback", ErrPartialSwap, err)
}
if rbErr := a.flipSymlink(previous.Home); rbErr != nil {
// Both the ensure AND the rollback failed. The daemon is now
// in a documented inconsistent state: ~/.claude may point at
// target whose shared-state is divergent, but SetActiveAccount
// has NOT been called so state.ActiveAccount is still the
// previous account. No further failover can be attempted
// until the operator intervenes.
a.partialSwap.Store(true)
a.logger.Printf("[switcher] CRITICAL partial swap: ensure=%v rollback=%v — daemon in degraded state, operator intervention required", err, rbErr)
a.currentState = StateNormal
return fmt.Errorf("%w: ensure=%v rollback=%v", ErrPartialSwap, err, rbErr)
}
// Rollback succeeded — symlink is back on the previous account,
// SetActiveAccount was NEVER called, state is consistent with
// "no swap happened". Return an explicit error so the caller
// knows the swap was cancelled.
a.logger.Printf("[switcher] rollback successful: ~/.claude → %s (swap cancelled)", previous.Home)
a.currentState = StateNormal
return fmt.Errorf("swap cancelled: ensure shared symlinks failed on target %q: %w", target.Home, err)
}
a.killAllPoolSessions()
a.recreatePoolSessions()
a.relaunchDedicatedSessions(target.Home)
// Update active account and record the swap timestamp so the quota
// monitor can enforce a cooldown before requesting another one.
a.state.SetActiveAccount(target.Name)
a.state.RecordSwap(req.From, target.Name)
// 3. RESUMING — sessions are alive, dispatcher will fill them.
a.currentState = StateResuming
// 4. Notify.
msg := fmt.Sprintf("Switch %s → %s (reset: %s)", req.From, target.Name, req.ResetTime)
a.logger.Printf("[switcher] SWAP complete: %s", msg)
if a.notifier != nil {
a.notifier.Telegram("🔄 " + msg) //nolint:errcheck
}
// 5. Schedule return to primary account if reset time is known.
if req.ResetTime != "" {
go a.scheduleReturn(req.From, req.ResetTime)
}
a.currentState = StateNormal
fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
2026-04-16 19:53:48 +00:00
return nil
}
// IsPartialSwap reports whether the sticky degraded-state flag is set, i.e.
// a swap failed and ~/.claude could not be rolled back to the previous
// account. Health-checks and watchdogs use this signal to surface an
// operator-actionable alert. The switcher never clears the flag itself, so
// it stays set for the lifetime of the process.
func (a *AccountSwitcher) IsPartialSwap() bool {
	degraded := a.partialSwap.Load()
	return degraded
}
// findAccountByName looks up the configured account called name and returns
// a pointer to its config entry, or nil when name is empty or unknown.
// The rollback path uses it to recover the previous account's home, whereas
// findTargetAccount returns the first account that does NOT match.
func (a *AccountSwitcher) findAccountByName(name string) *config.AccountConfig {
	if name != "" {
		for idx := 0; idx < len(a.config.Accounts); idx++ {
			if candidate := &a.config.Accounts[idx]; candidate.Name == name {
				return candidate
			}
		}
	}
	return nil
}
// saveDedicatedUUIDs walks every configured dedicated session and, for each
// one that exists in tmux and shows a `claude --resume <uuid>` line in the
// last 200 captured pane lines, writes that UUID to
// <resume-context-dir>/<session>-resume-id.txt. Tracked session state is not
// consulted: dedicated sessions are saved unconditionally. Sessions that are
// missing, cannot be captured or show no UUID are skipped silently;
// filesystem failures are logged and the session is skipped.
func (a *AccountSwitcher) saveDedicatedUUIDs() {
	for _, dedicated := range a.config.Pool.Dedicated {
		session := dedicated.Name
		if !a.tmux.HasSession(session) {
			continue
		}

		uuid := ""
		if paneTail, captureErr := a.tmux.CapturePaneTail(session, 200); captureErr == nil {
			uuid = extractResumeUUID(paneTail)
		}
		if uuid == "" {
			continue
		}

		contextDir := a.resumeContextDir()
		if mkErr := os.MkdirAll(contextDir, 0700); mkErr != nil {
			a.logger.Printf("[switcher] mkdir %s: %v", contextDir, mkErr)
			continue
		}

		idFile := filepath.Join(contextDir, session+"-resume-id.txt")
		writeErr := os.WriteFile(idFile, []byte(uuid), 0600)
		if writeErr != nil {
			a.logger.Printf("[switcher] write %s: %v", idFile, writeErr)
			continue
		}
		a.logger.Printf("[switcher] saved dedicated resume UUID for %q: %s", session, uuid)
	}
}
// relaunchDedicatedSessions types a `claude --resume <uuid>` command into
// each dedicated session, prefixed with CLAUDE_CONFIG_DIR=targetHome so the
// session runs against the target account's home. The UUID is read from the
// session's saved resume-id file; when that file is missing or its content
// fails isValidResumeUUID, the session is left untouched (at the shell) for
// a manual restart.
func (a *AccountSwitcher) relaunchDedicatedSessions(targetHome string) {
	for _, dedicated := range a.config.Pool.Dedicated {
		session := dedicated.Name
		idFile := filepath.Join(a.resumeContextDir(), session+"-resume-id.txt")

		raw, readErr := os.ReadFile(idFile)
		if readErr != nil {
			a.logger.Printf("[switcher] no saved resume UUID for %q (%v) — leaving at shell", session, readErr)
			continue
		}

		uuid := strings.TrimSpace(string(raw))
		if !isValidResumeUUID(uuid) {
			a.logger.Printf("[switcher] invalid UUID for %q: %q", session, uuid)
			continue
		}

		// The command line is built by plain interpolation: uuid has just
		// been validated, and targetHome comes from the operator's config.
		// NOTE(review): targetHome is not quoted, so a home path containing
		// whitespace or shell metacharacters would break the command.
		launch := fmt.Sprintf("CLAUDE_CONFIG_DIR=%s claude --dangerously-skip-permissions --resume %s",
			targetHome, uuid)
		sendErr := a.tmux.SendKeys(session, launch)
		if sendErr == nil {
			a.logger.Printf("[switcher] relaunched %q on %s (resume=%s)", session, targetHome, uuid)
			continue
		}
		a.logger.Printf("[switcher] relaunch %q: %v", session, sendErr)
	}
}
// isValidResumeUUID guards against corrupted resume-id files. It accepts s
// only when it is exactly 36 bytes long and passes the same pattern used to
// extract UUIDs from pane output (36 characters drawn from lowercase hex
// digits and dashes).
func isValidResumeUUID(s string) bool {
	const uuidLen = 36
	if len(s) == uuidLen {
		// Reuse resumeRe by rebuilding the command line it expects.
		probe := "claude --resume " + s
		return resumeRe.MatchString(probe)
	}
	return false
}
// saveAllSessions captures the resume UUID for every working session and
// writes it to <resume-context-dir>/<session>-resume-id.txt.
//
// Sessions whose pane cannot be captured, or whose last 200 captured lines
// contain no `claude --resume <uuid>`, are skipped silently. Filesystem
// failures are logged and the session is skipped, so "saved" is only logged
// when the file was actually written (same handling as saveDedicatedUUIDs).
func (a *AccountSwitcher) saveAllSessions() {
	a.state.ForEachWorking(func(name string, _ *state.SessionState) {
		tail, err := a.tmux.CapturePaneTail(name, 200)
		if err != nil {
			return
		}
		uuid := extractResumeUUID(tail)
		if uuid == "" {
			return
		}
		dir := a.resumeContextDir()
		if err := os.MkdirAll(dir, 0700); err != nil {
			a.logger.Printf("[switcher] mkdir %s: %v", dir, err)
			return
		}
		path := filepath.Join(dir, name+"-resume-id.txt")
		if err := os.WriteFile(path, []byte(uuid), 0600); err != nil {
			a.logger.Printf("[switcher] write %s: %v", path, err)
			return
		}
		a.logger.Printf("[switcher] saved resume UUID for %q", name)
	})
}
// requiredShared yields the shared-symlink list to reconcile on the target
// account home after a flip: the a.sharedSymlinks override when it is
// non-nil (tests set it to a tmpdir-scoped list), otherwise
// symlinks.RequiredShared.
func (a *AccountSwitcher) requiredShared() []symlinks.SharedSymlink {
	if override := a.sharedSymlinks; override != nil {
		return override
	}
	return symlinks.RequiredShared
}
// resolveHomeDir yields the directory that holds the .claude symlink: the
// a.homeDir override when set, otherwise the real user home from
// os.UserHomeDir (whose failure is returned wrapped). Tests MUST set
// a.homeDir to a tmpdir to avoid clobbering the production ~/.claude symlink.
func (a *AccountSwitcher) resolveHomeDir() (string, error) {
	if override := a.homeDir; override != "" {
		return override, nil
	}
	userHome, lookupErr := os.UserHomeDir()
	if lookupErr == nil {
		return userHome, nil
	}
	return "", fmt.Errorf("UserHomeDir: %w", lookupErr)
}
// flipSymlink replaces ~/.claude with a symlink to targetHome.
// All paths come from config — no hardcoded values.
//
// The new link is created under a staging name and then renamed over
// ~/.claude. A failed flip therefore leaves any existing link untouched
// instead of deleting it first and possibly failing to recreate it.
// Returns an error when the home directory cannot be resolved or any
// filesystem step fails; the staging link is cleaned up on failure.
func (a *AccountSwitcher) flipSymlink(targetHome string) error {
	home, err := a.resolveHomeDir()
	if err != nil {
		return err
	}
	claudeLink := filepath.Join(home, ".claude")
	staging := claudeLink + ".flip-tmp"

	// A staging link left behind by an interrupted flip would make Symlink
	// fail with "file exists"; a missing one is the normal case.
	if err := os.Remove(staging); err != nil && !errors.Is(err, os.ErrNotExist) {
		return fmt.Errorf("remove stale %s: %w", staging, err)
	}
	if err := os.Symlink(targetHome, staging); err != nil {
		return fmt.Errorf("symlink %s → %s: %w", claudeLink, targetHome, err)
	}

	// Rename cannot put a symlink in place of a real directory. Keep the
	// previous behaviour for that case: an empty directory is removed, a
	// non-empty one makes the flip fail.
	if info, statErr := os.Lstat(claudeLink); statErr == nil && info.IsDir() {
		if err := os.Remove(claudeLink); err != nil {
			os.Remove(staging) //nolint:errcheck // best-effort cleanup
			return fmt.Errorf("symlink %s → %s: %w", claudeLink, targetHome, err)
		}
	}

	if err := os.Rename(staging, claudeLink); err != nil {
		os.Remove(staging) //nolint:errcheck // best-effort cleanup
		return fmt.Errorf("symlink %s → %s: %w", claudeLink, targetHome, err)
	}
	a.logger.Printf("[switcher] ~/.claude → %s", targetHome)
	return nil
}
// killAllPoolSessions kills the autonomous sessions numbered
// StartIndex..StartIndex+Max-1 (named with the configured prefix, or
// "ccl-auto-" when none is set) and every configured dedicated session.
// Sessions outside that range (e.g. manual operator sessions
// `ccl-0..ccl-9`) are left untouched. Kill results are deliberately ignored.
func (a *AccountSwitcher) killAllPoolSessions() {
	namePrefix := a.config.Pool.Autonomous.Prefix
	if namePrefix == "" {
		namePrefix = "ccl-auto-"
	}

	first := a.config.Pool.Autonomous.StartIndex
	end := first + a.config.Pool.Autonomous.Max
	for idx := first; idx < end; idx++ {
		a.tmux.KillSession(sessionName(namePrefix, idx)) //nolint:errcheck
	}

	for _, dedicated := range a.config.Pool.Dedicated {
		a.tmux.KillSession(dedicated.Name) //nolint:errcheck
	}
}
// recreatePoolSessions brings the pool back after a switch: Min autonomous
// sessions numbered from StartIndex (named with the configured prefix, or
// "ccl-auto-" when none is set, and created with an empty project), then
// one session per configured dedicated entry with that entry's project.
// Creation failures are logged and do not stop the remaining sessions.
func (a *AccountSwitcher) recreatePoolSessions() {
	namePrefix := a.config.Pool.Autonomous.Prefix
	if namePrefix == "" {
		namePrefix = "ccl-auto-"
	}

	first := a.config.Pool.Autonomous.StartIndex
	end := first + a.config.Pool.Autonomous.Min
	for idx := first; idx < end; idx++ {
		session := sessionName(namePrefix, idx)
		createErr := a.tmux.CreateSession(session, "")
		if createErr != nil {
			a.logger.Printf("[switcher] recreate autonomous %q: %v", session, createErr)
		}
	}

	for _, dedicated := range a.config.Pool.Dedicated {
		createErr := a.tmux.CreateSession(dedicated.Name, dedicated.Project)
		if createErr != nil {
			a.logger.Printf("[switcher] recreate dedicated %q: %v", dedicated.Name, createErr)
		}
	}
}
// findTargetAccount picks the failover target: the first configured account
// whose name differs from currentAccount. It returns nil when every
// configured account carries that name (or none is configured).
func (a *AccountSwitcher) findTargetAccount(currentAccount string) *config.AccountConfig {
	for idx := 0; idx < len(a.config.Accounts); idx++ {
		candidate := &a.config.Accounts[idx]
		if candidate.Name == currentAccount {
			continue
		}
		return candidate
	}
	return nil
}
// scheduleReturn waits for the quota to reset (the parsed resetTime plus a
// five-minute margin) and then requests a swap back to primaryAccount.
//
// If primaryAccount is already active when the wait ends, nothing is done.
// Without that check the swap would use primaryAccount as From, and the
// switcher — which targets the first account that differs from From — would
// move the daemon away from the primary again.
//
// NOTE(review): the wait is a plain time.Sleep, so it cannot be cancelled
// and the goroutine lives for the whole duration.
func (a *AccountSwitcher) scheduleReturn(primaryAccount, resetTime string) {
	dur := timeUntilReset(resetTime) + 5*time.Minute
	a.logger.Printf("[switcher] return to %q scheduled in %v", primaryAccount, dur.Round(time.Minute))
	time.Sleep(dur)

	current := a.state.ActiveAccount()
	if current == primaryAccount {
		a.logger.Printf("[switcher] return to %q skipped: it is already the active account", primaryAccount)
		return
	}
	a.executeSwitch(quota.SwitchRequest{
		From: current,
		To:   primaryAccount,
	})
}
// extractResumeUUID scans pane output for the first `claude --resume <uuid>`
// occurrence and returns the captured UUID, or "" when there is none.
func extractResumeUUID(content string) string {
	groups := resumeRe.FindStringSubmatch(content)
	if len(groups) < 2 {
		return ""
	}
	return groups[1]
}
// resumeContextDir yields the directory holding the per-session resume-UUID
// files: ".claude-context" under the resolved home directory. It honours the
// a.homeDir override so tests never write to the real ~/.claude-context.
// A home-resolution error is discarded, in which case the result is the
// relative path ".claude-context".
func (a *AccountSwitcher) resumeContextDir() string {
	const contextDirName = ".claude-context"
	base, _ := a.resolveHomeDir()
	return filepath.Join(base, contextDirName)
}
// timeUntilReset parses a reset-time string such as "in 45 minutes" or
// "in 3 hours" (case-insensitive, surrounding whitespace ignored) and
// returns the corresponding duration. The minutes pattern is tried first.
//
// Returns a 2-hour fallback when parsing fails. That includes a number too
// large for an int or for a time.Duration in the matched unit, which would
// otherwise wrap around to a meaningless, possibly negative, duration.
func timeUntilReset(resetTime string) time.Duration {
	const (
		fallback    = 2 * time.Hour
		maxDuration = time.Duration(1<<63 - 1)
	)
	// scale converts a run of decimal digits into a duration of the given
	// unit, yielding the fallback when the value cannot be represented.
	scale := func(digits string, unit time.Duration) time.Duration {
		n, err := strconv.Atoi(digits)
		if err != nil || time.Duration(n) > maxDuration/unit {
			return fallback
		}
		return time.Duration(n) * unit
	}

	lower := strings.ToLower(strings.TrimSpace(resetTime))
	if m := reMinutes.FindStringSubmatch(lower); len(m) >= 2 {
		return scale(m[1], time.Minute)
	}
	if m := reHours.FindStringSubmatch(lower); len(m) >= 2 {
		return scale(m[1], time.Hour)
	}
	return fallback
}
// sessionName builds a pool session name: prefix followed by the decimal
// index i.
func sessionName(prefix string, i int) string {
	name := prefix
	name += itoa(i)
	return name
}
// itoa returns the decimal representation of n. It delegates to
// strconv.Itoa, so negative values keep their sign instead of collapsing to
// an empty string (which would make distinct session indices share a name).
func itoa(n int) string {
	return strconv.Itoa(n)
}