claude-failover/internal/switcher/account_switcher_test.go
Ubuntu 20063b1939 fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10)
Bug #1 (CRITICAL) — A3 flip+ensure inconsistency
- Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount
  still fired → daemon declared target active while shared symlinks were
  absent/divergent → transcripts silently duplicated, resume broken.
- After: ensure failure triggers rollback flip to previous account home;
  if rollback succeeds → explicit error, ActiveAccount stays on previous.
  If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all
  further swaps refused until operator intervention (daemon restart).
- New public IsPartialSwap() for watchdog / health-check integration.

Bug #10 (MEDIUM) — requiredShared contract never exercised
- All existing tests override a.sharedSymlinks with tmpdir-scoped lists,
  so symlinks.RequiredShared itself was never tested. A rename or drop
  would pass every test but silently break prod failover.
- TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with
  the exact required names, absolute targets, and a single shared parent
  directory (invariant EnsureForAccount depends on).

Tests:
- go test ./... PASS
- go test -race ./... PASS (no data race)
- 2 new switcher tests: TestFlipEnsureFailureTriggersRollback,
  TestFlipEnsureAndRollbackFailure
- 1 new symlinks test: TestRequiredSharedIsCoherent
- 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap
  (encoded the old buggy best-effort behaviour)
2026-04-16 19:53:48 +00:00

457 lines
15 KiB
Go

package switcher
import (
"errors"
"os"
"path/filepath"
"strings"
"testing"
"time"
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
"forge.secuaas.ovh/olivier/claude-failover/internal/symlinks"
)
// tmpShared builds the three shared-symlink descriptors used by the switcher
// tests (session-env, file-history, projects). Every target is placed
// directly under tmpDir so the tests never touch
// /home/ubuntu/.claude-*-shared.
func tmpShared(tmpDir string) []symlinks.SharedSymlink {
	// under resolves a shared directory name inside the test-owned tmpDir.
	under := func(dir string) string {
		return filepath.Join(tmpDir, dir)
	}
	return []symlinks.SharedSymlink{
		{Name: "session-env", Target: under("session-env-shared")},
		{Name: "file-history", Target: under("file-history-shared")},
		{Name: "projects", Target: under("projects-shared")},
	}
}
// mockTmux is an in-memory tmux mock for the switcher tests. It tracks which
// sessions exist, serves canned pane output, and records every create, kill
// and send-keys call so tests can assert on them afterwards.
type mockTmux struct {
	sessions     map[string]bool   // session name -> value reported by HasSession
	paneOutput   map[string]string // session name -> text served by CapturePaneTail
	killCalls    []string          // names passed to KillSession, in call order
	createCalls  []string          // names passed to CreateSession, in call order
	sendKeyCalls []string          // "<session>:<keys>" entries from SendKeys / SendEnter
}

// newMockTmux returns a mockTmux whose maps are initialised and empty, so
// tests can write to sessions and paneOutput directly.
func newMockTmux() *mockTmux {
	mock := new(mockTmux)
	mock.sessions = map[string]bool{}
	mock.paneOutput = map[string]string{}
	return mock
}

// HasSession reports whether name is recorded as an existing session.
func (mt *mockTmux) HasSession(name string) bool {
	alive, known := mt.sessions[name]
	return known && alive
}

// CreateSession marks name as existing and records the call. The second
// argument is ignored. It always returns nil.
func (mt *mockTmux) CreateSession(name, _ string) error {
	mt.sessions[name] = true
	mt.createCalls = append(mt.createCalls, name)
	return nil
}

// KillSession records the call and forgets the session. It always returns
// nil, whether or not the session existed.
func (mt *mockTmux) KillSession(name string) error {
	mt.killCalls = append(mt.killCalls, name)
	delete(mt.sessions, name)
	return nil
}

// SendKeys records the keys sent to session as "<session>:<keys>". It always
// returns nil.
func (mt *mockTmux) SendKeys(session, keys string) error {
	entry := session + ":" + keys
	mt.sendKeyCalls = append(mt.sendKeyCalls, entry)
	return nil
}

// SendEnter records an Enter keypress for session as "<session>:<ENTER>". It
// always returns nil.
func (mt *mockTmux) SendEnter(session string) error {
	entry := session + ":<ENTER>"
	mt.sendKeyCalls = append(mt.sendKeyCalls, entry)
	return nil
}

// CapturePaneTail returns the canned pane output registered for session, or
// the empty string when none was registered. The line-count argument is
// ignored and the error is always nil.
func (mt *mockTmux) CapturePaneTail(session string, _ int) (string, error) {
	tail := mt.paneOutput[session]
	return tail, nil
}
// TestFindTargetAccount returns the first account that differs from current.
func TestFindTargetAccount(t *testing.T) {
tc := newMockTmux()
s := state.New("")
cfg := &config.Config{
Accounts: []config.AccountConfig{
{Name: "compte1", Priority: 1},
{Name: "compte2", Priority: 2},
},
}
a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil)
target := a.findTargetAccount("compte1")
if target == nil || target.Name != "compte2" {
t.Errorf("expected compte2, got %v", target)
}
}
// TestFindTargetAccountSingleAccount returns nil when only one account exists.
func TestFindTargetAccountSingleAccount(t *testing.T) {
tc := newMockTmux()
s := state.New("")
cfg := &config.Config{
Accounts: []config.AccountConfig{{Name: "solo"}},
}
a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil)
if got := a.findTargetAccount("solo"); got != nil {
t.Errorf("expected nil for single account, got %v", got)
}
}
// TestExtractResumeUUID checks that extractResumeUUID pulls the session UUID
// out of a pane line containing a `claude --resume <uuid>` command.
func TestExtractResumeUUID(t *testing.T) {
	const (
		pane = "$ claude --resume a1b2c3d4-e5f6-7890-abcd-ef1234567890 --model sonnet"
		want = "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
	)
	if got := extractResumeUUID(pane); got != want {
		t.Errorf("expected %q, got %q", want, got)
	}
}
// TestExtractResumeUUIDMissing checks that extractResumeUUID yields the empty
// string for input that contains no UUID.
func TestExtractResumeUUIDMissing(t *testing.T) {
	got := extractResumeUUID("no uuid here")
	if got == "" {
		return
	}
	t.Errorf("expected empty, got %q", got)
}
// TestTimeUntilReset is a table test for timeUntilReset. It covers the
// "in N minutes" and "in N hour(s)" phrasings, and expects a two-hour result
// for both the empty string and an unrecognised format such as "8pm".
func TestTimeUntilReset(t *testing.T) {
	type resetCase struct {
		input string
		want  time.Duration
	}
	table := []resetCase{
		{input: "in 45 minutes", want: 45 * time.Minute},
		{input: "in 2 hours", want: 2 * time.Hour},
		{input: "in 1 hour", want: 1 * time.Hour},
		{input: "", want: 2 * time.Hour},
		{input: "8pm", want: 2 * time.Hour}, // fallback expected for unrecognised formats
	}
	for i := range table {
		rc := table[i]
		got := timeUntilReset(rc.input)
		if got == rc.want {
			continue
		}
		t.Errorf("timeUntilReset(%q) = %v, want %v", rc.input, got, rc.want)
	}
}
// TestKillAndRecreatePoolSessions runs executeSwitch away from compte1 with
// two autonomous sessions and one dedicated session already present. It then
// checks that the active account becomes compte2, that every pre-existing
// session was killed, and that both autonomous sessions were created again.
func TestKillAndRecreatePoolSessions(t *testing.T) {
	preexisting := []string{"ccl-auto-0", "ccl-auto-1", "dedicated-1"}

	mock := newMockTmux()
	for _, name := range preexisting {
		mock.sessions[name] = true
	}

	st := state.New("")
	st.SetActiveAccount("compte1")

	cfg := &config.Config{
		Accounts: []config.AccountConfig{
			{Name: "compte1", Home: t.TempDir()},
			{Name: "compte2", Home: t.TempDir()},
		},
		Pool: config.PoolConfig{
			Dedicated:  []config.DedicatedSession{{Name: "dedicated-1", Project: "/tmp"}},
			Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 2, Max: 2},
		},
	}
	sw := New(mock, st, cfg, make(chan quota.SwitchRequest), nil)

	// CRITICAL: keep all symlink manipulation inside a tmpdir. An earlier
	// version of this test repointed the real ~/.claude at /tmp/..., which
	// left Claude Code unusable after a reboot.
	sw.homeDir = t.TempDir()
	// Likewise keep the shared-symlink targets in a tmpdir so the post-flip
	// ensure pass does not write inside /home/ubuntu/.claude-*-shared.
	sw.sharedSymlinks = tmpShared(t.TempDir())

	sw.executeSwitch(quota.SwitchRequest{From: "compte1"})

	// The switch must have moved the active account.
	if active := st.ActiveAccount(); active != "compte2" {
		t.Errorf("expected active account compte2, got %q", active)
	}

	// Every session that existed before the switch must have been killed.
	killed := make(map[string]bool, len(mock.killCalls))
	for _, name := range mock.killCalls {
		killed[name] = true
	}
	for _, name := range preexisting {
		if !killed[name] {
			t.Errorf("expected %q to be killed", name)
		}
	}

	// The minimum autonomous pool (Min: 2) must have been created again.
	created := make(map[string]bool, len(mock.createCalls))
	for _, name := range mock.createCalls {
		created[name] = true
	}
	if !(created["ccl-auto-0"] && created["ccl-auto-1"]) {
		t.Errorf("expected autonomous sessions recreated; createCalls=%v", mock.createCalls)
	}
}
// TestDedicatedRelaunchAfterSwap checks that, after a swap away from compte1,
// the dedicated session receives a send-keys command containing `--resume`,
// the UUID that was visible in its pane before the swap, and
// CLAUDE_CONFIG_DIR set to the target account's home. This is what lets
// interactive work in a dedicated session carry over to the new account.
func TestDedicatedRelaunchAfterSwap(t *testing.T) {
	mock := newMockTmux()
	mock.sessions["dedicated-1"] = true
	// The pane shows a complete resume command; the switcher is expected to
	// pick the UUID out of it before the session is restarted.
	mock.paneOutput["dedicated-1"] = "claude --resume aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee --dangerously-skip-permissions"

	st := state.New("")
	st.SetActiveAccount("compte1")

	sourceHome := filepath.Join(t.TempDir(), "claude-1-xxxx")
	targetHome := filepath.Join(t.TempDir(), "claude-2-xxxx")
	cfg := &config.Config{
		Accounts: []config.AccountConfig{
			{Name: "compte1", Home: sourceHome},
			{Name: "compte2", Home: targetHome},
		},
		Pool: config.PoolConfig{
			Dedicated:  []config.DedicatedSession{{Name: "dedicated-1", Project: "/tmp"}},
			Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0},
		},
	}

	sw := New(mock, st, cfg, make(chan quota.SwitchRequest), nil)
	// Keep both the ~/.claude flip and the shared targets inside tmpdirs.
	sw.homeDir = t.TempDir()
	sw.sharedSymlinks = tmpShared(t.TempDir())

	sw.executeSwitch(quota.SwitchRequest{From: "compte1"})

	// Locate the first send-keys entry addressed to the dedicated session
	// that carries a resume command.
	relaunch := ""
	for _, call := range mock.sendKeyCalls {
		if !strings.HasPrefix(call, "dedicated-1:") || !strings.Contains(call, "--resume") {
			continue
		}
		relaunch = call
		break
	}
	if relaunch == "" {
		t.Fatalf("expected dedicated-1 relaunch send-keys; got %v", mock.sendKeyCalls)
	}

	wantEnv := "CLAUDE_CONFIG_DIR=" + targetHome
	if !strings.Contains(relaunch, wantEnv) {
		t.Errorf("relaunch should set CLAUDE_CONFIG_DIR to target home; got %q", relaunch)
	}
	if !strings.Contains(relaunch, "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") {
		t.Errorf("relaunch should include captured UUID; got %q", relaunch)
	}
}
// TestFlipReconcilesSharedSymlinksOnTargetHome covers A3: once the main
// ~/.claude flip is done, the switcher must reconcile the three shared-state
// symlinks (session-env, file-history, projects) on the TARGET account home.
//
// Scenario: the target home holds none of the links yet, as for a freshly
// provisioned account that has never been switched into. After the switch,
// each link must exist inside the target home, be a symlink, and point at the
// matching target from the tmpdir-scoped shared list.
func TestFlipReconcilesSharedSymlinksOnTargetHome(t *testing.T) {
	mock := newMockTmux()
	st := state.New("")
	st.SetActiveAccount("compte1")

	// The target home path is not created here; the switch is expected to
	// create whatever it needs.
	targetHome := filepath.Join(t.TempDir(), "claude-compte2")
	sourceHome := filepath.Join(t.TempDir(), "claude-compte1")
	cfg := &config.Config{
		Accounts: []config.AccountConfig{
			{Name: "compte1", Home: sourceHome},
			{Name: "compte2", Home: targetHome},
		},
		Pool: config.PoolConfig{
			Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0},
		},
	}

	sw := New(mock, st, cfg, make(chan quota.SwitchRequest), nil)
	sw.homeDir = t.TempDir()
	links := tmpShared(t.TempDir())
	sw.sharedSymlinks = links

	// Pre-condition: none of the links may exist in the target home yet.
	for _, sl := range links {
		_, statErr := os.Lstat(filepath.Join(targetHome, sl.Name))
		if os.IsNotExist(statErr) {
			continue
		}
		t.Fatalf("pre-condition: %q should not exist yet (err=%v)", sl.Name, statErr)
	}

	sw.executeSwitch(quota.SwitchRequest{From: "compte1"})

	// verify reports the first problem found with one link and stops there,
	// so a missing link does not also produce readlink noise.
	verify := func(sl symlinks.SharedSymlink) {
		linkPath := filepath.Join(targetHome, sl.Name)
		info, err := os.Lstat(linkPath)
		if err != nil {
			t.Errorf("expected link at %s after flip: %v", linkPath, err)
			return
		}
		if info.Mode()&os.ModeSymlink == 0 {
			t.Errorf("%s exists but is not a symlink", linkPath)
			return
		}
		dest, err := os.Readlink(linkPath)
		if err != nil {
			t.Errorf("readlink %s: %v", linkPath, err)
			return
		}
		if dest != sl.Target {
			t.Errorf("link %s points to %q, want %q", linkPath, dest, sl.Target)
		}
	}
	for _, sl := range links {
		verify(sl)
	}
}
// TestFlipEnsureFailureTriggersRollback guards the fix for the A3
// flip+ensure inconsistency. When reconciling the shared symlinks on the
// target home fails after the ~/.claude flip, the switcher must not mark the
// target account active; it must point ~/.claude back at the previous
// account's home so the daemon stays in its pre-swap state.
//
// Previous (buggy) behaviour, per the fix description: the ensure error was
// only logged as a warning, SetActiveAccount still ran, and dedicated
// sessions were relaunched against a target whose projects / session-env /
// file-history links were missing or divergent, which duplicated transcripts
// silently and broke resume.
//
// Asserted here: executeSwitchE returns a non-nil error that is not
// ErrPartialSwap, IsPartialSwap stays false, the active account is still
// compte1, and ~/.claude resolves to the previous home.
// NOTE(review): the error's text is not inspected, so nothing here checks
// that it mentions the ensure step or wraps the symlinks error.
func TestFlipEnsureFailureTriggersRollback(t *testing.T) {
	mock := newMockTmux()
	st := state.New("")
	st.SetActiveAccount("compte1")

	previousHome := filepath.Join(t.TempDir(), "claude-compte1")
	targetHome := filepath.Join(t.TempDir(), "claude-compte2")
	mkErr := os.MkdirAll(previousHome, 0700)
	if mkErr != nil {
		t.Fatalf("mkdir previous home: %v", mkErr)
	}
	mkErr = os.MkdirAll(targetHome, 0700)
	if mkErr != nil {
		t.Fatalf("mkdir target home: %v", mkErr)
	}

	// Plant <targetHome>/session-env as a symlink to an unrelated directory.
	// The symlinks package is expected to refuse to repoint such a divergent
	// link (data-loss safeguard) and return an error instead, which is the
	// failure this test needs.
	bogus := filepath.Join(t.TempDir(), "somewhere-else")
	mkErr = os.MkdirAll(bogus, 0700)
	if mkErr != nil {
		t.Fatalf("mkdir bogus: %v", mkErr)
	}
	divergent := filepath.Join(targetHome, "session-env")
	if linkErr := os.Symlink(bogus, divergent); linkErr != nil {
		t.Fatalf("plant divergent link: %v", linkErr)
	}

	cfg := &config.Config{
		Accounts: []config.AccountConfig{
			{Name: "compte1", Home: previousHome},
			{Name: "compte2", Home: targetHome},
		},
		Pool: config.PoolConfig{
			Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0},
		},
	}

	sw := New(mock, st, cfg, make(chan quota.SwitchRequest), nil)
	homeDir := t.TempDir()
	sw.homeDir = homeDir
	sw.sharedSymlinks = tmpShared(t.TempDir())

	swapErr := sw.executeSwitchE(quota.SwitchRequest{From: "compte1"})
	if swapErr == nil {
		t.Fatalf("executeSwitchE: expected cancellation error, got nil")
	}

	// A successful rollback is a recoverable condition: neither the
	// ErrPartialSwap sentinel nor the sticky flag may be set.
	if errors.Is(swapErr, ErrPartialSwap) {
		t.Errorf("did not expect ErrPartialSwap; rollback succeeded; got %v", swapErr)
	}
	if sw.IsPartialSwap() {
		t.Errorf("IsPartialSwap should be false when rollback succeeds")
	}

	// The recorded active account must be untouched by the failed swap.
	if active := st.ActiveAccount(); active != "compte1" {
		t.Errorf("active account should stay compte1 after rollback; got %q", active)
	}

	// ~/.claude must resolve to the previous home again.
	claudeLink := filepath.Join(homeDir, ".claude")
	dest, rlErr := os.Readlink(claudeLink)
	if rlErr != nil {
		t.Fatalf("readlink ~/.claude: %v", rlErr)
	}
	if dest != previousHome {
		t.Errorf("~/.claude should point at previous home %q after rollback; got %q", previousHome, dest)
	}
}
// TestFlipEnsureAndRollbackFailure covers the double-failure path: when both
// the shared-symlink ensure step AND the rollback flip fail, the switcher
// must set its sticky partial-swap flag and return ErrPartialSwap. In that
// degraded state further swaps are meant to be refused until the operator
// restarts the daemon.
//
// Asserted here: executeSwitchE returns an error matching ErrPartialSwap,
// IsPartialSwap reports true, the active account is still compte1, and a
// second executeSwitchE call also returns an error.
func TestFlipEnsureAndRollbackFailure(t *testing.T) {
	mock := newMockTmux()
	st := state.New("")
	st.SetActiveAccount("compte1")

	previousHome := filepath.Join(t.TempDir(), "claude-compte1")
	targetHome := filepath.Join(t.TempDir(), "claude-compte2")
	mkErr := os.MkdirAll(previousHome, 0700)
	if mkErr != nil {
		t.Fatalf("mkdir previous home: %v", mkErr)
	}
	mkErr = os.MkdirAll(targetHome, 0700)
	if mkErr != nil {
		t.Fatalf("mkdir target home: %v", mkErr)
	}

	// Plant a divergent session-env link in the target home so that the
	// ensure step is expected to fail.
	bogus := filepath.Join(t.TempDir(), "somewhere-else")
	mkErr = os.MkdirAll(bogus, 0700)
	if mkErr != nil {
		t.Fatalf("mkdir bogus: %v", mkErr)
	}
	divergent := filepath.Join(targetHome, "session-env")
	if linkErr := os.Symlink(bogus, divergent); linkErr != nil {
		t.Fatalf("plant divergent link: %v", linkErr)
	}

	cfg := &config.Config{
		Accounts: []config.AccountConfig{
			{Name: "compte1", Home: previousHome},
			{Name: "compte2", Home: targetHome},
		},
		Pool: config.PoolConfig{
			Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0},
		},
	}
	sw := New(mock, st, cfg, make(chan quota.SwitchRequest), nil)

	// Make the rollback flip fail by using a regular file as homeDir: a
	// .claude symlink cannot be created underneath a path that is not a
	// directory (ENOTDIR).
	// NOTE(review): the same bad homeDir is in effect for the forward flip
	// too; confirm against executeSwitchE that the ensure step is still
	// reached in that case.
	badHomeFile := filepath.Join(t.TempDir(), "not-a-dir")
	if writeErr := os.WriteFile(badHomeFile, []byte("block"), 0600); writeErr != nil {
		t.Fatalf("write bad home: %v", writeErr)
	}
	sw.homeDir = badHomeFile
	sw.sharedSymlinks = tmpShared(t.TempDir())

	swapErr := sw.executeSwitchE(quota.SwitchRequest{From: "compte1"})
	if swapErr == nil {
		t.Fatalf("expected ErrPartialSwap, got nil")
	}
	if !errors.Is(swapErr, ErrPartialSwap) {
		t.Errorf("expected ErrPartialSwap, got %v", swapErr)
	}
	if !sw.IsPartialSwap() {
		t.Errorf("IsPartialSwap should be true when both ensure AND rollback fail")
	}

	// The recorded active account must still be the pre-swap one.
	if active := st.ActiveAccount(); active != "compte1" {
		t.Errorf("active account must stay compte1 in partial-swap; got %q", active)
	}

	// A further swap attempt must not succeed while the flag is set.
	// NOTE(review): only a non-nil error is required, so this does not tell
	// a sticky-flag refusal apart from the same ensure/rollback failure
	// happening again.
	retryErr := sw.executeSwitchE(quota.SwitchRequest{From: "compte1"})
	if retryErr == nil {
		t.Errorf("expected subsequent swap to be refused in degraded state")
	}
}