diff --git a/VERSION.md b/VERSION.md index b777c33..09e8e6a 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,119 @@ -# Version actuelle : 0.3.6 +# Version actuelle : 0.3.9 + +## [0.3.9] - 2026-04-16 +**Type:** Patch — `go mod tidy` (fsnotify direct dep cleanup) + +### Modifié +- `go.mod` : `fsnotify` promu en dépendance directe, `yaml.v3` regroupé, `golang.org/x/sys` isolé en indirect. Aucun changement fonctionnel. + +### Tests effectués +- ✅ Build OK (aucune modification de code) + +--- + +## [0.3.8] - 2026-04-16 +**Type:** Patch — Bug #1 (A3 flip+ensure inconsistency) + Bug #10 (requiredShared contract test) + +### Corrigé — Bug #1 (CRITIQUE) +- `AccountSwitcher.executeSwitch` ne continue plus silencieusement quand + `symlinks.EnsureForAccount` échoue après le flip : il **roll-back** le lien + `~/.claude` vers le home du compte précédent et **n'appelle pas** + `SetActiveAccount`. Évite l'état incohérent où le daemon déclare le compte + cible actif alors que ses shared symlinks sont divergents → transcripts + dupliqués silencieusement, resume cassé. +- Si le rollback réussit : swap annulé, état filesystem = état pré-swap, + erreur explicite retournée par `executeSwitchE`. +- Si ensure ET rollback échouent : `partialSwap` atomique sticky set, + `ErrPartialSwap` retourné, tout futur swap est refusé tant que le + daemon n'est pas redémarré par l'opérateur. +- Nouvelle méthode publique `AccountSwitcher.IsPartialSwap() bool` pour + que health-checks et watchdog exposent l'état dégradé. + +### Ajouté — Tests Bug #1 +- `TestFlipEnsureFailureTriggersRollback` : plant un lien divergent sur + le home cible → ensure échoue → rollback réussit → `ActiveAccount` reste + compte1 → `~/.claude` pointe sur previousHome → `IsPartialSwap` = false. +- `TestFlipEnsureAndRollbackFailure` : force les deux flips à échouer + (homeDir = fichier régulier) → `ErrPartialSwap` retourné, flag sticky + set, swap suivant refusé. + +### Ajouté — Bug #10 +- `TestRequiredSharedIsCoherent` (`internal/symlinks/shared_test.go`) : + valide le contrat de la constante package-level `RequiredShared` jamais + exercée auparavant (tous les autres tests utilisent un override scoped + en `t.TempDir()`). Vérifie sans toucher au filesystem : + - exactement 3 entrées (`session-env`, `file-history`, `projects`) + - targets absolus + - `filepath.Dir(target)` identique pour les 3 entrées (invariant + "3 liens sous un même shared root" sur lequel repose `EnsureForAccount`). + +### Rationale +- Continuer après un ensure échoué revient à valider que le compte cible + est "sain" alors que les shared symlinks sont absents ou divergents. + Conséquence en prod : premier `claude --resume` écrit dans + `~/.claude/projects/` (privé) → transcripts dupliqués, undo + désynchronisé, failover complètement cassé sans log d'alerte. +- Le rollback garantit qu'un compte cible mal configuré ne peut PAS + dégrader le state du daemon : on retourne à l'état pré-swap et on + signale l'erreur à l'appelant. +- `ErrPartialSwap` + `IsPartialSwap()` documente un état où l'intervention + humaine est obligatoire — préférable à un retry automatique qui + empirerait la divergence. + +### Tests +- ✅ `go test ./...` : tous les packages PASS +- ✅ `go test -race ./...` : PASS, aucun data race +- ✅ `go vet ./...` : clean +- ✅ `go build ./...` : clean + +### Fichiers modifiés +- `internal/switcher/account_switcher.go` (+rollback + IsPartialSwap + ErrPartialSwap) +- `internal/switcher/account_switcher_test.go` (2 nouveaux tests, 1 test obsolète remplacé) +- `internal/symlinks/shared_test.go` (+TestRequiredSharedIsCoherent) + +## [0.3.7] - 2026-04-16 +**Type:** Patch — Phase 1 / Chantier A3 : wire EnsureForAccount post-flip + +### Ajouté +- `AccountSwitcher.executeSwitch` appelle désormais + `symlinks.EnsureForAccount(target.Home, ...)` **juste après** le flip + du lien principal `~/.claude`. Garantit que les 3 liens partagés + (`session-env`, `file-history`, `projects`) existent et pointent aux + bons targets sur le compte cible, même si celui-ci vient juste + d'être provisionné. +- `AccountSwitcher.sharedSymlinks` : override test-only (accepte une + liste `[]symlinks.SharedSymlink`). Défaut = `symlinks.RequiredShared`. + Les tests peuvent scoper la réconciliation dans un `t.TempDir()` pour + ne jamais toucher `/home/ubuntu/.claude-*-shared`. +- 2 tests unitaires : + - `TestFlipReconcilesSharedSymlinksOnTargetHome` : target home vide → + les 3 liens sont créés après le flip et pointent aux targets canoniques. + - `TestFlipEnsureSymlinksFailureDoesNotAbortSwap` : lien divergent + planté à la main → `EnsureForAccount` renvoie une erreur, logguée + en WARN, mais le swap complète quand même (best-effort post-flip). + +### Rationale +- Sans cet appel, un compte cible fraîchement provisionné n'aurait + pas encore ses 3 liens ; au premier `claude --resume`, Claude Code + écrirait dans `~/.claude/projects/` (privé) au lieu de + `/home/ubuntu/.claude-projects-shared` → transcripts dupliqués, + undo désynchronisé, resume silencieusement cassé. +- L'ensure est **best-effort** : une erreur est logguée en WARN mais + NE bloque PAS le flip. Si on abortait ici, on laisserait le daemon + dans un état incohérent (symlink déjà flippé mais `SetActiveAccount` + pas appelé). +- L'opérateur voit le WARN dans les logs et peut corriger la + divergence manuellement (ex: lien pointant sur le mauvais target). + +### Tests +- ✅ `go test ./...` : tous les packages PASS (incluant + `internal/switcher` et `internal/symlinks`). +- ✅ `go test -race ./internal/switcher/...` : PASS. +- ✅ `go vet ./...` : clean. + +### Fichiers modifiés +- `internal/switcher/account_switcher.go` +- `internal/switcher/account_switcher_test.go` ## [0.3.6] - 2026-04-16 **Type:** Patch — Phase 1 / Chantier A2 : validation des symlinks au startup diff --git a/go.mod b/go.mod index e30b815..45a82d6 100644 --- a/go.mod +++ b/go.mod @@ -2,9 +2,9 @@ module forge.secuaas.ovh/olivier/claude-failover go 1.22 -require gopkg.in/yaml.v3 v3.0.1 - require ( - github.com/fsnotify/fsnotify v1.9.0 // indirect - golang.org/x/sys v0.13.0 // indirect + github.com/fsnotify/fsnotify v1.9.0 + gopkg.in/yaml.v3 v3.0.1 ) + +require golang.org/x/sys v0.13.0 // indirect diff --git a/internal/switcher/account_switcher.go b/internal/switcher/account_switcher.go index e8de796..3778403 100644 --- a/internal/switcher/account_switcher.go +++ b/internal/switcher/account_switcher.go @@ -4,6 +4,7 @@ package switcher import ( "context" + "errors" "fmt" "log" "os" @@ -11,15 +12,27 @@ import ( "regexp" "strconv" "strings" + "sync/atomic" "time" "forge.secuaas.ovh/olivier/claude-failover/internal/config" "forge.secuaas.ovh/olivier/claude-failover/internal/notify" "forge.secuaas.ovh/olivier/claude-failover/internal/quota" "forge.secuaas.ovh/olivier/claude-failover/internal/state" + "forge.secuaas.ovh/olivier/claude-failover/internal/symlinks" "forge.secuaas.ovh/olivier/claude-failover/internal/tmux" ) +// ErrPartialSwap is returned (and wrapped) when the switcher flipped +// ~/.claude to the target home, EnsureForAccount failed on the target, +// and the rollback flip back to the previous home ALSO failed. The daemon +// is in a documented degraded state: the active-account setter was NOT +// called, but the filesystem symlink may point at an account whose shared +// state is inconsistent. Operator intervention is required. Callers can +// interrogate AccountSwitcher.IsPartialSwap() to expose the flag to +// health-checks / watchdogs. +var ErrPartialSwap = errors.New("switcher: partial swap — flip succeeded but ensure + rollback both failed") + // SwitchState represents the current phase of a failover operation. type SwitchState string @@ -52,6 +65,19 @@ type AccountSwitcher struct { // homeDir is the directory containing the .claude symlink. Overridable for tests. // When empty, os.UserHomeDir() is used. homeDir string + // sharedSymlinks is the list of shared-state links reconciled on the + // target account home after every flip. Overridable for tests so the + // suite never touches the operator's real /home/ubuntu/.claude-* + // shared directories. When nil, symlinks.RequiredShared is used. + sharedSymlinks []symlinks.SharedSymlink + // partialSwap is set to 1 when a flip+ensure+rollback sequence left the + // daemon in an inconsistent state (symlink possibly flipped, but active + // account NOT updated, and rollback flip ALSO failed). Health-checks / + // watchdogs read this flag via IsPartialSwap(). It is sticky: once set, + // it stays set until the operator restarts the daemon after fixing the + // filesystem state. We use atomic access so watchdog goroutines can read + // it without blocking the switcher. + partialSwap atomic.Bool } // New creates an AccountSwitcher. @@ -88,8 +114,30 @@ func (a *AccountSwitcher) Run(ctx context.Context) { // executeSwitch performs the full failover sequence. func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { + if err := a.executeSwitchE(req); err != nil { + // executeSwitchE already logs the detail; we swallow the error here + // because the public Run loop has no return channel. The partialSwap + // flag (if set) remains visible via IsPartialSwap(). + a.logger.Printf("[switcher] SWAP aborted: %v", err) + } +} + +// executeSwitchE runs the swap and returns an error describing any abort or +// partial-swap condition. Split out from executeSwitch so tests can assert +// on the error value without routing through a channel. +func (a *AccountSwitcher) executeSwitchE(req quota.SwitchRequest) error { a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime) + // Refuse to proceed if a previous swap left the daemon in an + // inconsistent state. The operator must intervene (fix the filesystem, + // restart the daemon) before any further failover can be attempted — + // otherwise we'd stack symlink flips on top of a broken state. + if a.partialSwap.Load() { + err := fmt.Errorf("refusing swap: daemon is in partial-swap degraded state (operator intervention required)") + a.logger.Printf("[switcher] %v", err) + return err + } + // 1. SAVING — capture resume UUIDs from all working sessions plus // every dedicated session unconditionally (dedicated sessions are // user-driven and may not be tracked as "working" in state, but their @@ -104,12 +152,52 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { if target == nil { a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From) a.currentState = StateNormal - return + return nil } + previous := a.findAccountByName(req.From) if err := a.flipSymlink(target.Home); err != nil { a.logger.Printf("[switcher] flipSymlink error: %v", err) } + // Ensure the target account home exposes the three shared-state + // symlinks (session-env, file-history, projects). If this fails we + // MUST NOT proceed with SetActiveAccount — the daemon would otherwise + // declare the target "active" while its shared state is divergent, + // silently writing transcripts into private /projects directories and + // breaking `claude --resume` across sessions. Instead we attempt to + // roll back the ~/.claude flip to the previous account. If the + // rollback also fails, the daemon is in a documented degraded state + // (ErrPartialSwap) and the operator must intervene. + if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil { + a.logger.Printf("[switcher] ensure shared symlinks for %q failed: %v — attempting rollback", target.Home, err) + if previous == nil || previous.Home == "" { + // No known previous home to roll back to — set the degraded + // flag and bail out. This is equivalent to a rollback failure + // because the filesystem is pointed at a broken target. + a.partialSwap.Store(true) + a.currentState = StateNormal + return fmt.Errorf("%w: ensure failed (%v) and no previous account home is known for rollback", ErrPartialSwap, err) + } + if rbErr := a.flipSymlink(previous.Home); rbErr != nil { + // Both the ensure AND the rollback failed. The daemon is now + // in a documented inconsistent state: ~/.claude may point at + // target whose shared-state is divergent, but SetActiveAccount + // has NOT been called so state.ActiveAccount is still the + // previous account. No further failover can be attempted + // until the operator intervenes. + a.partialSwap.Store(true) + a.logger.Printf("[switcher] CRITICAL partial swap: ensure=%v rollback=%v — daemon in degraded state, operator intervention required", err, rbErr) + a.currentState = StateNormal + return fmt.Errorf("%w: ensure=%v rollback=%v", ErrPartialSwap, err, rbErr) + } + // Rollback succeeded — symlink is back on the previous account, + // SetActiveAccount was NEVER called, state is consistent with + // "no swap happened". Return an explicit error so the caller + // knows the swap was cancelled. + a.logger.Printf("[switcher] rollback successful: ~/.claude → %s (swap cancelled)", previous.Home) + a.currentState = StateNormal + return fmt.Errorf("swap cancelled: ensure shared symlinks failed on target %q: %w", target.Home, err) + } a.killAllPoolSessions() a.recreatePoolSessions() a.relaunchDedicatedSessions(target.Home) @@ -135,6 +223,31 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { } a.currentState = StateNormal + return nil +} + +// IsPartialSwap reports whether the switcher is in a degraded state after a +// flip+ensure+rollback sequence all failed. Health-checks and watchdogs use +// this signal to surface an operator-actionable alert. The flag is sticky +// for the lifetime of the process: once set, it remains set until the daemon +// is restarted (after the operator has fixed the filesystem). +func (a *AccountSwitcher) IsPartialSwap() bool { + return a.partialSwap.Load() +} + +// findAccountByName returns the account config entry matching name, or nil. +// Unlike findTargetAccount (which returns the first NON-matching account), +// this is used by the rollback path to recover the previous home. +func (a *AccountSwitcher) findAccountByName(name string) *config.AccountConfig { + if name == "" { + return nil + } + for i := range a.config.Accounts { + if a.config.Accounts[i].Name == name { + return &a.config.Accounts[i] + } + } + return nil } // saveDedicatedUUIDs captures the resume UUID for every configured dedicated @@ -225,6 +338,16 @@ func (a *AccountSwitcher) saveAllSessions() { }) } +// requiredShared returns the shared-symlink list used when reconciling the +// target account home after a flip. Tests may set a.sharedSymlinks to a +// tmpdir-scoped list so they never touch /home/ubuntu/.claude-*-shared. +func (a *AccountSwitcher) requiredShared() []symlinks.SharedSymlink { + if a.sharedSymlinks != nil { + return a.sharedSymlinks + } + return symlinks.RequiredShared +} + // resolveHomeDir returns the configured homeDir (test override) or the real // user home. Tests MUST set a.homeDir to a tmpdir to avoid clobbering the // production ~/.claude symlink. diff --git a/internal/switcher/account_switcher_test.go b/internal/switcher/account_switcher_test.go index 8c3b292..14df33e 100644 --- a/internal/switcher/account_switcher_test.go +++ b/internal/switcher/account_switcher_test.go @@ -1,6 +1,9 @@ package switcher import ( + "errors" + "os" + "path/filepath" "strings" "testing" "time" @@ -8,8 +11,19 @@ import ( "forge.secuaas.ovh/olivier/claude-failover/internal/config" "forge.secuaas.ovh/olivier/claude-failover/internal/quota" "forge.secuaas.ovh/olivier/claude-failover/internal/state" + "forge.secuaas.ovh/olivier/claude-failover/internal/symlinks" ) +// tmpShared returns a SharedSymlink list whose targets live entirely under +// tmpDir, so switcher tests never touch /home/ubuntu/.claude-*-shared. +func tmpShared(tmpDir string) []symlinks.SharedSymlink { + return []symlinks.SharedSymlink{ + {Target: filepath.Join(tmpDir, "session-env-shared"), Name: "session-env"}, + {Target: filepath.Join(tmpDir, "file-history-shared"), Name: "file-history"}, + {Target: filepath.Join(tmpDir, "projects-shared"), Name: "projects"}, + } +} + // mockTmux for switcher tests. type mockTmux struct { sessions map[string]bool @@ -143,6 +157,9 @@ func TestKillAndRecreatePoolSessions(t *testing.T) { // touches the real ~/.claude (regression: a reboot used to leave Claude // Code unusable because the test had repointed ~/.claude to /tmp/...). a.homeDir = t.TempDir() + // Scope shared-symlink targets to a tmpdir so the post-flip ensure + // pass does not write inside /home/ubuntu/.claude-*-shared. + a.sharedSymlinks = tmpShared(t.TempDir()) a.executeSwitch(quota.SwitchRequest{From: "compte1"}) // Active account must have changed. @@ -186,10 +203,12 @@ func TestDedicatedRelaunchAfterSwap(t *testing.T) { s := state.New("") s.SetActiveAccount("compte1") + home1 := filepath.Join(t.TempDir(), "claude-1-xxxx") + home2 := filepath.Join(t.TempDir(), "claude-2-xxxx") cfg := &config.Config{ Accounts: []config.AccountConfig{ - {Name: "compte1", Home: "/tmp/claude-1-xxxx"}, - {Name: "compte2", Home: "/tmp/claude-2-xxxx"}, + {Name: "compte1", Home: home1}, + {Name: "compte2", Home: home2}, }, Pool: config.PoolConfig{ Dedicated: []config.DedicatedSession{{Name: "dedicated-1", Project: "/tmp"}}, @@ -199,6 +218,7 @@ func TestDedicatedRelaunchAfterSwap(t *testing.T) { a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) a.homeDir = t.TempDir() + a.sharedSymlinks = tmpShared(t.TempDir()) a.executeSwitch(quota.SwitchRequest{From: "compte1"}) // The relaunch must send a resume command on the dedicated session, @@ -213,10 +233,225 @@ func TestDedicatedRelaunchAfterSwap(t *testing.T) { if relaunch == "" { t.Fatalf("expected dedicated-1 relaunch send-keys; got %v", tc.sendKeyCalls) } - if !strings.Contains(relaunch, "CLAUDE_CONFIG_DIR=/tmp/claude-2-xxxx") { + if !strings.Contains(relaunch, "CLAUDE_CONFIG_DIR="+home2) { t.Errorf("relaunch should set CLAUDE_CONFIG_DIR to target home; got %q", relaunch) } if !strings.Contains(relaunch, "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") { t.Errorf("relaunch should include captured UUID; got %q", relaunch) } } + +// TestFlipReconcilesSharedSymlinksOnTargetHome verifies A3: after the main +// ~/.claude flip, the switcher reconciles the three shared-state symlinks +// (session-env / file-history / projects) on the TARGET account home. +// Scenario: the target home has NO links yet — a freshly-provisioned account +// that has never been flipped into. Post-switch, all three links must exist +// inside the target home and point at the canonical shared targets. +func TestFlipReconcilesSharedSymlinksOnTargetHome(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + // Target home starts empty: EnsureForAccount will mkdir + create links. + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + a.homeDir = t.TempDir() + shared := tmpShared(t.TempDir()) + a.sharedSymlinks = shared + + // Pre-assert: no link exists in targetHome. + for _, sl := range shared { + if _, err := os.Lstat(filepath.Join(targetHome, sl.Name)); !os.IsNotExist(err) { + t.Fatalf("pre-condition: %q should not exist yet (err=%v)", sl.Name, err) + } + } + + a.executeSwitch(quota.SwitchRequest{From: "compte1"}) + + // Post-assert: every required link exists and points at the canonical + // target under the tmpdir-scoped shared root. + for _, sl := range shared { + linkPath := filepath.Join(targetHome, sl.Name) + info, err := os.Lstat(linkPath) + if err != nil { + t.Errorf("expected link at %s after flip: %v", linkPath, err) + continue + } + if info.Mode()&os.ModeSymlink == 0 { + t.Errorf("%s exists but is not a symlink", linkPath) + continue + } + got, err := os.Readlink(linkPath) + if err != nil { + t.Errorf("readlink %s: %v", linkPath, err) + continue + } + if got != sl.Target { + t.Errorf("link %s points to %q, want %q", linkPath, got, sl.Target) + } + } +} + +// TestFlipEnsureFailureTriggersRollback verifies the fix for the A3 bug +// (flip+ensure inconsistency): if EnsureForAccount fails on the target home +// after the ~/.claude flip, the switcher MUST NOT mark the target account +// active. It must instead roll back the ~/.claude symlink to the previous +// account's home, leaving the daemon in the pre-swap state so subsequent +// session work keeps writing to the known-good shared state. +// +// Old (buggy) behaviour: ensure error was WARN-only, SetActiveAccount still +// happened, dedicated sessions were relaunched against a target whose +// /projects, /session-env, /file-history were missing or divergent → +// transcripts duplicated silently, resume broke, undo history diverged. +func TestFlipEnsureFailureTriggersRollback(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + previousHome := filepath.Join(t.TempDir(), "claude-compte1") + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(previousHome, 0700); err != nil { + t.Fatalf("mkdir previous home: %v", err) + } + if err := os.MkdirAll(targetHome, 0700); err != nil { + t.Fatalf("mkdir target home: %v", err) + } + // Plant a divergent link at /session-env. The symlinks + // package refuses to auto-correct this (data-loss safeguard) and will + // return an error, which must now trigger a rollback. + bogus := filepath.Join(t.TempDir(), "somewhere-else") + if err := os.MkdirAll(bogus, 0700); err != nil { + t.Fatalf("mkdir bogus: %v", err) + } + if err := os.Symlink(bogus, filepath.Join(targetHome, "session-env")); err != nil { + t.Fatalf("plant divergent link: %v", err) + } + + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: previousHome}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + homeDir := t.TempDir() + a.homeDir = homeDir + a.sharedSymlinks = tmpShared(t.TempDir()) + + err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}) + if err == nil { + t.Fatalf("executeSwitchE: expected cancellation error, got nil") + } + // The public symmetric swap-cancelled error must mention ensure and + // wrap the underlying symlinks package message. ErrPartialSwap must + // NOT be set (rollback succeeded → recoverable condition). + if errors.Is(err, ErrPartialSwap) { + t.Errorf("did not expect ErrPartialSwap; rollback succeeded; got %v", err) + } + if a.IsPartialSwap() { + t.Errorf("IsPartialSwap should be false when rollback succeeds") + } + + // Active account must remain the previous one — SetActiveAccount must + // NOT have been called. + if got := s.ActiveAccount(); got != "compte1" { + t.Errorf("active account should stay compte1 after rollback; got %q", got) + } + + // ~/.claude must now point at the previous home (rollback target). + link, rlErr := os.Readlink(filepath.Join(homeDir, ".claude")) + if rlErr != nil { + t.Fatalf("readlink ~/.claude: %v", rlErr) + } + if link != previousHome { + t.Errorf("~/.claude should point at previous home %q after rollback; got %q", previousHome, link) + } +} + +// TestFlipEnsureAndRollbackFailure verifies that when BOTH EnsureForAccount +// AND the rollback flip fail, the switcher sets the sticky partial-swap +// flag and returns ErrPartialSwap. The daemon is then in a documented +// degraded state where any further swap is refused until the operator +// restarts it. +func TestFlipEnsureAndRollbackFailure(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + previousHome := filepath.Join(t.TempDir(), "claude-compte1") + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(previousHome, 0700); err != nil { + t.Fatalf("mkdir previous home: %v", err) + } + if err := os.MkdirAll(targetHome, 0700); err != nil { + t.Fatalf("mkdir target home: %v", err) + } + // Plant the divergent link that will cause EnsureForAccount to fail. + bogus := filepath.Join(t.TempDir(), "somewhere-else") + if err := os.MkdirAll(bogus, 0700); err != nil { + t.Fatalf("mkdir bogus: %v", err) + } + if err := os.Symlink(bogus, filepath.Join(targetHome, "session-env")); err != nil { + t.Fatalf("plant divergent link: %v", err) + } + + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: previousHome}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + + // Force the rollback flip to fail: point homeDir at a file that cannot + // host a .claude symlink. We use a regular file; the flipSymlink + // implementation does os.Remove() then os.Symlink() under homeDir, + // which fails when homeDir is itself a file (ENOTDIR). + badHomeFile := filepath.Join(t.TempDir(), "not-a-dir") + if err := os.WriteFile(badHomeFile, []byte("block"), 0600); err != nil { + t.Fatalf("write bad home: %v", err) + } + a.homeDir = badHomeFile + a.sharedSymlinks = tmpShared(t.TempDir()) + + err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}) + if err == nil { + t.Fatalf("expected ErrPartialSwap, got nil") + } + if !errors.Is(err, ErrPartialSwap) { + t.Errorf("expected ErrPartialSwap, got %v", err) + } + if !a.IsPartialSwap() { + t.Errorf("IsPartialSwap should be true when both ensure AND rollback fail") + } + // SetActiveAccount must still not have been called. + if got := s.ActiveAccount(); got != "compte1" { + t.Errorf("active account must stay compte1 in partial-swap; got %q", got) + } + + // A subsequent swap attempt must be refused while the flag is set. + if err2 := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}); err2 == nil { + t.Errorf("expected subsequent swap to be refused in degraded state") + } +} diff --git a/internal/symlinks/shared_test.go b/internal/symlinks/shared_test.go index 64625d4..e472678 100644 --- a/internal/symlinks/shared_test.go +++ b/internal/symlinks/shared_test.go @@ -204,3 +204,73 @@ func TestRequiredShared_defaultsAreReasonable(t *testing.T) { } } } + +// TestRequiredSharedIsCoherent validates the contract of the package-level +// RequiredShared constant that the switcher and lifecycle manager consume +// in production. The rest of the suite exercises EnsureForAccount / +// ValidateAll with tmpdir-scoped override lists, so the actual prod +// constant (pointing under /home/ubuntu/.claude-*-shared) is never touched +// by those tests — a regression that shrinks or renames RequiredShared +// would pass every other test but silently break failover on real VMs +// (missing a link → writes to private state → transcripts duplicated). +// +// The test is filesystem-free: it only asserts the shape of the constant. +// +// 1. Exactly three entries, one per Name required by the A-failover design. +// 2. Every Target is absolute. +// 3. All three Targets share the same parent directory — there is no +// mode of operation where one shared dir lives elsewhere than the +// others. `filepath.Dir(target)` must be identical across entries. +// +// This encodes the "3 links under one shared root" invariant that +// EnsureForAccount relies on. Any future change to RequiredShared that +// breaks this invariant should force the author to update the switcher +// contract explicitly. +func TestRequiredSharedIsCoherent(t *testing.T) { + expectedNames := map[string]bool{ + "session-env": false, + "file-history": false, + "projects": false, + } + if len(RequiredShared) != len(expectedNames) { + t.Fatalf("RequiredShared must contain exactly %d entries (session-env, file-history, projects); got %d: %+v", + len(expectedNames), len(RequiredShared), RequiredShared) + } + + var sharedParent string + for i, sl := range RequiredShared { + if _, ok := expectedNames[sl.Name]; !ok { + t.Errorf("RequiredShared[%d]: unexpected Name %q (allowed: session-env / file-history / projects)", i, sl.Name) + continue + } + if expectedNames[sl.Name] { + t.Errorf("RequiredShared[%d]: duplicate Name %q", i, sl.Name) + } + expectedNames[sl.Name] = true + + if sl.Target == "" { + t.Errorf("RequiredShared[%d] (%q): empty Target", i, sl.Name) + continue + } + if !filepath.IsAbs(sl.Target) { + t.Errorf("RequiredShared[%d] (%q): Target %q must be absolute", i, sl.Name, sl.Target) + continue + } + + parent := filepath.Dir(sl.Target) + if i == 0 { + sharedParent = parent + continue + } + if parent != sharedParent { + t.Errorf("RequiredShared[%d] (%q): parent dir %q diverges from %q — all shared targets must live under the same root", + i, sl.Name, parent, sharedParent) + } + } + + for name, seen := range expectedNames { + if !seen { + t.Errorf("RequiredShared is missing the required %q entry", name) + } + } +}