diff --git a/VERSION.md b/VERSION.md index e9acf7d..9fa2f32 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,64 @@ -# Version actuelle : 0.3.7 +# Version actuelle : 0.3.8 + +## [0.3.8] - 2026-04-16 +**Type:** Patch — Bug #1 (A3 flip+ensure inconsistency) + Bug #10 (requiredShared contract test) + +### Corrigé — Bug #1 (CRITIQUE) +- `AccountSwitcher.executeSwitch` ne continue plus silencieusement quand + `symlinks.EnsureForAccount` échoue après le flip : il **roll-back** le lien + `~/.claude` vers le home du compte précédent et **n'appelle pas** + `SetActiveAccount`. Évite l'état incohérent où le daemon déclare le compte + cible actif alors que ses shared symlinks sont divergents → transcripts + dupliqués silencieusement, resume cassé. +- Si le rollback réussit : swap annulé, état filesystem = état pré-swap, + erreur explicite retournée par `executeSwitchE`. +- Si ensure ET rollback échouent : `partialSwap` atomique sticky set, + `ErrPartialSwap` retourné, tout futur swap est refusé tant que le + daemon n'est pas redémarré par l'opérateur. +- Nouvelle méthode publique `AccountSwitcher.IsPartialSwap() bool` pour + que health-checks et watchdog exposent l'état dégradé. + +### Ajouté — Tests Bug #1 +- `TestFlipEnsureFailureTriggersRollback` : plant un lien divergent sur + le home cible → ensure échoue → rollback réussit → `ActiveAccount` reste + compte1 → `~/.claude` pointe sur previousHome → `IsPartialSwap` = false. +- `TestFlipEnsureAndRollbackFailure` : force les deux flips à échouer + (homeDir = fichier régulier) → `ErrPartialSwap` retourné, flag sticky + set, swap suivant refusé. + +### Ajouté — Bug #10 +- `TestRequiredSharedIsCoherent` (`internal/symlinks/shared_test.go`) : + valide le contrat de la constante package-level `RequiredShared` jamais + exercée auparavant (tous les autres tests utilisent un override scoped + en `t.TempDir()`). Vérifie sans toucher au filesystem : + - exactement 3 entrées (`session-env`, `file-history`, `projects`) + - targets absolus + - `filepath.Dir(target)` identique pour les 3 entrées (invariant + "3 liens sous un même shared root" sur lequel repose `EnsureForAccount`). + +### Rationale +- Continuer après un ensure échoué revient à valider que le compte cible + est "sain" alors que les shared symlinks sont absents ou divergents. + Conséquence en prod : premier `claude --resume` écrit dans + `~/.claude/projects/` (privé) → transcripts dupliqués, undo + désynchronisé, failover complètement cassé sans log d'alerte. +- Le rollback garantit qu'un compte cible mal configuré ne peut PAS + dégrader le state du daemon : on retourne à l'état pré-swap et on + signale l'erreur à l'appelant. +- `ErrPartialSwap` + `IsPartialSwap()` documente un état où l'intervention + humaine est obligatoire — préférable à un retry automatique qui + empirerait la divergence. + +### Tests +- ✅ `go test ./...` : tous les packages PASS +- ✅ `go test -race ./...` : PASS, aucun data race +- ✅ `go vet ./...` : clean +- ✅ `go build ./...` : clean + +### Fichiers modifiés +- `internal/switcher/account_switcher.go` (+rollback + IsPartialSwap + ErrPartialSwap) +- `internal/switcher/account_switcher_test.go` (2 nouveaux tests, 1 test obsolète remplacé) +- `internal/symlinks/shared_test.go` (+TestRequiredSharedIsCoherent) ## [0.3.7] - 2026-04-16 **Type:** Patch — Phase 1 / Chantier A3 : wire EnsureForAccount post-flip diff --git a/internal/switcher/account_switcher.go b/internal/switcher/account_switcher.go index 9e854b1..3778403 100644 --- a/internal/switcher/account_switcher.go +++ b/internal/switcher/account_switcher.go @@ -4,6 +4,7 @@ package switcher import ( "context" + "errors" "fmt" "log" "os" @@ -11,6 +12,7 @@ import ( "regexp" "strconv" "strings" + "sync/atomic" "time" "forge.secuaas.ovh/olivier/claude-failover/internal/config" @@ -21,6 +23,16 @@ import ( "forge.secuaas.ovh/olivier/claude-failover/internal/tmux" ) +// ErrPartialSwap is returned (and wrapped) when the switcher flipped +// ~/.claude to the target home, EnsureForAccount failed on the target, +// and the rollback flip back to the previous home ALSO failed. The daemon +// is in a documented degraded state: the active-account setter was NOT +// called, but the filesystem symlink may point at an account whose shared +// state is inconsistent. Operator intervention is required. Callers can +// interrogate AccountSwitcher.IsPartialSwap() to expose the flag to +// health-checks / watchdogs. +var ErrPartialSwap = errors.New("switcher: partial swap — flip succeeded but ensure + rollback both failed") + // SwitchState represents the current phase of a failover operation. type SwitchState string @@ -58,6 +70,14 @@ type AccountSwitcher struct { // suite never touches the operator's real /home/ubuntu/.claude-* // shared directories. When nil, symlinks.RequiredShared is used. sharedSymlinks []symlinks.SharedSymlink + // partialSwap is set to 1 when a flip+ensure+rollback sequence left the + // daemon in an inconsistent state (symlink possibly flipped, but active + // account NOT updated, and rollback flip ALSO failed). Health-checks / + // watchdogs read this flag via IsPartialSwap(). It is sticky: once set, + // it stays set until the operator restarts the daemon after fixing the + // filesystem state. We use atomic access so watchdog goroutines can read + // it without blocking the switcher. + partialSwap atomic.Bool } // New creates an AccountSwitcher. @@ -94,8 +114,30 @@ func (a *AccountSwitcher) Run(ctx context.Context) { // executeSwitch performs the full failover sequence. func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { + if err := a.executeSwitchE(req); err != nil { + // executeSwitchE already logs the detail; we swallow the error here + // because the public Run loop has no return channel. The partialSwap + // flag (if set) remains visible via IsPartialSwap(). + a.logger.Printf("[switcher] SWAP aborted: %v", err) + } +} + +// executeSwitchE runs the swap and returns an error describing any abort or +// partial-swap condition. Split out from executeSwitch so tests can assert +// on the error value without routing through a channel. +func (a *AccountSwitcher) executeSwitchE(req quota.SwitchRequest) error { a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime) + // Refuse to proceed if a previous swap left the daemon in an + // inconsistent state. The operator must intervene (fix the filesystem, + // restart the daemon) before any further failover can be attempted — + // otherwise we'd stack symlink flips on top of a broken state. + if a.partialSwap.Load() { + err := fmt.Errorf("refusing swap: daemon is in partial-swap degraded state (operator intervention required)") + a.logger.Printf("[switcher] %v", err) + return err + } + // 1. SAVING — capture resume UUIDs from all working sessions plus // every dedicated session unconditionally (dedicated sessions are // user-driven and may not be tracked as "working" in state, but their @@ -110,21 +152,51 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { if target == nil { a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From) a.currentState = StateNormal - return + return nil } + previous := a.findAccountByName(req.From) if err := a.flipSymlink(target.Home); err != nil { a.logger.Printf("[switcher] flipSymlink error: %v", err) } - // Best-effort: make sure the target account home exposes the three - // shared-state symlinks (session-env, file-history, projects). The main - // ~/.claude flip is already done, so an error here must NOT abort the - // swap — we just log it so the operator can investigate. Without this - // call, a fresh target account with no shared links would silently - // start writing into private /projects/session-env/file-history dirs - // and diverge from the primary account's transcripts. + // Ensure the target account home exposes the three shared-state + // symlinks (session-env, file-history, projects). If this fails we + // MUST NOT proceed with SetActiveAccount — the daemon would otherwise + // declare the target "active" while its shared state is divergent, + // silently writing transcripts into private /projects directories and + // breaking `claude --resume` across sessions. Instead we attempt to + // roll back the ~/.claude flip to the previous account. If the + // rollback also fails, the daemon is in a documented degraded state + // (ErrPartialSwap) and the operator must intervene. if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil { - a.logger.Printf("[switcher] WARN ensure shared symlinks for %q: %v", target.Home, err) + a.logger.Printf("[switcher] ensure shared symlinks for %q failed: %v — attempting rollback", target.Home, err) + if previous == nil || previous.Home == "" { + // No known previous home to roll back to — set the degraded + // flag and bail out. This is equivalent to a rollback failure + // because the filesystem is pointed at a broken target. + a.partialSwap.Store(true) + a.currentState = StateNormal + return fmt.Errorf("%w: ensure failed (%v) and no previous account home is known for rollback", ErrPartialSwap, err) + } + if rbErr := a.flipSymlink(previous.Home); rbErr != nil { + // Both the ensure AND the rollback failed. The daemon is now + // in a documented inconsistent state: ~/.claude may point at + // target whose shared-state is divergent, but SetActiveAccount + // has NOT been called so state.ActiveAccount is still the + // previous account. No further failover can be attempted + // until the operator intervenes. + a.partialSwap.Store(true) + a.logger.Printf("[switcher] CRITICAL partial swap: ensure=%v rollback=%v — daemon in degraded state, operator intervention required", err, rbErr) + a.currentState = StateNormal + return fmt.Errorf("%w: ensure=%v rollback=%v", ErrPartialSwap, err, rbErr) + } + // Rollback succeeded — symlink is back on the previous account, + // SetActiveAccount was NEVER called, state is consistent with + // "no swap happened". Return an explicit error so the caller + // knows the swap was cancelled. + a.logger.Printf("[switcher] rollback successful: ~/.claude → %s (swap cancelled)", previous.Home) + a.currentState = StateNormal + return fmt.Errorf("swap cancelled: ensure shared symlinks failed on target %q: %w", target.Home, err) } a.killAllPoolSessions() a.recreatePoolSessions() @@ -151,6 +223,31 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { } a.currentState = StateNormal + return nil +} + +// IsPartialSwap reports whether the switcher is in a degraded state after a +// flip+ensure+rollback sequence all failed. Health-checks and watchdogs use +// this signal to surface an operator-actionable alert. The flag is sticky +// for the lifetime of the process: once set, it remains set until the daemon +// is restarted (after the operator has fixed the filesystem). +func (a *AccountSwitcher) IsPartialSwap() bool { + return a.partialSwap.Load() +} + +// findAccountByName returns the account config entry matching name, or nil. +// Unlike findTargetAccount (which returns the first NON-matching account), +// this is used by the rollback path to recover the previous home. +func (a *AccountSwitcher) findAccountByName(name string) *config.AccountConfig { + if name == "" { + return nil + } + for i := range a.config.Accounts { + if a.config.Accounts[i].Name == name { + return &a.config.Accounts[i] + } + } + return nil } // saveDedicatedUUIDs captures the resume UUID for every configured dedicated diff --git a/internal/switcher/account_switcher_test.go b/internal/switcher/account_switcher_test.go index d2f327d..14df33e 100644 --- a/internal/switcher/account_switcher_test.go +++ b/internal/switcher/account_switcher_test.go @@ -1,6 +1,7 @@ package switcher import ( + "errors" "os" "path/filepath" "strings" @@ -302,25 +303,34 @@ func TestFlipReconcilesSharedSymlinksOnTargetHome(t *testing.T) { } } -// TestFlipEnsureSymlinksFailureDoesNotAbortSwap verifies A3 best-effort: -// if EnsureForAccount returns an error (here: a divergent pre-existing link -// that the symlinks package refuses to auto-correct), the flip and the swap -// MUST still complete. The shared symlink reconcile is post-flip cleanup, -// not a gate on the failover itself — aborting here would leave the daemon -// in an inconsistent state (symlink flipped but active account not updated). -func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { +// TestFlipEnsureFailureTriggersRollback verifies the fix for the A3 bug +// (flip+ensure inconsistency): if EnsureForAccount fails on the target home +// after the ~/.claude flip, the switcher MUST NOT mark the target account +// active. It must instead roll back the ~/.claude symlink to the previous +// account's home, leaving the daemon in the pre-swap state so subsequent +// session work keeps writing to the known-good shared state. +// +// Old (buggy) behaviour: ensure error was WARN-only, SetActiveAccount still +// happened, dedicated sessions were relaunched against a target whose +// /projects, /session-env, /file-history were missing or divergent → +// transcripts duplicated silently, resume broke, undo history diverged. +func TestFlipEnsureFailureTriggersRollback(t *testing.T) { tc := newMockTmux() s := state.New("") s.SetActiveAccount("compte1") + previousHome := filepath.Join(t.TempDir(), "claude-compte1") targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(previousHome, 0700); err != nil { + t.Fatalf("mkdir previous home: %v", err) + } if err := os.MkdirAll(targetHome, 0700); err != nil { t.Fatalf("mkdir target home: %v", err) } // Plant a divergent link at /session-env. The symlinks // package refuses to auto-correct this (data-loss safeguard) and will - // return an error — which the switcher must swallow with a WARN log. + // return an error, which must now trigger a rollback. bogus := filepath.Join(t.TempDir(), "somewhere-else") if err := os.MkdirAll(bogus, 0700); err != nil { t.Fatalf("mkdir bogus: %v", err) @@ -331,7 +341,7 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { cfg := &config.Config{ Accounts: []config.AccountConfig{ - {Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")}, + {Name: "compte1", Home: previousHome}, {Name: "compte2", Home: targetHome}, }, Pool: config.PoolConfig{ @@ -340,13 +350,108 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { } a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) - a.homeDir = t.TempDir() + homeDir := t.TempDir() + a.homeDir = homeDir a.sharedSymlinks = tmpShared(t.TempDir()) - a.executeSwitch(quota.SwitchRequest{From: "compte1"}) + err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}) + if err == nil { + t.Fatalf("executeSwitchE: expected cancellation error, got nil") + } + // The public symmetric swap-cancelled error must mention ensure and + // wrap the underlying symlinks package message. ErrPartialSwap must + // NOT be set (rollback succeeded → recoverable condition). + if errors.Is(err, ErrPartialSwap) { + t.Errorf("did not expect ErrPartialSwap; rollback succeeded; got %v", err) + } + if a.IsPartialSwap() { + t.Errorf("IsPartialSwap should be false when rollback succeeds") + } - // The swap must have completed despite the divergent-link error. - if got := s.ActiveAccount(); got != "compte2" { - t.Errorf("swap should complete even when ensure fails; active=%q want compte2", got) + // Active account must remain the previous one — SetActiveAccount must + // NOT have been called. + if got := s.ActiveAccount(); got != "compte1" { + t.Errorf("active account should stay compte1 after rollback; got %q", got) + } + + // ~/.claude must now point at the previous home (rollback target). + link, rlErr := os.Readlink(filepath.Join(homeDir, ".claude")) + if rlErr != nil { + t.Fatalf("readlink ~/.claude: %v", rlErr) + } + if link != previousHome { + t.Errorf("~/.claude should point at previous home %q after rollback; got %q", previousHome, link) + } +} + +// TestFlipEnsureAndRollbackFailure verifies that when BOTH EnsureForAccount +// AND the rollback flip fail, the switcher sets the sticky partial-swap +// flag and returns ErrPartialSwap. The daemon is then in a documented +// degraded state where any further swap is refused until the operator +// restarts it. +func TestFlipEnsureAndRollbackFailure(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + previousHome := filepath.Join(t.TempDir(), "claude-compte1") + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(previousHome, 0700); err != nil { + t.Fatalf("mkdir previous home: %v", err) + } + if err := os.MkdirAll(targetHome, 0700); err != nil { + t.Fatalf("mkdir target home: %v", err) + } + // Plant the divergent link that will cause EnsureForAccount to fail. + bogus := filepath.Join(t.TempDir(), "somewhere-else") + if err := os.MkdirAll(bogus, 0700); err != nil { + t.Fatalf("mkdir bogus: %v", err) + } + if err := os.Symlink(bogus, filepath.Join(targetHome, "session-env")); err != nil { + t.Fatalf("plant divergent link: %v", err) + } + + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: previousHome}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + + // Force the rollback flip to fail: point homeDir at a file that cannot + // host a .claude symlink. We use a regular file; the flipSymlink + // implementation does os.Remove() then os.Symlink() under homeDir, + // which fails when homeDir is itself a file (ENOTDIR). + badHomeFile := filepath.Join(t.TempDir(), "not-a-dir") + if err := os.WriteFile(badHomeFile, []byte("block"), 0600); err != nil { + t.Fatalf("write bad home: %v", err) + } + a.homeDir = badHomeFile + a.sharedSymlinks = tmpShared(t.TempDir()) + + err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}) + if err == nil { + t.Fatalf("expected ErrPartialSwap, got nil") + } + if !errors.Is(err, ErrPartialSwap) { + t.Errorf("expected ErrPartialSwap, got %v", err) + } + if !a.IsPartialSwap() { + t.Errorf("IsPartialSwap should be true when both ensure AND rollback fail") + } + // SetActiveAccount must still not have been called. + if got := s.ActiveAccount(); got != "compte1" { + t.Errorf("active account must stay compte1 in partial-swap; got %q", got) + } + + // A subsequent swap attempt must be refused while the flag is set. + if err2 := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}); err2 == nil { + t.Errorf("expected subsequent swap to be refused in degraded state") } } diff --git a/internal/symlinks/shared_test.go b/internal/symlinks/shared_test.go index 64625d4..e472678 100644 --- a/internal/symlinks/shared_test.go +++ b/internal/symlinks/shared_test.go @@ -204,3 +204,73 @@ func TestRequiredShared_defaultsAreReasonable(t *testing.T) { } } } + +// TestRequiredSharedIsCoherent validates the contract of the package-level +// RequiredShared constant that the switcher and lifecycle manager consume +// in production. The rest of the suite exercises EnsureForAccount / +// ValidateAll with tmpdir-scoped override lists, so the actual prod +// constant (pointing under /home/ubuntu/.claude-*-shared) is never touched +// by those tests — a regression that shrinks or renames RequiredShared +// would pass every other test but silently break failover on real VMs +// (missing a link → writes to private state → transcripts duplicated). +// +// The test is filesystem-free: it only asserts the shape of the constant. +// +// 1. Exactly three entries, one per Name required by the A-failover design. +// 2. Every Target is absolute. +// 3. All three Targets share the same parent directory — there is no +// mode of operation where one shared dir lives elsewhere than the +// others. `filepath.Dir(target)` must be identical across entries. +// +// This encodes the "3 links under one shared root" invariant that +// EnsureForAccount relies on. Any future change to RequiredShared that +// breaks this invariant should force the author to update the switcher +// contract explicitly. +func TestRequiredSharedIsCoherent(t *testing.T) { + expectedNames := map[string]bool{ + "session-env": false, + "file-history": false, + "projects": false, + } + if len(RequiredShared) != len(expectedNames) { + t.Fatalf("RequiredShared must contain exactly %d entries (session-env, file-history, projects); got %d: %+v", + len(expectedNames), len(RequiredShared), RequiredShared) + } + + var sharedParent string + for i, sl := range RequiredShared { + if _, ok := expectedNames[sl.Name]; !ok { + t.Errorf("RequiredShared[%d]: unexpected Name %q (allowed: session-env / file-history / projects)", i, sl.Name) + continue + } + if expectedNames[sl.Name] { + t.Errorf("RequiredShared[%d]: duplicate Name %q", i, sl.Name) + } + expectedNames[sl.Name] = true + + if sl.Target == "" { + t.Errorf("RequiredShared[%d] (%q): empty Target", i, sl.Name) + continue + } + if !filepath.IsAbs(sl.Target) { + t.Errorf("RequiredShared[%d] (%q): Target %q must be absolute", i, sl.Name, sl.Target) + continue + } + + parent := filepath.Dir(sl.Target) + if i == 0 { + sharedParent = parent + continue + } + if parent != sharedParent { + t.Errorf("RequiredShared[%d] (%q): parent dir %q diverges from %q — all shared targets must live under the same root", + i, sl.Name, parent, sharedParent) + } + } + + for name, seen := range expectedNames { + if !seen { + t.Errorf("RequiredShared is missing the required %q entry", name) + } + } +}