From 8eaf0bbd359e36c0b0feac5e2e2e22499d3eb663 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Apr 2026 19:34:03 +0000 Subject: [PATCH 1/3] feat(switcher): ensure shared symlinks on target home after flip (A3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire symlinks.EnsureForAccount into executeSwitch, called immediately after the ~/.claude flip. Guarantees the three shared-state links (session-env, file-history, projects) exist on the target account home even for freshly-provisioned accounts, preventing silent transcript duplication and undo-history divergence on first resume. Best-effort: errors are logged as WARN but never abort the swap. If we returned here the daemon would be left inconsistent (symlink flipped, SetActiveAccount never called). Operator sees the warning in logs and resolves divergent links manually. Tests: - TestFlipReconcilesSharedSymlinksOnTargetHome: empty target home gets all three links pointing at canonical targets after the flip. - TestFlipEnsureSymlinksFailureDoesNotAbortSwap: a planted divergent link triggers the symlinks-package error; the swap completes anyway and the active account is updated. Hermetic: added AccountSwitcher.sharedSymlinks override so tests scope the reconcile inside t.TempDir() and never touch /home/ubuntu/.claude-*-shared. Existing tests migrated to this pattern and hardcoded /tmp/claude-*-xxxx paths replaced with tmpdirs. Phase 1 / Chantier A — task A3. --- VERSION.md | 46 ++++++- internal/switcher/account_switcher.go | 26 ++++ internal/switcher/account_switcher_test.go | 136 ++++++++++++++++++++- 3 files changed, 204 insertions(+), 4 deletions(-) diff --git a/VERSION.md b/VERSION.md index b777c33..e9acf7d 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,48 @@ -# Version actuelle : 0.3.6 +# Version actuelle : 0.3.7 + +## [0.3.7] - 2026-04-16 +**Type:** Patch — Phase 1 / Chantier A3 : wire EnsureForAccount post-flip + +### Ajouté +- `AccountSwitcher.executeSwitch` appelle désormais + `symlinks.EnsureForAccount(target.Home, ...)` **juste après** le flip + du lien principal `~/.claude`. Garantit que les 3 liens partagés + (`session-env`, `file-history`, `projects`) existent et pointent aux + bons targets sur le compte cible, même si celui-ci vient juste + d'être provisionné. +- `AccountSwitcher.sharedSymlinks` : override test-only (accepte une + liste `[]symlinks.SharedSymlink`). Défaut = `symlinks.RequiredShared`. + Les tests peuvent scoper la réconciliation dans un `t.TempDir()` pour + ne jamais toucher `/home/ubuntu/.claude-*-shared`. +- 2 tests unitaires : + - `TestFlipReconcilesSharedSymlinksOnTargetHome` : target home vide → + les 3 liens sont créés après le flip et pointent aux targets canoniques. + - `TestFlipEnsureSymlinksFailureDoesNotAbortSwap` : lien divergent + planté à la main → `EnsureForAccount` renvoie une erreur, logguée + en WARN, mais le swap complète quand même (best-effort post-flip). + +### Rationale +- Sans cet appel, un compte cible fraîchement provisionné n'aurait + pas encore ses 3 liens ; au premier `claude --resume`, Claude Code + écrirait dans `~/.claude/projects/` (privé) au lieu de + `/home/ubuntu/.claude-projects-shared` → transcripts dupliqués, + undo désynchronisé, resume silencieusement cassé. +- L'ensure est **best-effort** : une erreur est logguée en WARN mais + NE bloque PAS le flip. Si on abortait ici, on laisserait le daemon + dans un état incohérent (symlink déjà flippé mais `SetActiveAccount` + pas appelé). +- L'opérateur voit le WARN dans les logs et peut corriger la + divergence manuellement (ex: lien pointant sur le mauvais target). + +### Tests +- ✅ `go test ./...` : tous les packages PASS (incluant + `internal/switcher` et `internal/symlinks`). +- ✅ `go test -race ./internal/switcher/...` : PASS. +- ✅ `go vet ./...` : clean. + +### Fichiers modifiés +- `internal/switcher/account_switcher.go` +- `internal/switcher/account_switcher_test.go` ## [0.3.6] - 2026-04-16 **Type:** Patch — Phase 1 / Chantier A2 : validation des symlinks au startup diff --git a/internal/switcher/account_switcher.go b/internal/switcher/account_switcher.go index e8de796..9e854b1 100644 --- a/internal/switcher/account_switcher.go +++ b/internal/switcher/account_switcher.go @@ -17,6 +17,7 @@ import ( "forge.secuaas.ovh/olivier/claude-failover/internal/notify" "forge.secuaas.ovh/olivier/claude-failover/internal/quota" "forge.secuaas.ovh/olivier/claude-failover/internal/state" + "forge.secuaas.ovh/olivier/claude-failover/internal/symlinks" "forge.secuaas.ovh/olivier/claude-failover/internal/tmux" ) @@ -52,6 +53,11 @@ type AccountSwitcher struct { // homeDir is the directory containing the .claude symlink. Overridable for tests. // When empty, os.UserHomeDir() is used. homeDir string + // sharedSymlinks is the list of shared-state links reconciled on the + // target account home after every flip. Overridable for tests so the + // suite never touches the operator's real /home/ubuntu/.claude-* + // shared directories. When nil, symlinks.RequiredShared is used. + sharedSymlinks []symlinks.SharedSymlink } // New creates an AccountSwitcher. @@ -110,6 +116,16 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { if err := a.flipSymlink(target.Home); err != nil { a.logger.Printf("[switcher] flipSymlink error: %v", err) } + // Best-effort: make sure the target account home exposes the three + // shared-state symlinks (session-env, file-history, projects). The main + // ~/.claude flip is already done, so an error here must NOT abort the + // swap — we just log it so the operator can investigate. Without this + // call, a fresh target account with no shared links would silently + // start writing into private /projects/session-env/file-history dirs + // and diverge from the primary account's transcripts. + if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil { + a.logger.Printf("[switcher] WARN ensure shared symlinks for %q: %v", target.Home, err) + } a.killAllPoolSessions() a.recreatePoolSessions() a.relaunchDedicatedSessions(target.Home) @@ -225,6 +241,16 @@ func (a *AccountSwitcher) saveAllSessions() { }) } +// requiredShared returns the shared-symlink list used when reconciling the +// target account home after a flip. Tests may set a.sharedSymlinks to a +// tmpdir-scoped list so they never touch /home/ubuntu/.claude-*-shared. +func (a *AccountSwitcher) requiredShared() []symlinks.SharedSymlink { + if a.sharedSymlinks != nil { + return a.sharedSymlinks + } + return symlinks.RequiredShared +} + // resolveHomeDir returns the configured homeDir (test override) or the real // user home. Tests MUST set a.homeDir to a tmpdir to avoid clobbering the // production ~/.claude symlink. diff --git a/internal/switcher/account_switcher_test.go b/internal/switcher/account_switcher_test.go index 8c3b292..d2f327d 100644 --- a/internal/switcher/account_switcher_test.go +++ b/internal/switcher/account_switcher_test.go @@ -1,6 +1,8 @@ package switcher import ( + "os" + "path/filepath" "strings" "testing" "time" @@ -8,8 +10,19 @@ import ( "forge.secuaas.ovh/olivier/claude-failover/internal/config" "forge.secuaas.ovh/olivier/claude-failover/internal/quota" "forge.secuaas.ovh/olivier/claude-failover/internal/state" + "forge.secuaas.ovh/olivier/claude-failover/internal/symlinks" ) +// tmpShared returns a SharedSymlink list whose targets live entirely under +// tmpDir, so switcher tests never touch /home/ubuntu/.claude-*-shared. +func tmpShared(tmpDir string) []symlinks.SharedSymlink { + return []symlinks.SharedSymlink{ + {Target: filepath.Join(tmpDir, "session-env-shared"), Name: "session-env"}, + {Target: filepath.Join(tmpDir, "file-history-shared"), Name: "file-history"}, + {Target: filepath.Join(tmpDir, "projects-shared"), Name: "projects"}, + } +} + // mockTmux for switcher tests. type mockTmux struct { sessions map[string]bool @@ -143,6 +156,9 @@ func TestKillAndRecreatePoolSessions(t *testing.T) { // touches the real ~/.claude (regression: a reboot used to leave Claude // Code unusable because the test had repointed ~/.claude to /tmp/...). a.homeDir = t.TempDir() + // Scope shared-symlink targets to a tmpdir so the post-flip ensure + // pass does not write inside /home/ubuntu/.claude-*-shared. + a.sharedSymlinks = tmpShared(t.TempDir()) a.executeSwitch(quota.SwitchRequest{From: "compte1"}) // Active account must have changed. @@ -186,10 +202,12 @@ func TestDedicatedRelaunchAfterSwap(t *testing.T) { s := state.New("") s.SetActiveAccount("compte1") + home1 := filepath.Join(t.TempDir(), "claude-1-xxxx") + home2 := filepath.Join(t.TempDir(), "claude-2-xxxx") cfg := &config.Config{ Accounts: []config.AccountConfig{ - {Name: "compte1", Home: "/tmp/claude-1-xxxx"}, - {Name: "compte2", Home: "/tmp/claude-2-xxxx"}, + {Name: "compte1", Home: home1}, + {Name: "compte2", Home: home2}, }, Pool: config.PoolConfig{ Dedicated: []config.DedicatedSession{{Name: "dedicated-1", Project: "/tmp"}}, @@ -199,6 +217,7 @@ func TestDedicatedRelaunchAfterSwap(t *testing.T) { a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) a.homeDir = t.TempDir() + a.sharedSymlinks = tmpShared(t.TempDir()) a.executeSwitch(quota.SwitchRequest{From: "compte1"}) // The relaunch must send a resume command on the dedicated session, @@ -213,10 +232,121 @@ func TestDedicatedRelaunchAfterSwap(t *testing.T) { if relaunch == "" { t.Fatalf("expected dedicated-1 relaunch send-keys; got %v", tc.sendKeyCalls) } - if !strings.Contains(relaunch, "CLAUDE_CONFIG_DIR=/tmp/claude-2-xxxx") { + if !strings.Contains(relaunch, "CLAUDE_CONFIG_DIR="+home2) { t.Errorf("relaunch should set CLAUDE_CONFIG_DIR to target home; got %q", relaunch) } if !strings.Contains(relaunch, "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") { t.Errorf("relaunch should include captured UUID; got %q", relaunch) } } + +// TestFlipReconcilesSharedSymlinksOnTargetHome verifies A3: after the main +// ~/.claude flip, the switcher reconciles the three shared-state symlinks +// (session-env / file-history / projects) on the TARGET account home. +// Scenario: the target home has NO links yet — a freshly-provisioned account +// that has never been flipped into. Post-switch, all three links must exist +// inside the target home and point at the canonical shared targets. +func TestFlipReconcilesSharedSymlinksOnTargetHome(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + // Target home starts empty: EnsureForAccount will mkdir + create links. + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + a.homeDir = t.TempDir() + shared := tmpShared(t.TempDir()) + a.sharedSymlinks = shared + + // Pre-assert: no link exists in targetHome. + for _, sl := range shared { + if _, err := os.Lstat(filepath.Join(targetHome, sl.Name)); !os.IsNotExist(err) { + t.Fatalf("pre-condition: %q should not exist yet (err=%v)", sl.Name, err) + } + } + + a.executeSwitch(quota.SwitchRequest{From: "compte1"}) + + // Post-assert: every required link exists and points at the canonical + // target under the tmpdir-scoped shared root. + for _, sl := range shared { + linkPath := filepath.Join(targetHome, sl.Name) + info, err := os.Lstat(linkPath) + if err != nil { + t.Errorf("expected link at %s after flip: %v", linkPath, err) + continue + } + if info.Mode()&os.ModeSymlink == 0 { + t.Errorf("%s exists but is not a symlink", linkPath) + continue + } + got, err := os.Readlink(linkPath) + if err != nil { + t.Errorf("readlink %s: %v", linkPath, err) + continue + } + if got != sl.Target { + t.Errorf("link %s points to %q, want %q", linkPath, got, sl.Target) + } + } +} + +// TestFlipEnsureSymlinksFailureDoesNotAbortSwap verifies A3 best-effort: +// if EnsureForAccount returns an error (here: a divergent pre-existing link +// that the symlinks package refuses to auto-correct), the flip and the swap +// MUST still complete. The shared symlink reconcile is post-flip cleanup, +// not a gate on the failover itself — aborting here would leave the daemon +// in an inconsistent state (symlink flipped but active account not updated). +func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(targetHome, 0700); err != nil { + t.Fatalf("mkdir target home: %v", err) + } + // Plant a divergent link at /session-env. The symlinks + // package refuses to auto-correct this (data-loss safeguard) and will + // return an error — which the switcher must swallow with a WARN log. + bogus := filepath.Join(t.TempDir(), "somewhere-else") + if err := os.MkdirAll(bogus, 0700); err != nil { + t.Fatalf("mkdir bogus: %v", err) + } + if err := os.Symlink(bogus, filepath.Join(targetHome, "session-env")); err != nil { + t.Fatalf("plant divergent link: %v", err) + } + + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + a.homeDir = t.TempDir() + a.sharedSymlinks = tmpShared(t.TempDir()) + + a.executeSwitch(quota.SwitchRequest{From: "compte1"}) + + // The swap must have completed despite the divergent-link error. + if got := s.ActiveAccount(); got != "compte2" { + t.Errorf("swap should complete even when ensure fails; active=%q want compte2", got) + } +} From 20063b1939a5391763c588bdc42e4fdf051aba4c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Apr 2026 19:53:48 +0000 Subject: [PATCH 2/3] fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour) --- VERSION.md | 62 +++++++++- internal/switcher/account_switcher.go | 115 ++++++++++++++++-- internal/switcher/account_switcher_test.go | 133 ++++++++++++++++++--- internal/symlinks/shared_test.go | 70 +++++++++++ 4 files changed, 356 insertions(+), 24 deletions(-) diff --git a/VERSION.md b/VERSION.md index e9acf7d..9fa2f32 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,64 @@ -# Version actuelle : 0.3.7 +# Version actuelle : 0.3.8 + +## [0.3.8] - 2026-04-16 +**Type:** Patch — Bug #1 (A3 flip+ensure inconsistency) + Bug #10 (requiredShared contract test) + +### Corrigé — Bug #1 (CRITIQUE) +- `AccountSwitcher.executeSwitch` ne continue plus silencieusement quand + `symlinks.EnsureForAccount` échoue après le flip : il **roll-back** le lien + `~/.claude` vers le home du compte précédent et **n'appelle pas** + `SetActiveAccount`. Évite l'état incohérent où le daemon déclare le compte + cible actif alors que ses shared symlinks sont divergents → transcripts + dupliqués silencieusement, resume cassé. +- Si le rollback réussit : swap annulé, état filesystem = état pré-swap, + erreur explicite retournée par `executeSwitchE`. +- Si ensure ET rollback échouent : `partialSwap` atomique sticky set, + `ErrPartialSwap` retourné, tout futur swap est refusé tant que le + daemon n'est pas redémarré par l'opérateur. +- Nouvelle méthode publique `AccountSwitcher.IsPartialSwap() bool` pour + que health-checks et watchdog exposent l'état dégradé. + +### Ajouté — Tests Bug #1 +- `TestFlipEnsureFailureTriggersRollback` : plant un lien divergent sur + le home cible → ensure échoue → rollback réussit → `ActiveAccount` reste + compte1 → `~/.claude` pointe sur previousHome → `IsPartialSwap` = false. +- `TestFlipEnsureAndRollbackFailure` : force les deux flips à échouer + (homeDir = fichier régulier) → `ErrPartialSwap` retourné, flag sticky + set, swap suivant refusé. + +### Ajouté — Bug #10 +- `TestRequiredSharedIsCoherent` (`internal/symlinks/shared_test.go`) : + valide le contrat de la constante package-level `RequiredShared` jamais + exercée auparavant (tous les autres tests utilisent un override scoped + en `t.TempDir()`). Vérifie sans toucher au filesystem : + - exactement 3 entrées (`session-env`, `file-history`, `projects`) + - targets absolus + - `filepath.Dir(target)` identique pour les 3 entrées (invariant + "3 liens sous un même shared root" sur lequel repose `EnsureForAccount`). + +### Rationale +- Continuer après un ensure échoué revient à valider que le compte cible + est "sain" alors que les shared symlinks sont absents ou divergents. + Conséquence en prod : premier `claude --resume` écrit dans + `~/.claude/projects/` (privé) → transcripts dupliqués, undo + désynchronisé, failover complètement cassé sans log d'alerte. +- Le rollback garantit qu'un compte cible mal configuré ne peut PAS + dégrader le state du daemon : on retourne à l'état pré-swap et on + signale l'erreur à l'appelant. +- `ErrPartialSwap` + `IsPartialSwap()` documente un état où l'intervention + humaine est obligatoire — préférable à un retry automatique qui + empirerait la divergence. + +### Tests +- ✅ `go test ./...` : tous les packages PASS +- ✅ `go test -race ./...` : PASS, aucun data race +- ✅ `go vet ./...` : clean +- ✅ `go build ./...` : clean + +### Fichiers modifiés +- `internal/switcher/account_switcher.go` (+rollback + IsPartialSwap + ErrPartialSwap) +- `internal/switcher/account_switcher_test.go` (2 nouveaux tests, 1 test obsolète remplacé) +- `internal/symlinks/shared_test.go` (+TestRequiredSharedIsCoherent) ## [0.3.7] - 2026-04-16 **Type:** Patch — Phase 1 / Chantier A3 : wire EnsureForAccount post-flip diff --git a/internal/switcher/account_switcher.go b/internal/switcher/account_switcher.go index 9e854b1..3778403 100644 --- a/internal/switcher/account_switcher.go +++ b/internal/switcher/account_switcher.go @@ -4,6 +4,7 @@ package switcher import ( "context" + "errors" "fmt" "log" "os" @@ -11,6 +12,7 @@ import ( "regexp" "strconv" "strings" + "sync/atomic" "time" "forge.secuaas.ovh/olivier/claude-failover/internal/config" @@ -21,6 +23,16 @@ import ( "forge.secuaas.ovh/olivier/claude-failover/internal/tmux" ) +// ErrPartialSwap is returned (and wrapped) when the switcher flipped +// ~/.claude to the target home, EnsureForAccount failed on the target, +// and the rollback flip back to the previous home ALSO failed. The daemon +// is in a documented degraded state: the active-account setter was NOT +// called, but the filesystem symlink may point at an account whose shared +// state is inconsistent. Operator intervention is required. Callers can +// interrogate AccountSwitcher.IsPartialSwap() to expose the flag to +// health-checks / watchdogs. +var ErrPartialSwap = errors.New("switcher: partial swap — flip succeeded but ensure + rollback both failed") + // SwitchState represents the current phase of a failover operation. type SwitchState string @@ -58,6 +70,14 @@ type AccountSwitcher struct { // suite never touches the operator's real /home/ubuntu/.claude-* // shared directories. When nil, symlinks.RequiredShared is used. sharedSymlinks []symlinks.SharedSymlink + // partialSwap is set to 1 when a flip+ensure+rollback sequence left the + // daemon in an inconsistent state (symlink possibly flipped, but active + // account NOT updated, and rollback flip ALSO failed). Health-checks / + // watchdogs read this flag via IsPartialSwap(). It is sticky: once set, + // it stays set until the operator restarts the daemon after fixing the + // filesystem state. We use atomic access so watchdog goroutines can read + // it without blocking the switcher. + partialSwap atomic.Bool } // New creates an AccountSwitcher. @@ -94,8 +114,30 @@ func (a *AccountSwitcher) Run(ctx context.Context) { // executeSwitch performs the full failover sequence. func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { + if err := a.executeSwitchE(req); err != nil { + // executeSwitchE already logs the detail; we swallow the error here + // because the public Run loop has no return channel. The partialSwap + // flag (if set) remains visible via IsPartialSwap(). + a.logger.Printf("[switcher] SWAP aborted: %v", err) + } +} + +// executeSwitchE runs the swap and returns an error describing any abort or +// partial-swap condition. Split out from executeSwitch so tests can assert +// on the error value without routing through a channel. +func (a *AccountSwitcher) executeSwitchE(req quota.SwitchRequest) error { a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime) + // Refuse to proceed if a previous swap left the daemon in an + // inconsistent state. The operator must intervene (fix the filesystem, + // restart the daemon) before any further failover can be attempted — + // otherwise we'd stack symlink flips on top of a broken state. + if a.partialSwap.Load() { + err := fmt.Errorf("refusing swap: daemon is in partial-swap degraded state (operator intervention required)") + a.logger.Printf("[switcher] %v", err) + return err + } + // 1. SAVING — capture resume UUIDs from all working sessions plus // every dedicated session unconditionally (dedicated sessions are // user-driven and may not be tracked as "working" in state, but their @@ -110,21 +152,51 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { if target == nil { a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From) a.currentState = StateNormal - return + return nil } + previous := a.findAccountByName(req.From) if err := a.flipSymlink(target.Home); err != nil { a.logger.Printf("[switcher] flipSymlink error: %v", err) } - // Best-effort: make sure the target account home exposes the three - // shared-state symlinks (session-env, file-history, projects). The main - // ~/.claude flip is already done, so an error here must NOT abort the - // swap — we just log it so the operator can investigate. Without this - // call, a fresh target account with no shared links would silently - // start writing into private /projects/session-env/file-history dirs - // and diverge from the primary account's transcripts. + // Ensure the target account home exposes the three shared-state + // symlinks (session-env, file-history, projects). If this fails we + // MUST NOT proceed with SetActiveAccount — the daemon would otherwise + // declare the target "active" while its shared state is divergent, + // silently writing transcripts into private /projects directories and + // breaking `claude --resume` across sessions. Instead we attempt to + // roll back the ~/.claude flip to the previous account. If the + // rollback also fails, the daemon is in a documented degraded state + // (ErrPartialSwap) and the operator must intervene. if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil { - a.logger.Printf("[switcher] WARN ensure shared symlinks for %q: %v", target.Home, err) + a.logger.Printf("[switcher] ensure shared symlinks for %q failed: %v — attempting rollback", target.Home, err) + if previous == nil || previous.Home == "" { + // No known previous home to roll back to — set the degraded + // flag and bail out. This is equivalent to a rollback failure + // because the filesystem is pointed at a broken target. + a.partialSwap.Store(true) + a.currentState = StateNormal + return fmt.Errorf("%w: ensure failed (%v) and no previous account home is known for rollback", ErrPartialSwap, err) + } + if rbErr := a.flipSymlink(previous.Home); rbErr != nil { + // Both the ensure AND the rollback failed. The daemon is now + // in a documented inconsistent state: ~/.claude may point at + // target whose shared-state is divergent, but SetActiveAccount + // has NOT been called so state.ActiveAccount is still the + // previous account. No further failover can be attempted + // until the operator intervenes. + a.partialSwap.Store(true) + a.logger.Printf("[switcher] CRITICAL partial swap: ensure=%v rollback=%v — daemon in degraded state, operator intervention required", err, rbErr) + a.currentState = StateNormal + return fmt.Errorf("%w: ensure=%v rollback=%v", ErrPartialSwap, err, rbErr) + } + // Rollback succeeded — symlink is back on the previous account, + // SetActiveAccount was NEVER called, state is consistent with + // "no swap happened". Return an explicit error so the caller + // knows the swap was cancelled. + a.logger.Printf("[switcher] rollback successful: ~/.claude → %s (swap cancelled)", previous.Home) + a.currentState = StateNormal + return fmt.Errorf("swap cancelled: ensure shared symlinks failed on target %q: %w", target.Home, err) } a.killAllPoolSessions() a.recreatePoolSessions() @@ -151,6 +223,31 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { } a.currentState = StateNormal + return nil +} + +// IsPartialSwap reports whether the switcher is in a degraded state after a +// flip+ensure+rollback sequence all failed. Health-checks and watchdogs use +// this signal to surface an operator-actionable alert. The flag is sticky +// for the lifetime of the process: once set, it remains set until the daemon +// is restarted (after the operator has fixed the filesystem). +func (a *AccountSwitcher) IsPartialSwap() bool { + return a.partialSwap.Load() +} + +// findAccountByName returns the account config entry matching name, or nil. +// Unlike findTargetAccount (which returns the first NON-matching account), +// this is used by the rollback path to recover the previous home. +func (a *AccountSwitcher) findAccountByName(name string) *config.AccountConfig { + if name == "" { + return nil + } + for i := range a.config.Accounts { + if a.config.Accounts[i].Name == name { + return &a.config.Accounts[i] + } + } + return nil } // saveDedicatedUUIDs captures the resume UUID for every configured dedicated diff --git a/internal/switcher/account_switcher_test.go b/internal/switcher/account_switcher_test.go index d2f327d..14df33e 100644 --- a/internal/switcher/account_switcher_test.go +++ b/internal/switcher/account_switcher_test.go @@ -1,6 +1,7 @@ package switcher import ( + "errors" "os" "path/filepath" "strings" @@ -302,25 +303,34 @@ func TestFlipReconcilesSharedSymlinksOnTargetHome(t *testing.T) { } } -// TestFlipEnsureSymlinksFailureDoesNotAbortSwap verifies A3 best-effort: -// if EnsureForAccount returns an error (here: a divergent pre-existing link -// that the symlinks package refuses to auto-correct), the flip and the swap -// MUST still complete. The shared symlink reconcile is post-flip cleanup, -// not a gate on the failover itself — aborting here would leave the daemon -// in an inconsistent state (symlink flipped but active account not updated). -func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { +// TestFlipEnsureFailureTriggersRollback verifies the fix for the A3 bug +// (flip+ensure inconsistency): if EnsureForAccount fails on the target home +// after the ~/.claude flip, the switcher MUST NOT mark the target account +// active. It must instead roll back the ~/.claude symlink to the previous +// account's home, leaving the daemon in the pre-swap state so subsequent +// session work keeps writing to the known-good shared state. +// +// Old (buggy) behaviour: ensure error was WARN-only, SetActiveAccount still +// happened, dedicated sessions were relaunched against a target whose +// /projects, /session-env, /file-history were missing or divergent → +// transcripts duplicated silently, resume broke, undo history diverged. +func TestFlipEnsureFailureTriggersRollback(t *testing.T) { tc := newMockTmux() s := state.New("") s.SetActiveAccount("compte1") + previousHome := filepath.Join(t.TempDir(), "claude-compte1") targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(previousHome, 0700); err != nil { + t.Fatalf("mkdir previous home: %v", err) + } if err := os.MkdirAll(targetHome, 0700); err != nil { t.Fatalf("mkdir target home: %v", err) } // Plant a divergent link at /session-env. The symlinks // package refuses to auto-correct this (data-loss safeguard) and will - // return an error — which the switcher must swallow with a WARN log. + // return an error, which must now trigger a rollback. bogus := filepath.Join(t.TempDir(), "somewhere-else") if err := os.MkdirAll(bogus, 0700); err != nil { t.Fatalf("mkdir bogus: %v", err) @@ -331,7 +341,7 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { cfg := &config.Config{ Accounts: []config.AccountConfig{ - {Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")}, + {Name: "compte1", Home: previousHome}, {Name: "compte2", Home: targetHome}, }, Pool: config.PoolConfig{ @@ -340,13 +350,108 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { } a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) - a.homeDir = t.TempDir() + homeDir := t.TempDir() + a.homeDir = homeDir a.sharedSymlinks = tmpShared(t.TempDir()) - a.executeSwitch(quota.SwitchRequest{From: "compte1"}) + err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}) + if err == nil { + t.Fatalf("executeSwitchE: expected cancellation error, got nil") + } + // The public symmetric swap-cancelled error must mention ensure and + // wrap the underlying symlinks package message. ErrPartialSwap must + // NOT be set (rollback succeeded → recoverable condition). + if errors.Is(err, ErrPartialSwap) { + t.Errorf("did not expect ErrPartialSwap; rollback succeeded; got %v", err) + } + if a.IsPartialSwap() { + t.Errorf("IsPartialSwap should be false when rollback succeeds") + } - // The swap must have completed despite the divergent-link error. - if got := s.ActiveAccount(); got != "compte2" { - t.Errorf("swap should complete even when ensure fails; active=%q want compte2", got) + // Active account must remain the previous one — SetActiveAccount must + // NOT have been called. + if got := s.ActiveAccount(); got != "compte1" { + t.Errorf("active account should stay compte1 after rollback; got %q", got) + } + + // ~/.claude must now point at the previous home (rollback target). + link, rlErr := os.Readlink(filepath.Join(homeDir, ".claude")) + if rlErr != nil { + t.Fatalf("readlink ~/.claude: %v", rlErr) + } + if link != previousHome { + t.Errorf("~/.claude should point at previous home %q after rollback; got %q", previousHome, link) + } +} + +// TestFlipEnsureAndRollbackFailure verifies that when BOTH EnsureForAccount +// AND the rollback flip fail, the switcher sets the sticky partial-swap +// flag and returns ErrPartialSwap. The daemon is then in a documented +// degraded state where any further swap is refused until the operator +// restarts it. +func TestFlipEnsureAndRollbackFailure(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + previousHome := filepath.Join(t.TempDir(), "claude-compte1") + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(previousHome, 0700); err != nil { + t.Fatalf("mkdir previous home: %v", err) + } + if err := os.MkdirAll(targetHome, 0700); err != nil { + t.Fatalf("mkdir target home: %v", err) + } + // Plant the divergent link that will cause EnsureForAccount to fail. + bogus := filepath.Join(t.TempDir(), "somewhere-else") + if err := os.MkdirAll(bogus, 0700); err != nil { + t.Fatalf("mkdir bogus: %v", err) + } + if err := os.Symlink(bogus, filepath.Join(targetHome, "session-env")); err != nil { + t.Fatalf("plant divergent link: %v", err) + } + + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: previousHome}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + + // Force the rollback flip to fail: point homeDir at a file that cannot + // host a .claude symlink. We use a regular file; the flipSymlink + // implementation does os.Remove() then os.Symlink() under homeDir, + // which fails when homeDir is itself a file (ENOTDIR). + badHomeFile := filepath.Join(t.TempDir(), "not-a-dir") + if err := os.WriteFile(badHomeFile, []byte("block"), 0600); err != nil { + t.Fatalf("write bad home: %v", err) + } + a.homeDir = badHomeFile + a.sharedSymlinks = tmpShared(t.TempDir()) + + err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}) + if err == nil { + t.Fatalf("expected ErrPartialSwap, got nil") + } + if !errors.Is(err, ErrPartialSwap) { + t.Errorf("expected ErrPartialSwap, got %v", err) + } + if !a.IsPartialSwap() { + t.Errorf("IsPartialSwap should be true when both ensure AND rollback fail") + } + // SetActiveAccount must still not have been called. + if got := s.ActiveAccount(); got != "compte1" { + t.Errorf("active account must stay compte1 in partial-swap; got %q", got) + } + + // A subsequent swap attempt must be refused while the flag is set. + if err2 := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}); err2 == nil { + t.Errorf("expected subsequent swap to be refused in degraded state") } } diff --git a/internal/symlinks/shared_test.go b/internal/symlinks/shared_test.go index 64625d4..e472678 100644 --- a/internal/symlinks/shared_test.go +++ b/internal/symlinks/shared_test.go @@ -204,3 +204,73 @@ func TestRequiredShared_defaultsAreReasonable(t *testing.T) { } } } + +// TestRequiredSharedIsCoherent validates the contract of the package-level +// RequiredShared constant that the switcher and lifecycle manager consume +// in production. The rest of the suite exercises EnsureForAccount / +// ValidateAll with tmpdir-scoped override lists, so the actual prod +// constant (pointing under /home/ubuntu/.claude-*-shared) is never touched +// by those tests — a regression that shrinks or renames RequiredShared +// would pass every other test but silently break failover on real VMs +// (missing a link → writes to private state → transcripts duplicated). +// +// The test is filesystem-free: it only asserts the shape of the constant. +// +// 1. Exactly three entries, one per Name required by the A-failover design. +// 2. Every Target is absolute. +// 3. All three Targets share the same parent directory — there is no +// mode of operation where one shared dir lives elsewhere than the +// others. `filepath.Dir(target)` must be identical across entries. +// +// This encodes the "3 links under one shared root" invariant that +// EnsureForAccount relies on. Any future change to RequiredShared that +// breaks this invariant should force the author to update the switcher +// contract explicitly. +func TestRequiredSharedIsCoherent(t *testing.T) { + expectedNames := map[string]bool{ + "session-env": false, + "file-history": false, + "projects": false, + } + if len(RequiredShared) != len(expectedNames) { + t.Fatalf("RequiredShared must contain exactly %d entries (session-env, file-history, projects); got %d: %+v", + len(expectedNames), len(RequiredShared), RequiredShared) + } + + var sharedParent string + for i, sl := range RequiredShared { + if _, ok := expectedNames[sl.Name]; !ok { + t.Errorf("RequiredShared[%d]: unexpected Name %q (allowed: session-env / file-history / projects)", i, sl.Name) + continue + } + if expectedNames[sl.Name] { + t.Errorf("RequiredShared[%d]: duplicate Name %q", i, sl.Name) + } + expectedNames[sl.Name] = true + + if sl.Target == "" { + t.Errorf("RequiredShared[%d] (%q): empty Target", i, sl.Name) + continue + } + if !filepath.IsAbs(sl.Target) { + t.Errorf("RequiredShared[%d] (%q): Target %q must be absolute", i, sl.Name, sl.Target) + continue + } + + parent := filepath.Dir(sl.Target) + if i == 0 { + sharedParent = parent + continue + } + if parent != sharedParent { + t.Errorf("RequiredShared[%d] (%q): parent dir %q diverges from %q — all shared targets must live under the same root", + i, sl.Name, parent, sharedParent) + } + } + + for name, seen := range expectedNames { + if !seen { + t.Errorf("RequiredShared is missing the required %q entry", name) + } + } +} From 336f1f27bbbfb65537962fa9aa598df157c146df Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Apr 2026 21:00:16 +0000 Subject: [PATCH 3/3] =?UTF-8?q?chore(deps):=20go=20mod=20tidy=20=E2=80=94?= =?UTF-8?q?=20promote=20fsnotify=20to=20direct?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No functional change. Groups yaml.v3 and fsnotify as direct deps, isolates golang.org/x/sys as indirect. Co-Authored-By: Claude Opus 4.7 (1M context) --- VERSION.md | 13 ++++++++++++- go.mod | 8 ++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/VERSION.md b/VERSION.md index 9fa2f32..09e8e6a 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,15 @@ -# Version actuelle : 0.3.8 +# Version actuelle : 0.3.9 + +## [0.3.9] - 2026-04-16 +**Type:** Patch — `go mod tidy` (fsnotify direct dep cleanup) + +### Modifié +- `go.mod` : `fsnotify` promu en dépendance directe, `yaml.v3` regroupé, `golang.org/x/sys` isolé en indirect. Aucun changement fonctionnel. + +### Tests effectués +- ✅ Build OK (aucune modification de code) + +--- ## [0.3.8] - 2026-04-16 **Type:** Patch — Bug #1 (A3 flip+ensure inconsistency) + Bug #10 (requiredShared contract test) diff --git a/go.mod b/go.mod index e30b815..45a82d6 100644 --- a/go.mod +++ b/go.mod @@ -2,9 +2,9 @@ module forge.secuaas.ovh/olivier/claude-failover go 1.22 -require gopkg.in/yaml.v3 v3.0.1 - require ( - github.com/fsnotify/fsnotify v1.9.0 // indirect - golang.org/x/sys v0.13.0 // indirect + github.com/fsnotify/fsnotify v1.9.0 + gopkg.in/yaml.v3 v3.0.1 ) + +require golang.org/x/sys v0.13.0 // indirect