From 91091d7abf773eafbe611c231ffae46bf41df68f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Apr 2026 18:55:32 +0000 Subject: [PATCH 1/4] feat(symlinks): add shared-state symlink manager (A1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds internal/symlinks package that encodes in code the convention previously maintained by hand on the VM: every Claude account home must expose `session-env`, `file-history` and `projects` as symlinks to a single shared target, so account failover does not create divergent state (duplicate JSONL transcripts, broken undo history). - EnsureForAccount(home, required) creates missing links and target directories, refuses to auto-correct a divergent link (risks data loss), and errors when a regular file sits where the link belongs. - ValidateAll(homes, required) aggregates errors across both accounts so the operator sees every problem at once rather than fixing one per restart cycle. - RequiredShared exposes the production defaults so lifecycle and switcher (A2/A3) can depend on it directly. 9/9 unit tests green. Part of Phase 1 Chantier A — Failover robuste. --- VERSION.md | 28 ++++- WORK_IN_PROGRESS.md | 19 ++- internal/symlinks/shared.go | 165 +++++++++++++++++++++++++ internal/symlinks/shared_test.go | 206 +++++++++++++++++++++++++++++++ 4 files changed, 414 insertions(+), 4 deletions(-) create mode 100644 internal/symlinks/shared.go create mode 100644 internal/symlinks/shared_test.go diff --git a/VERSION.md b/VERSION.md index fe1d813..c699e3c 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,30 @@ -# Version actuelle : 0.3.4 +# Version actuelle : 0.3.5 + +## [0.3.5] - 2026-04-16 +**Type:** Patch — Phase 1 / Chantier A1 : package `internal/symlinks` + +### Ajouté +- `internal/symlinks/shared.go` : `EnsureForAccount` + `ValidateAll` qui + encodent en code la convention des 3 symlinks partagés par compte + (`session-env`, `file-history`, `projects`). Jusqu'à aujourd'hui ces + liens étaient maintenus à la main et leur absence silencieuse cassait + le failover (JSONL dupliqués, undo désynchronisé). +- Tests unitaires couvrant : création missing, idempotence, divergence + (refus d'auto-correction pour éviter la perte de données), fichier + régulier à la place du lien, home vide, agrégation d'erreurs multi-comptes. + +### Rationale +- Un déploiement sur une nouvelle VM ne peut plus omettre les liens. +- Divergent link → erreur explicite, jamais de correction silencieuse. +- Préparation des tâches A2 (ValidateAll au startup) et A3 (EnsureForAccount + post-flipSymlink dans le switcher). + +### Tests +- ✅ `go test ./internal/symlinks/...` : 9/9 PASS + +### Fichiers ajoutés +- `internal/symlinks/shared.go` +- `internal/symlinks/shared_test.go` ## [0.3.4] - 2026-04-16 **Type:** Patch — Dispatcher ne route JAMAIS vers les sessions dédiées diff --git a/WORK_IN_PROGRESS.md b/WORK_IN_PROGRESS.md index 049b34a..7538918 100644 --- a/WORK_IN_PROGRESS.md +++ b/WORK_IN_PROGRESS.md @@ -1,13 +1,26 @@ # Travaux en Cours - claude-failover ## Dernière mise à jour -2026-04-15 19:30:00 +2026-04-16 19:00:00 ## Version Actuelle -0.3.0 +0.3.5 (en cours de progression vers 0.4.0) ## Demande Actuelle -Aucune — v0.2.3 shippée, service stable. +**Phase 1 / Chantier A — Failover robuste** (spec dans `ccl-platform/phases/phase1/A-failover.md`). +Rendre le failover compte1 ↔ compte2 déterministe en intégrant dans le code les fixes manuels +(symlinks partagés), en ajoutant un registre UUID fiable, et en durcissant tmux send-keys. + +Branche : `feat/phase1-A-failover-robust`. + +## Sous-tâches Chantier A +- [x] A1 — `internal/symlinks/shared.go` (+ tests) — v0.3.5 +- [ ] A2 — `lifecycle/manager.go` : `ValidateAll` au startup +- [ ] A3 — `switcher/account_switcher.go` : `EnsureForAccount` post-flip +- [ ] A4 — `internal/registry/uuid_registry.go` (+ tests) +- [ ] A5 — `internal/tmux/send.go` avec retry exponentiel (+ tests) +- [ ] A6 — Capture UUID 200 → 500 lignes +- [ ] A7 — `scripts/test-failover.sh` dans ccl-platform + scripts associés ## Étapes Complétées - [x] v0.2.1 — Cooldown post-swap + log forensique (trigger_session, pattern, snippet) diff --git a/internal/symlinks/shared.go b/internal/symlinks/shared.go new file mode 100644 index 0000000..7687fb0 --- /dev/null +++ b/internal/symlinks/shared.go @@ -0,0 +1,165 @@ +// Package symlinks manages the shared-state symlinks that every Claude +// account home must expose, so that account failover does not create state +// divergence (duplicated JSONL transcripts, broken undo history, drifted +// session env). +// +// Rationale +// +// Claude Code stores three directories whose content MUST be identical +// across the two configured accounts for failover to be a no-op: +// +// - projects/ — session JSONL transcripts (used by `claude --resume`) +// - session-env/ — per-session environment and working-dir metadata +// - file-history/ — undo/redo history persistence +// +// If account A writes under `~/.claude-compte1/projects/...` while account +// B later runs under `~/.claude-compte2/projects/...`, resume fails +// silently with "session not found" and the operator loses every in-flight +// conversation. +// +// Historically we fixed this by creating symlinks manually on the +// operator's VM. Any fresh deployment that forgets those links silently +// reintroduces the bug. This package encodes the convention in code: +// EnsureForAccount creates missing links, ValidateAll fails fast at +// startup when an account home is misconfigured. +package symlinks + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" +) + +// DefaultSharedRoot is the directory under which the three shared targets +// live. All SharedSymlink.Target values default to a subdirectory of this +// root so tests can override the root without rewriting the shared list. +const DefaultSharedRoot = "/home/ubuntu" + +// SharedSymlink describes one required link inside a Claude account home. +// +// Target is the absolute path on disk that holds the real shared +// directory. Name is the basename of the link that must exist inside each +// account home (e.g. `session-env`, `file-history`, `projects`). +type SharedSymlink struct { + Target string + Name string +} + +// RequiredShared lists the three symlinks every Claude account home must +// expose. The list is package-level so integration tests can read it, but +// callers SHOULD prefer the EnsureForAccount / ValidateAll entry points +// that accept an override list for isolation. +var RequiredShared = []SharedSymlink{ + {Target: "/home/ubuntu/.claude-session-env-shared", Name: "session-env"}, + {Target: "/home/ubuntu/.claude-file-history-shared", Name: "file-history"}, + {Target: "/home/ubuntu/.claude-projects-shared", Name: "projects"}, +} + +// EnsureForAccount verifies (and creates if missing) every required shared +// symlink for a single account home. Behaviour: +// +// - If accountHome does not exist, it is created (mode 0700). +// - If Target (the shared destination) does not exist, it is created +// (mode 0700). Both accounts pointing at a non-existent target would +// produce two separate state trees on first write. +// - If the link is absent, it is created. +// - If the link is present and points at Target, nothing happens. +// - If the link is present but points elsewhere, an error is returned. +// We REFUSE to auto-correct a divergent link because fixing it blindly +// could delete user data: the "wrong" target may contain the only copy +// of the session transcripts. +// - If a regular file or directory exists where the link should be, +// an error is returned for the same reason. +func EnsureForAccount(accountHome string, required []SharedSymlink) error { + if accountHome == "" { + return errors.New("symlinks: accountHome is empty") + } + + if err := os.MkdirAll(accountHome, 0700); err != nil { + return fmt.Errorf("symlinks: create account home %q: %w", accountHome, err) + } + + for _, sl := range required { + if err := ensureTarget(sl.Target); err != nil { + return err + } + if err := ensureLink(accountHome, sl); err != nil { + return err + } + } + return nil +} + +// ValidateAll runs EnsureForAccount on every account home. It aggregates +// all errors and returns a single error with every failure inlined, so the +// operator sees the full picture at startup rather than fixing one link, +// restarting, hitting the next one, repeat. +func ValidateAll(accountHomes []string, required []SharedSymlink) error { + if len(accountHomes) == 0 { + return errors.New("symlinks: no account homes provided") + } + var errs []string + for _, home := range accountHomes { + if err := EnsureForAccount(home, required); err != nil { + errs = append(errs, err.Error()) + } + } + if len(errs) > 0 { + return fmt.Errorf("symlinks: validation failed for %d account home(s): %s", + len(errs), strings.Join(errs, "; ")) + } + return nil +} + +// ensureTarget creates Target as an empty directory when absent. +// An existing file (non-directory, non-symlink) at Target is an operator +// error we cannot resolve automatically. +func ensureTarget(target string) error { + info, err := os.Stat(target) + if err != nil { + if !os.IsNotExist(err) { + return fmt.Errorf("symlinks: stat shared target %q: %w", target, err) + } + if mkErr := os.MkdirAll(target, 0700); mkErr != nil { + return fmt.Errorf("symlinks: create shared target %q: %w", target, mkErr) + } + return nil + } + if !info.IsDir() { + return fmt.Errorf("symlinks: shared target %q is not a directory", target) + } + return nil +} + +// ensureLink reconciles one link entry inside accountHome. +func ensureLink(accountHome string, sl SharedSymlink) error { + linkPath := filepath.Join(accountHome, sl.Name) + + info, err := os.Lstat(linkPath) + if err != nil { + if os.IsNotExist(err) { + if linkErr := os.Symlink(sl.Target, linkPath); linkErr != nil { + return fmt.Errorf("symlinks: create %q → %q: %w", linkPath, sl.Target, linkErr) + } + return nil + } + return fmt.Errorf("symlinks: lstat %q: %w", linkPath, err) + } + + // Path exists — must be a symlink pointing at Target. + if info.Mode()&os.ModeSymlink == 0 { + return fmt.Errorf("symlinks: %q exists but is not a symlink (expected → %q)", + linkPath, sl.Target) + } + currentTarget, err := os.Readlink(linkPath) + if err != nil { + return fmt.Errorf("symlinks: readlink %q: %w", linkPath, err) + } + if currentTarget != sl.Target { + return fmt.Errorf("symlinks: divergent link at %q: points to %q, expected %q (refusing to auto-correct to avoid data loss)", + linkPath, currentTarget, sl.Target) + } + return nil +} diff --git a/internal/symlinks/shared_test.go b/internal/symlinks/shared_test.go new file mode 100644 index 0000000..64625d4 --- /dev/null +++ b/internal/symlinks/shared_test.go @@ -0,0 +1,206 @@ +package symlinks + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +// testRequired returns a SharedSymlink list whose Targets live entirely +// under tmpDir, so the tests never touch the operator's real home. +func testRequired(tmpDir string) []SharedSymlink { + return []SharedSymlink{ + {Target: filepath.Join(tmpDir, "session-env-shared"), Name: "session-env"}, + {Target: filepath.Join(tmpDir, "file-history-shared"), Name: "file-history"}, + {Target: filepath.Join(tmpDir, "projects-shared"), Name: "projects"}, + } +} + +func TestEnsureForAccount_missingCreatesLinksAndTargets(t *testing.T) { + tmp := t.TempDir() + home := filepath.Join(tmp, "account1") + req := testRequired(tmp) + + if err := EnsureForAccount(home, req); err != nil { + t.Fatalf("EnsureForAccount: %v", err) + } + + for _, sl := range req { + linkPath := filepath.Join(home, sl.Name) + info, err := os.Lstat(linkPath) + if err != nil { + t.Errorf("expected link at %s: %v", linkPath, err) + continue + } + if info.Mode()&os.ModeSymlink == 0 { + t.Errorf("%s exists but is not a symlink", linkPath) + } + got, err := os.Readlink(linkPath) + if err != nil { + t.Errorf("readlink %s: %v", linkPath, err) + continue + } + if got != sl.Target { + t.Errorf("link %s points to %q, want %q", linkPath, got, sl.Target) + } + // Target directory must exist too. + if st, err := os.Stat(sl.Target); err != nil || !st.IsDir() { + t.Errorf("target %s should be a directory, err=%v", sl.Target, err) + } + } +} + +func TestEnsureForAccount_idempotent(t *testing.T) { + tmp := t.TempDir() + home := filepath.Join(tmp, "account1") + req := testRequired(tmp) + + if err := EnsureForAccount(home, req); err != nil { + t.Fatalf("first pass: %v", err) + } + if err := EnsureForAccount(home, req); err != nil { + t.Fatalf("second pass should be a no-op, got: %v", err) + } +} + +func TestEnsureForAccount_divergentLinkReturnsError(t *testing.T) { + tmp := t.TempDir() + home := filepath.Join(tmp, "account1") + req := testRequired(tmp) + + // Pre-create a wrong symlink for "projects". + if err := os.MkdirAll(home, 0700); err != nil { + t.Fatalf("mkdir home: %v", err) + } + wrongTarget := filepath.Join(tmp, "someone-elses-dir") + if err := os.MkdirAll(wrongTarget, 0700); err != nil { + t.Fatalf("mkdir wrong target: %v", err) + } + linkPath := filepath.Join(home, "projects") + if err := os.Symlink(wrongTarget, linkPath); err != nil { + t.Fatalf("seed wrong symlink: %v", err) + } + + err := EnsureForAccount(home, req) + if err == nil { + t.Fatal("expected error for divergent link, got nil") + } + if !strings.Contains(err.Error(), "divergent") { + t.Errorf("error should mention 'divergent': %v", err) + } + + // The wrong symlink MUST be preserved (no auto-correction). + got, err := os.Readlink(linkPath) + if err != nil { + t.Fatalf("readlink after error: %v", err) + } + if got != wrongTarget { + t.Errorf("divergent link was mutated: now %q, want preserved %q", got, wrongTarget) + } +} + +func TestEnsureForAccount_regularFileInsteadOfLinkFails(t *testing.T) { + tmp := t.TempDir() + home := filepath.Join(tmp, "account1") + req := testRequired(tmp) + + if err := os.MkdirAll(home, 0700); err != nil { + t.Fatalf("mkdir home: %v", err) + } + // Create a regular file at the session-env path. + bogus := filepath.Join(home, "session-env") + if err := os.WriteFile(bogus, []byte("oops"), 0600); err != nil { + t.Fatalf("seed regular file: %v", err) + } + + err := EnsureForAccount(home, req) + if err == nil { + t.Fatal("expected error for regular-file-at-link-path, got nil") + } + if !strings.Contains(err.Error(), "not a symlink") { + t.Errorf("error should mention 'not a symlink': %v", err) + } +} + +func TestEnsureForAccount_emptyHomeReturnsError(t *testing.T) { + if err := EnsureForAccount("", nil); err == nil { + t.Fatal("expected error for empty home, got nil") + } +} + +func TestValidateAll_multipleAccountsAllOK(t *testing.T) { + tmp := t.TempDir() + req := testRequired(tmp) + homes := []string{ + filepath.Join(tmp, "a"), + filepath.Join(tmp, "b"), + } + if err := ValidateAll(homes, req); err != nil { + t.Fatalf("ValidateAll: %v", err) + } +} + +func TestValidateAll_aggregatesErrors(t *testing.T) { + tmp := t.TempDir() + req := testRequired(tmp) + homes := []string{ + filepath.Join(tmp, "a"), + filepath.Join(tmp, "b"), + } + + // Pre-seed account `a` with a divergent link so ValidateAll must + // surface that error while still processing account `b`. + if err := os.MkdirAll(homes[0], 0700); err != nil { + t.Fatalf("mkdir a: %v", err) + } + wrongTarget := filepath.Join(tmp, "bad") + if err := os.MkdirAll(wrongTarget, 0700); err != nil { + t.Fatalf("mkdir bad: %v", err) + } + if err := os.Symlink(wrongTarget, filepath.Join(homes[0], "projects")); err != nil { + t.Fatalf("seed wrong link: %v", err) + } + + err := ValidateAll(homes, req) + if err == nil { + t.Fatal("expected aggregated error, got nil") + } + if !strings.Contains(err.Error(), "divergent") { + t.Errorf("should surface divergent: %v", err) + } + + // Account `b` must have been configured successfully even though `a` + // failed. Otherwise the operator cannot see the full state at once. + for _, sl := range req { + if _, err := os.Lstat(filepath.Join(homes[1], sl.Name)); err != nil { + t.Errorf("account b link %s should have been created despite a's failure: %v", sl.Name, err) + } + } +} + +func TestValidateAll_emptyListReturnsError(t *testing.T) { + if err := ValidateAll(nil, nil); err == nil { + t.Fatal("expected error for empty account list") + } +} + +// TestRequiredShared_defaultsAreReasonable pins the default SharedSymlink +// list so an accidental edit that breaks production is caught. +func TestRequiredShared_defaultsAreReasonable(t *testing.T) { + want := map[string]string{ + "session-env": "/home/ubuntu/.claude-session-env-shared", + "file-history": "/home/ubuntu/.claude-file-history-shared", + "projects": "/home/ubuntu/.claude-projects-shared", + } + if len(RequiredShared) != len(want) { + t.Fatalf("RequiredShared has %d entries, want %d", len(RequiredShared), len(want)) + } + for _, sl := range RequiredShared { + if got, ok := want[sl.Name]; !ok { + t.Errorf("unexpected RequiredShared entry %q", sl.Name) + } else if got != sl.Target { + t.Errorf("RequiredShared %q target = %q, want %q", sl.Name, sl.Target, got) + } + } +} From e16e3526a0d8905b6e3f34051a6c643e2dc5f84c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Apr 2026 19:03:43 +0000 Subject: [PATCH 2/4] feat(lifecycle): validate shared symlinks at daemon startup (A2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire symlinks.ValidateAll into the lifecycle manager so the daemon refuses to start if any configured account is missing one of the shared-state symlinks or if a link diverges from the canonical target. Previously, a missing link on a freshly deployed VM would silently create a divergent state tree per account (duplicate JSONL transcripts, broken undo history) — exactly the failure mode the symlinks package (A1) was introduced to prevent. The check runs once at startup before EnsureAllSessions, guarding a single well-defined invariant: "every account home shares the same projects/, file-history/ and session-env/ roots". No auto-heal on divergence — we fail fast with an explicit error so the operator fixes it manually rather than one account's state being overwritten. Part of Phase 1 Chantier A — Failover robuste. Co-Authored-By: Claude Opus 4.7 (1M context) --- VERSION.md | 29 ++++++++++++++++++++++++++++- cmd/claude-failover/main.go | 9 +++++++++ internal/lifecycle/manager.go | 31 +++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/VERSION.md b/VERSION.md index c699e3c..b777c33 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,31 @@ -# Version actuelle : 0.3.5 +# Version actuelle : 0.3.6 + +## [0.3.6] - 2026-04-16 +**Type:** Patch — Phase 1 / Chantier A2 : validation des symlinks au startup + +### Ajouté +- `Manager.ValidateSharedSymlinks()` : nouvelle méthode dans + `internal/lifecycle` qui agrège les `Home` de tous les comptes + configurés et délègue à `symlinks.ValidateAll`. Échoue dur si un + compte n'a pas de `home` défini ou si un lien est absent/divergent. +- `cmd/claude-failover/main.go` appelle cette validation **avant** + `EnsureAllSessions()` : un état partagé cassé ne laissera plus le + daemon démarrer et divergér silencieusement. + +### Rationale +- Un opérateur qui copie la config sur une nouvelle VM ne peut plus + oublier les liens — le daemon refuse de démarrer jusqu'à ce qu'ils + soient corrects. +- Pas d'auto-heal sur divergence : on préfère un message d'erreur + explicite à un `rm -f` silencieux qui détruirait l'autre compte. + +### Tests +- ✅ `go test ./...` : tous les packages PASS (incluant + `internal/lifecycle` et `internal/symlinks`). + +### Fichiers modifiés +- `cmd/claude-failover/main.go` (+9) +- `internal/lifecycle/manager.go` (+31) ## [0.3.5] - 2026-04-16 **Type:** Patch — Phase 1 / Chantier A1 : package `internal/symlinks` diff --git a/cmd/claude-failover/main.go b/cmd/claude-failover/main.go index 8bc8fc5..2c29f89 100644 --- a/cmd/claude-failover/main.go +++ b/cmd/claude-failover/main.go @@ -51,6 +51,15 @@ func main() { // Initialise tmux client and lifecycle manager. tmuxClient := tmux.NewExecClient() lm := lifecycle.New(tmuxClient, s, cfg) + + // Validate (and self-heal) the shared-state symlinks BEFORE spawning + // any sessions. A divergent link would silently fork transcripts + // between accounts and make failover destructive, so we fail fast here + // rather than after work is in flight. + if err := lm.ValidateSharedSymlinks(); err != nil { + log.Fatalf("shared symlinks validation failed: %v", err) + } + lm.EnsureAllSessions() // Block until SIGINT or SIGTERM. diff --git a/internal/lifecycle/manager.go b/internal/lifecycle/manager.go index 40fa4b0..eeed9cc 100644 --- a/internal/lifecycle/manager.go +++ b/internal/lifecycle/manager.go @@ -4,11 +4,13 @@ package lifecycle import ( "context" + "fmt" "log" "time" "forge.secuaas.ovh/olivier/claude-failover/internal/config" "forge.secuaas.ovh/olivier/claude-failover/internal/state" + "forge.secuaas.ovh/olivier/claude-failover/internal/symlinks" "forge.secuaas.ovh/olivier/claude-failover/internal/tmux" ) @@ -47,6 +49,35 @@ func (m *Manager) Run(ctx context.Context) { } } +// ValidateSharedSymlinks verifies that every configured account home has +// the three shared-state symlinks (session-env, file-history, projects) +// in place and pointing at the canonical shared targets. +// +// Called once at daemon startup BEFORE sessions are recreated. A missing +// or divergent link would silently fork the state tree between the two +// accounts, breaking failover. We fail fast so the operator fixes it +// before any work is in flight. +// +// EnsureForAccount creates missing links but refuses to touch divergent +// ones — see internal/symlinks for the rationale. +func (m *Manager) ValidateSharedSymlinks() error { + if len(m.config.Accounts) == 0 { + return fmt.Errorf("[lifecycle] no accounts configured — cannot validate shared symlinks") + } + homes := make([]string, 0, len(m.config.Accounts)) + for _, acc := range m.config.Accounts { + if acc.Home == "" { + return fmt.Errorf("[lifecycle] account %q has empty home — refusing to continue", acc.Name) + } + homes = append(homes, acc.Home) + } + if err := symlinks.ValidateAll(homes, symlinks.RequiredShared); err != nil { + return fmt.Errorf("shared symlinks invalid, refusing to start: %w", err) + } + m.logger.Printf("[lifecycle] shared symlinks OK for %d account(s)", len(homes)) + return nil +} + // EnsureAllSessions creates all configured sessions that are not yet present in tmux. // It is intended to be called once at daemon startup before Run is launched. func (m *Manager) EnsureAllSessions() { From 8eaf0bbd359e36c0b0feac5e2e2e22499d3eb663 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Apr 2026 19:34:03 +0000 Subject: [PATCH 3/4] feat(switcher): ensure shared symlinks on target home after flip (A3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire symlinks.EnsureForAccount into executeSwitch, called immediately after the ~/.claude flip. Guarantees the three shared-state links (session-env, file-history, projects) exist on the target account home even for freshly-provisioned accounts, preventing silent transcript duplication and undo-history divergence on first resume. Best-effort: errors are logged as WARN but never abort the swap. If we returned here the daemon would be left inconsistent (symlink flipped, SetActiveAccount never called). Operator sees the warning in logs and resolves divergent links manually. Tests: - TestFlipReconcilesSharedSymlinksOnTargetHome: empty target home gets all three links pointing at canonical targets after the flip. - TestFlipEnsureSymlinksFailureDoesNotAbortSwap: a planted divergent link triggers the symlinks-package error; the swap completes anyway and the active account is updated. Hermetic: added AccountSwitcher.sharedSymlinks override so tests scope the reconcile inside t.TempDir() and never touch /home/ubuntu/.claude-*-shared. Existing tests migrated to this pattern and hardcoded /tmp/claude-*-xxxx paths replaced with tmpdirs. Phase 1 / Chantier A — task A3. --- VERSION.md | 46 ++++++- internal/switcher/account_switcher.go | 26 ++++ internal/switcher/account_switcher_test.go | 136 ++++++++++++++++++++- 3 files changed, 204 insertions(+), 4 deletions(-) diff --git a/VERSION.md b/VERSION.md index b777c33..e9acf7d 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,48 @@ -# Version actuelle : 0.3.6 +# Version actuelle : 0.3.7 + +## [0.3.7] - 2026-04-16 +**Type:** Patch — Phase 1 / Chantier A3 : wire EnsureForAccount post-flip + +### Ajouté +- `AccountSwitcher.executeSwitch` appelle désormais + `symlinks.EnsureForAccount(target.Home, ...)` **juste après** le flip + du lien principal `~/.claude`. Garantit que les 3 liens partagés + (`session-env`, `file-history`, `projects`) existent et pointent aux + bons targets sur le compte cible, même si celui-ci vient juste + d'être provisionné. +- `AccountSwitcher.sharedSymlinks` : override test-only (accepte une + liste `[]symlinks.SharedSymlink`). Défaut = `symlinks.RequiredShared`. + Les tests peuvent scoper la réconciliation dans un `t.TempDir()` pour + ne jamais toucher `/home/ubuntu/.claude-*-shared`. +- 2 tests unitaires : + - `TestFlipReconcilesSharedSymlinksOnTargetHome` : target home vide → + les 3 liens sont créés après le flip et pointent aux targets canoniques. + - `TestFlipEnsureSymlinksFailureDoesNotAbortSwap` : lien divergent + planté à la main → `EnsureForAccount` renvoie une erreur, logguée + en WARN, mais le swap complète quand même (best-effort post-flip). + +### Rationale +- Sans cet appel, un compte cible fraîchement provisionné n'aurait + pas encore ses 3 liens ; au premier `claude --resume`, Claude Code + écrirait dans `~/.claude/projects/` (privé) au lieu de + `/home/ubuntu/.claude-projects-shared` → transcripts dupliqués, + undo désynchronisé, resume silencieusement cassé. +- L'ensure est **best-effort** : une erreur est logguée en WARN mais + NE bloque PAS le flip. Si on abortait ici, on laisserait le daemon + dans un état incohérent (symlink déjà flippé mais `SetActiveAccount` + pas appelé). +- L'opérateur voit le WARN dans les logs et peut corriger la + divergence manuellement (ex: lien pointant sur le mauvais target). + +### Tests +- ✅ `go test ./...` : tous les packages PASS (incluant + `internal/switcher` et `internal/symlinks`). +- ✅ `go test -race ./internal/switcher/...` : PASS. +- ✅ `go vet ./...` : clean. + +### Fichiers modifiés +- `internal/switcher/account_switcher.go` +- `internal/switcher/account_switcher_test.go` ## [0.3.6] - 2026-04-16 **Type:** Patch — Phase 1 / Chantier A2 : validation des symlinks au startup diff --git a/internal/switcher/account_switcher.go b/internal/switcher/account_switcher.go index e8de796..9e854b1 100644 --- a/internal/switcher/account_switcher.go +++ b/internal/switcher/account_switcher.go @@ -17,6 +17,7 @@ import ( "forge.secuaas.ovh/olivier/claude-failover/internal/notify" "forge.secuaas.ovh/olivier/claude-failover/internal/quota" "forge.secuaas.ovh/olivier/claude-failover/internal/state" + "forge.secuaas.ovh/olivier/claude-failover/internal/symlinks" "forge.secuaas.ovh/olivier/claude-failover/internal/tmux" ) @@ -52,6 +53,11 @@ type AccountSwitcher struct { // homeDir is the directory containing the .claude symlink. Overridable for tests. // When empty, os.UserHomeDir() is used. homeDir string + // sharedSymlinks is the list of shared-state links reconciled on the + // target account home after every flip. Overridable for tests so the + // suite never touches the operator's real /home/ubuntu/.claude-* + // shared directories. When nil, symlinks.RequiredShared is used. + sharedSymlinks []symlinks.SharedSymlink } // New creates an AccountSwitcher. @@ -110,6 +116,16 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { if err := a.flipSymlink(target.Home); err != nil { a.logger.Printf("[switcher] flipSymlink error: %v", err) } + // Best-effort: make sure the target account home exposes the three + // shared-state symlinks (session-env, file-history, projects). The main + // ~/.claude flip is already done, so an error here must NOT abort the + // swap — we just log it so the operator can investigate. Without this + // call, a fresh target account with no shared links would silently + // start writing into private /projects/session-env/file-history dirs + // and diverge from the primary account's transcripts. + if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil { + a.logger.Printf("[switcher] WARN ensure shared symlinks for %q: %v", target.Home, err) + } a.killAllPoolSessions() a.recreatePoolSessions() a.relaunchDedicatedSessions(target.Home) @@ -225,6 +241,16 @@ func (a *AccountSwitcher) saveAllSessions() { }) } +// requiredShared returns the shared-symlink list used when reconciling the +// target account home after a flip. Tests may set a.sharedSymlinks to a +// tmpdir-scoped list so they never touch /home/ubuntu/.claude-*-shared. +func (a *AccountSwitcher) requiredShared() []symlinks.SharedSymlink { + if a.sharedSymlinks != nil { + return a.sharedSymlinks + } + return symlinks.RequiredShared +} + // resolveHomeDir returns the configured homeDir (test override) or the real // user home. Tests MUST set a.homeDir to a tmpdir to avoid clobbering the // production ~/.claude symlink. diff --git a/internal/switcher/account_switcher_test.go b/internal/switcher/account_switcher_test.go index 8c3b292..d2f327d 100644 --- a/internal/switcher/account_switcher_test.go +++ b/internal/switcher/account_switcher_test.go @@ -1,6 +1,8 @@ package switcher import ( + "os" + "path/filepath" "strings" "testing" "time" @@ -8,8 +10,19 @@ import ( "forge.secuaas.ovh/olivier/claude-failover/internal/config" "forge.secuaas.ovh/olivier/claude-failover/internal/quota" "forge.secuaas.ovh/olivier/claude-failover/internal/state" + "forge.secuaas.ovh/olivier/claude-failover/internal/symlinks" ) +// tmpShared returns a SharedSymlink list whose targets live entirely under +// tmpDir, so switcher tests never touch /home/ubuntu/.claude-*-shared. +func tmpShared(tmpDir string) []symlinks.SharedSymlink { + return []symlinks.SharedSymlink{ + {Target: filepath.Join(tmpDir, "session-env-shared"), Name: "session-env"}, + {Target: filepath.Join(tmpDir, "file-history-shared"), Name: "file-history"}, + {Target: filepath.Join(tmpDir, "projects-shared"), Name: "projects"}, + } +} + // mockTmux for switcher tests. type mockTmux struct { sessions map[string]bool @@ -143,6 +156,9 @@ func TestKillAndRecreatePoolSessions(t *testing.T) { // touches the real ~/.claude (regression: a reboot used to leave Claude // Code unusable because the test had repointed ~/.claude to /tmp/...). a.homeDir = t.TempDir() + // Scope shared-symlink targets to a tmpdir so the post-flip ensure + // pass does not write inside /home/ubuntu/.claude-*-shared. + a.sharedSymlinks = tmpShared(t.TempDir()) a.executeSwitch(quota.SwitchRequest{From: "compte1"}) // Active account must have changed. @@ -186,10 +202,12 @@ func TestDedicatedRelaunchAfterSwap(t *testing.T) { s := state.New("") s.SetActiveAccount("compte1") + home1 := filepath.Join(t.TempDir(), "claude-1-xxxx") + home2 := filepath.Join(t.TempDir(), "claude-2-xxxx") cfg := &config.Config{ Accounts: []config.AccountConfig{ - {Name: "compte1", Home: "/tmp/claude-1-xxxx"}, - {Name: "compte2", Home: "/tmp/claude-2-xxxx"}, + {Name: "compte1", Home: home1}, + {Name: "compte2", Home: home2}, }, Pool: config.PoolConfig{ Dedicated: []config.DedicatedSession{{Name: "dedicated-1", Project: "/tmp"}}, @@ -199,6 +217,7 @@ func TestDedicatedRelaunchAfterSwap(t *testing.T) { a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) a.homeDir = t.TempDir() + a.sharedSymlinks = tmpShared(t.TempDir()) a.executeSwitch(quota.SwitchRequest{From: "compte1"}) // The relaunch must send a resume command on the dedicated session, @@ -213,10 +232,121 @@ func TestDedicatedRelaunchAfterSwap(t *testing.T) { if relaunch == "" { t.Fatalf("expected dedicated-1 relaunch send-keys; got %v", tc.sendKeyCalls) } - if !strings.Contains(relaunch, "CLAUDE_CONFIG_DIR=/tmp/claude-2-xxxx") { + if !strings.Contains(relaunch, "CLAUDE_CONFIG_DIR="+home2) { t.Errorf("relaunch should set CLAUDE_CONFIG_DIR to target home; got %q", relaunch) } if !strings.Contains(relaunch, "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee") { t.Errorf("relaunch should include captured UUID; got %q", relaunch) } } + +// TestFlipReconcilesSharedSymlinksOnTargetHome verifies A3: after the main +// ~/.claude flip, the switcher reconciles the three shared-state symlinks +// (session-env / file-history / projects) on the TARGET account home. +// Scenario: the target home has NO links yet — a freshly-provisioned account +// that has never been flipped into. Post-switch, all three links must exist +// inside the target home and point at the canonical shared targets. +func TestFlipReconcilesSharedSymlinksOnTargetHome(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + // Target home starts empty: EnsureForAccount will mkdir + create links. + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + a.homeDir = t.TempDir() + shared := tmpShared(t.TempDir()) + a.sharedSymlinks = shared + + // Pre-assert: no link exists in targetHome. + for _, sl := range shared { + if _, err := os.Lstat(filepath.Join(targetHome, sl.Name)); !os.IsNotExist(err) { + t.Fatalf("pre-condition: %q should not exist yet (err=%v)", sl.Name, err) + } + } + + a.executeSwitch(quota.SwitchRequest{From: "compte1"}) + + // Post-assert: every required link exists and points at the canonical + // target under the tmpdir-scoped shared root. + for _, sl := range shared { + linkPath := filepath.Join(targetHome, sl.Name) + info, err := os.Lstat(linkPath) + if err != nil { + t.Errorf("expected link at %s after flip: %v", linkPath, err) + continue + } + if info.Mode()&os.ModeSymlink == 0 { + t.Errorf("%s exists but is not a symlink", linkPath) + continue + } + got, err := os.Readlink(linkPath) + if err != nil { + t.Errorf("readlink %s: %v", linkPath, err) + continue + } + if got != sl.Target { + t.Errorf("link %s points to %q, want %q", linkPath, got, sl.Target) + } + } +} + +// TestFlipEnsureSymlinksFailureDoesNotAbortSwap verifies A3 best-effort: +// if EnsureForAccount returns an error (here: a divergent pre-existing link +// that the symlinks package refuses to auto-correct), the flip and the swap +// MUST still complete. The shared symlink reconcile is post-flip cleanup, +// not a gate on the failover itself — aborting here would leave the daemon +// in an inconsistent state (symlink flipped but active account not updated). +func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(targetHome, 0700); err != nil { + t.Fatalf("mkdir target home: %v", err) + } + // Plant a divergent link at /session-env. The symlinks + // package refuses to auto-correct this (data-loss safeguard) and will + // return an error — which the switcher must swallow with a WARN log. + bogus := filepath.Join(t.TempDir(), "somewhere-else") + if err := os.MkdirAll(bogus, 0700); err != nil { + t.Fatalf("mkdir bogus: %v", err) + } + if err := os.Symlink(bogus, filepath.Join(targetHome, "session-env")); err != nil { + t.Fatalf("plant divergent link: %v", err) + } + + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + a.homeDir = t.TempDir() + a.sharedSymlinks = tmpShared(t.TempDir()) + + a.executeSwitch(quota.SwitchRequest{From: "compte1"}) + + // The swap must have completed despite the divergent-link error. + if got := s.ActiveAccount(); got != "compte2" { + t.Errorf("swap should complete even when ensure fails; active=%q want compte2", got) + } +} From 20063b1939a5391763c588bdc42e4fdf051aba4c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Apr 2026 19:53:48 +0000 Subject: [PATCH 4/4] fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour) --- VERSION.md | 62 +++++++++- internal/switcher/account_switcher.go | 115 ++++++++++++++++-- internal/switcher/account_switcher_test.go | 133 ++++++++++++++++++--- internal/symlinks/shared_test.go | 70 +++++++++++ 4 files changed, 356 insertions(+), 24 deletions(-) diff --git a/VERSION.md b/VERSION.md index e9acf7d..9fa2f32 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,64 @@ -# Version actuelle : 0.3.7 +# Version actuelle : 0.3.8 + +## [0.3.8] - 2026-04-16 +**Type:** Patch — Bug #1 (A3 flip+ensure inconsistency) + Bug #10 (requiredShared contract test) + +### Corrigé — Bug #1 (CRITIQUE) +- `AccountSwitcher.executeSwitch` ne continue plus silencieusement quand + `symlinks.EnsureForAccount` échoue après le flip : il **roll-back** le lien + `~/.claude` vers le home du compte précédent et **n'appelle pas** + `SetActiveAccount`. Évite l'état incohérent où le daemon déclare le compte + cible actif alors que ses shared symlinks sont divergents → transcripts + dupliqués silencieusement, resume cassé. +- Si le rollback réussit : swap annulé, état filesystem = état pré-swap, + erreur explicite retournée par `executeSwitchE`. +- Si ensure ET rollback échouent : `partialSwap` atomique sticky set, + `ErrPartialSwap` retourné, tout futur swap est refusé tant que le + daemon n'est pas redémarré par l'opérateur. +- Nouvelle méthode publique `AccountSwitcher.IsPartialSwap() bool` pour + que health-checks et watchdog exposent l'état dégradé. + +### Ajouté — Tests Bug #1 +- `TestFlipEnsureFailureTriggersRollback` : plant un lien divergent sur + le home cible → ensure échoue → rollback réussit → `ActiveAccount` reste + compte1 → `~/.claude` pointe sur previousHome → `IsPartialSwap` = false. +- `TestFlipEnsureAndRollbackFailure` : force les deux flips à échouer + (homeDir = fichier régulier) → `ErrPartialSwap` retourné, flag sticky + set, swap suivant refusé. + +### Ajouté — Bug #10 +- `TestRequiredSharedIsCoherent` (`internal/symlinks/shared_test.go`) : + valide le contrat de la constante package-level `RequiredShared` jamais + exercée auparavant (tous les autres tests utilisent un override scoped + en `t.TempDir()`). Vérifie sans toucher au filesystem : + - exactement 3 entrées (`session-env`, `file-history`, `projects`) + - targets absolus + - `filepath.Dir(target)` identique pour les 3 entrées (invariant + "3 liens sous un même shared root" sur lequel repose `EnsureForAccount`). + +### Rationale +- Continuer après un ensure échoué revient à valider que le compte cible + est "sain" alors que les shared symlinks sont absents ou divergents. + Conséquence en prod : premier `claude --resume` écrit dans + `~/.claude/projects/` (privé) → transcripts dupliqués, undo + désynchronisé, failover complètement cassé sans log d'alerte. +- Le rollback garantit qu'un compte cible mal configuré ne peut PAS + dégrader le state du daemon : on retourne à l'état pré-swap et on + signale l'erreur à l'appelant. +- `ErrPartialSwap` + `IsPartialSwap()` documente un état où l'intervention + humaine est obligatoire — préférable à un retry automatique qui + empirerait la divergence. + +### Tests +- ✅ `go test ./...` : tous les packages PASS +- ✅ `go test -race ./...` : PASS, aucun data race +- ✅ `go vet ./...` : clean +- ✅ `go build ./...` : clean + +### Fichiers modifiés +- `internal/switcher/account_switcher.go` (+rollback + IsPartialSwap + ErrPartialSwap) +- `internal/switcher/account_switcher_test.go` (2 nouveaux tests, 1 test obsolète remplacé) +- `internal/symlinks/shared_test.go` (+TestRequiredSharedIsCoherent) ## [0.3.7] - 2026-04-16 **Type:** Patch — Phase 1 / Chantier A3 : wire EnsureForAccount post-flip diff --git a/internal/switcher/account_switcher.go b/internal/switcher/account_switcher.go index 9e854b1..3778403 100644 --- a/internal/switcher/account_switcher.go +++ b/internal/switcher/account_switcher.go @@ -4,6 +4,7 @@ package switcher import ( "context" + "errors" "fmt" "log" "os" @@ -11,6 +12,7 @@ import ( "regexp" "strconv" "strings" + "sync/atomic" "time" "forge.secuaas.ovh/olivier/claude-failover/internal/config" @@ -21,6 +23,16 @@ import ( "forge.secuaas.ovh/olivier/claude-failover/internal/tmux" ) +// ErrPartialSwap is returned (and wrapped) when the switcher flipped +// ~/.claude to the target home, EnsureForAccount failed on the target, +// and the rollback flip back to the previous home ALSO failed. The daemon +// is in a documented degraded state: the active-account setter was NOT +// called, but the filesystem symlink may point at an account whose shared +// state is inconsistent. Operator intervention is required. Callers can +// interrogate AccountSwitcher.IsPartialSwap() to expose the flag to +// health-checks / watchdogs. +var ErrPartialSwap = errors.New("switcher: partial swap — flip succeeded but ensure + rollback both failed") + // SwitchState represents the current phase of a failover operation. type SwitchState string @@ -58,6 +70,14 @@ type AccountSwitcher struct { // suite never touches the operator's real /home/ubuntu/.claude-* // shared directories. When nil, symlinks.RequiredShared is used. sharedSymlinks []symlinks.SharedSymlink + // partialSwap is set to 1 when a flip+ensure+rollback sequence left the + // daemon in an inconsistent state (symlink possibly flipped, but active + // account NOT updated, and rollback flip ALSO failed). Health-checks / + // watchdogs read this flag via IsPartialSwap(). It is sticky: once set, + // it stays set until the operator restarts the daemon after fixing the + // filesystem state. We use atomic access so watchdog goroutines can read + // it without blocking the switcher. + partialSwap atomic.Bool } // New creates an AccountSwitcher. @@ -94,8 +114,30 @@ func (a *AccountSwitcher) Run(ctx context.Context) { // executeSwitch performs the full failover sequence. func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { + if err := a.executeSwitchE(req); err != nil { + // executeSwitchE already logs the detail; we swallow the error here + // because the public Run loop has no return channel. The partialSwap + // flag (if set) remains visible via IsPartialSwap(). + a.logger.Printf("[switcher] SWAP aborted: %v", err) + } +} + +// executeSwitchE runs the swap and returns an error describing any abort or +// partial-swap condition. Split out from executeSwitch so tests can assert +// on the error value without routing through a channel. +func (a *AccountSwitcher) executeSwitchE(req quota.SwitchRequest) error { a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime) + // Refuse to proceed if a previous swap left the daemon in an + // inconsistent state. The operator must intervene (fix the filesystem, + // restart the daemon) before any further failover can be attempted — + // otherwise we'd stack symlink flips on top of a broken state. + if a.partialSwap.Load() { + err := fmt.Errorf("refusing swap: daemon is in partial-swap degraded state (operator intervention required)") + a.logger.Printf("[switcher] %v", err) + return err + } + // 1. SAVING — capture resume UUIDs from all working sessions plus // every dedicated session unconditionally (dedicated sessions are // user-driven and may not be tracked as "working" in state, but their @@ -110,21 +152,51 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { if target == nil { a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From) a.currentState = StateNormal - return + return nil } + previous := a.findAccountByName(req.From) if err := a.flipSymlink(target.Home); err != nil { a.logger.Printf("[switcher] flipSymlink error: %v", err) } - // Best-effort: make sure the target account home exposes the three - // shared-state symlinks (session-env, file-history, projects). The main - // ~/.claude flip is already done, so an error here must NOT abort the - // swap — we just log it so the operator can investigate. Without this - // call, a fresh target account with no shared links would silently - // start writing into private /projects/session-env/file-history dirs - // and diverge from the primary account's transcripts. + // Ensure the target account home exposes the three shared-state + // symlinks (session-env, file-history, projects). If this fails we + // MUST NOT proceed with SetActiveAccount — the daemon would otherwise + // declare the target "active" while its shared state is divergent, + // silently writing transcripts into private /projects directories and + // breaking `claude --resume` across sessions. Instead we attempt to + // roll back the ~/.claude flip to the previous account. If the + // rollback also fails, the daemon is in a documented degraded state + // (ErrPartialSwap) and the operator must intervene. if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil { - a.logger.Printf("[switcher] WARN ensure shared symlinks for %q: %v", target.Home, err) + a.logger.Printf("[switcher] ensure shared symlinks for %q failed: %v — attempting rollback", target.Home, err) + if previous == nil || previous.Home == "" { + // No known previous home to roll back to — set the degraded + // flag and bail out. This is equivalent to a rollback failure + // because the filesystem is pointed at a broken target. + a.partialSwap.Store(true) + a.currentState = StateNormal + return fmt.Errorf("%w: ensure failed (%v) and no previous account home is known for rollback", ErrPartialSwap, err) + } + if rbErr := a.flipSymlink(previous.Home); rbErr != nil { + // Both the ensure AND the rollback failed. The daemon is now + // in a documented inconsistent state: ~/.claude may point at + // target whose shared-state is divergent, but SetActiveAccount + // has NOT been called so state.ActiveAccount is still the + // previous account. No further failover can be attempted + // until the operator intervenes. + a.partialSwap.Store(true) + a.logger.Printf("[switcher] CRITICAL partial swap: ensure=%v rollback=%v — daemon in degraded state, operator intervention required", err, rbErr) + a.currentState = StateNormal + return fmt.Errorf("%w: ensure=%v rollback=%v", ErrPartialSwap, err, rbErr) + } + // Rollback succeeded — symlink is back on the previous account, + // SetActiveAccount was NEVER called, state is consistent with + // "no swap happened". Return an explicit error so the caller + // knows the swap was cancelled. + a.logger.Printf("[switcher] rollback successful: ~/.claude → %s (swap cancelled)", previous.Home) + a.currentState = StateNormal + return fmt.Errorf("swap cancelled: ensure shared symlinks failed on target %q: %w", target.Home, err) } a.killAllPoolSessions() a.recreatePoolSessions() @@ -151,6 +223,31 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { } a.currentState = StateNormal + return nil +} + +// IsPartialSwap reports whether the switcher is in a degraded state after a +// flip+ensure+rollback sequence all failed. Health-checks and watchdogs use +// this signal to surface an operator-actionable alert. The flag is sticky +// for the lifetime of the process: once set, it remains set until the daemon +// is restarted (after the operator has fixed the filesystem). +func (a *AccountSwitcher) IsPartialSwap() bool { + return a.partialSwap.Load() +} + +// findAccountByName returns the account config entry matching name, or nil. +// Unlike findTargetAccount (which returns the first NON-matching account), +// this is used by the rollback path to recover the previous home. +func (a *AccountSwitcher) findAccountByName(name string) *config.AccountConfig { + if name == "" { + return nil + } + for i := range a.config.Accounts { + if a.config.Accounts[i].Name == name { + return &a.config.Accounts[i] + } + } + return nil } // saveDedicatedUUIDs captures the resume UUID for every configured dedicated diff --git a/internal/switcher/account_switcher_test.go b/internal/switcher/account_switcher_test.go index d2f327d..14df33e 100644 --- a/internal/switcher/account_switcher_test.go +++ b/internal/switcher/account_switcher_test.go @@ -1,6 +1,7 @@ package switcher import ( + "errors" "os" "path/filepath" "strings" @@ -302,25 +303,34 @@ func TestFlipReconcilesSharedSymlinksOnTargetHome(t *testing.T) { } } -// TestFlipEnsureSymlinksFailureDoesNotAbortSwap verifies A3 best-effort: -// if EnsureForAccount returns an error (here: a divergent pre-existing link -// that the symlinks package refuses to auto-correct), the flip and the swap -// MUST still complete. The shared symlink reconcile is post-flip cleanup, -// not a gate on the failover itself — aborting here would leave the daemon -// in an inconsistent state (symlink flipped but active account not updated). -func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { +// TestFlipEnsureFailureTriggersRollback verifies the fix for the A3 bug +// (flip+ensure inconsistency): if EnsureForAccount fails on the target home +// after the ~/.claude flip, the switcher MUST NOT mark the target account +// active. It must instead roll back the ~/.claude symlink to the previous +// account's home, leaving the daemon in the pre-swap state so subsequent +// session work keeps writing to the known-good shared state. +// +// Old (buggy) behaviour: ensure error was WARN-only, SetActiveAccount still +// happened, dedicated sessions were relaunched against a target whose +// /projects, /session-env, /file-history were missing or divergent → +// transcripts duplicated silently, resume broke, undo history diverged. +func TestFlipEnsureFailureTriggersRollback(t *testing.T) { tc := newMockTmux() s := state.New("") s.SetActiveAccount("compte1") + previousHome := filepath.Join(t.TempDir(), "claude-compte1") targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(previousHome, 0700); err != nil { + t.Fatalf("mkdir previous home: %v", err) + } if err := os.MkdirAll(targetHome, 0700); err != nil { t.Fatalf("mkdir target home: %v", err) } // Plant a divergent link at /session-env. The symlinks // package refuses to auto-correct this (data-loss safeguard) and will - // return an error — which the switcher must swallow with a WARN log. + // return an error, which must now trigger a rollback. bogus := filepath.Join(t.TempDir(), "somewhere-else") if err := os.MkdirAll(bogus, 0700); err != nil { t.Fatalf("mkdir bogus: %v", err) @@ -331,7 +341,7 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { cfg := &config.Config{ Accounts: []config.AccountConfig{ - {Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")}, + {Name: "compte1", Home: previousHome}, {Name: "compte2", Home: targetHome}, }, Pool: config.PoolConfig{ @@ -340,13 +350,108 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) { } a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) - a.homeDir = t.TempDir() + homeDir := t.TempDir() + a.homeDir = homeDir a.sharedSymlinks = tmpShared(t.TempDir()) - a.executeSwitch(quota.SwitchRequest{From: "compte1"}) + err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}) + if err == nil { + t.Fatalf("executeSwitchE: expected cancellation error, got nil") + } + // The public symmetric swap-cancelled error must mention ensure and + // wrap the underlying symlinks package message. ErrPartialSwap must + // NOT be set (rollback succeeded → recoverable condition). + if errors.Is(err, ErrPartialSwap) { + t.Errorf("did not expect ErrPartialSwap; rollback succeeded; got %v", err) + } + if a.IsPartialSwap() { + t.Errorf("IsPartialSwap should be false when rollback succeeds") + } - // The swap must have completed despite the divergent-link error. - if got := s.ActiveAccount(); got != "compte2" { - t.Errorf("swap should complete even when ensure fails; active=%q want compte2", got) + // Active account must remain the previous one — SetActiveAccount must + // NOT have been called. + if got := s.ActiveAccount(); got != "compte1" { + t.Errorf("active account should stay compte1 after rollback; got %q", got) + } + + // ~/.claude must now point at the previous home (rollback target). + link, rlErr := os.Readlink(filepath.Join(homeDir, ".claude")) + if rlErr != nil { + t.Fatalf("readlink ~/.claude: %v", rlErr) + } + if link != previousHome { + t.Errorf("~/.claude should point at previous home %q after rollback; got %q", previousHome, link) + } +} + +// TestFlipEnsureAndRollbackFailure verifies that when BOTH EnsureForAccount +// AND the rollback flip fail, the switcher sets the sticky partial-swap +// flag and returns ErrPartialSwap. The daemon is then in a documented +// degraded state where any further swap is refused until the operator +// restarts it. +func TestFlipEnsureAndRollbackFailure(t *testing.T) { + tc := newMockTmux() + + s := state.New("") + s.SetActiveAccount("compte1") + + previousHome := filepath.Join(t.TempDir(), "claude-compte1") + targetHome := filepath.Join(t.TempDir(), "claude-compte2") + if err := os.MkdirAll(previousHome, 0700); err != nil { + t.Fatalf("mkdir previous home: %v", err) + } + if err := os.MkdirAll(targetHome, 0700); err != nil { + t.Fatalf("mkdir target home: %v", err) + } + // Plant the divergent link that will cause EnsureForAccount to fail. + bogus := filepath.Join(t.TempDir(), "somewhere-else") + if err := os.MkdirAll(bogus, 0700); err != nil { + t.Fatalf("mkdir bogus: %v", err) + } + if err := os.Symlink(bogus, filepath.Join(targetHome, "session-env")); err != nil { + t.Fatalf("plant divergent link: %v", err) + } + + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: previousHome}, + {Name: "compte2", Home: targetHome}, + }, + Pool: config.PoolConfig{ + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + + // Force the rollback flip to fail: point homeDir at a file that cannot + // host a .claude symlink. We use a regular file; the flipSymlink + // implementation does os.Remove() then os.Symlink() under homeDir, + // which fails when homeDir is itself a file (ENOTDIR). + badHomeFile := filepath.Join(t.TempDir(), "not-a-dir") + if err := os.WriteFile(badHomeFile, []byte("block"), 0600); err != nil { + t.Fatalf("write bad home: %v", err) + } + a.homeDir = badHomeFile + a.sharedSymlinks = tmpShared(t.TempDir()) + + err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}) + if err == nil { + t.Fatalf("expected ErrPartialSwap, got nil") + } + if !errors.Is(err, ErrPartialSwap) { + t.Errorf("expected ErrPartialSwap, got %v", err) + } + if !a.IsPartialSwap() { + t.Errorf("IsPartialSwap should be true when both ensure AND rollback fail") + } + // SetActiveAccount must still not have been called. + if got := s.ActiveAccount(); got != "compte1" { + t.Errorf("active account must stay compte1 in partial-swap; got %q", got) + } + + // A subsequent swap attempt must be refused while the flag is set. + if err2 := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}); err2 == nil { + t.Errorf("expected subsequent swap to be refused in degraded state") } } diff --git a/internal/symlinks/shared_test.go b/internal/symlinks/shared_test.go index 64625d4..e472678 100644 --- a/internal/symlinks/shared_test.go +++ b/internal/symlinks/shared_test.go @@ -204,3 +204,73 @@ func TestRequiredShared_defaultsAreReasonable(t *testing.T) { } } } + +// TestRequiredSharedIsCoherent validates the contract of the package-level +// RequiredShared constant that the switcher and lifecycle manager consume +// in production. The rest of the suite exercises EnsureForAccount / +// ValidateAll with tmpdir-scoped override lists, so the actual prod +// constant (pointing under /home/ubuntu/.claude-*-shared) is never touched +// by those tests — a regression that shrinks or renames RequiredShared +// would pass every other test but silently break failover on real VMs +// (missing a link → writes to private state → transcripts duplicated). +// +// The test is filesystem-free: it only asserts the shape of the constant. +// +// 1. Exactly three entries, one per Name required by the A-failover design. +// 2. Every Target is absolute. +// 3. All three Targets share the same parent directory — there is no +// mode of operation where one shared dir lives elsewhere than the +// others. `filepath.Dir(target)` must be identical across entries. +// +// This encodes the "3 links under one shared root" invariant that +// EnsureForAccount relies on. Any future change to RequiredShared that +// breaks this invariant should force the author to update the switcher +// contract explicitly. +func TestRequiredSharedIsCoherent(t *testing.T) { + expectedNames := map[string]bool{ + "session-env": false, + "file-history": false, + "projects": false, + } + if len(RequiredShared) != len(expectedNames) { + t.Fatalf("RequiredShared must contain exactly %d entries (session-env, file-history, projects); got %d: %+v", + len(expectedNames), len(RequiredShared), RequiredShared) + } + + var sharedParent string + for i, sl := range RequiredShared { + if _, ok := expectedNames[sl.Name]; !ok { + t.Errorf("RequiredShared[%d]: unexpected Name %q (allowed: session-env / file-history / projects)", i, sl.Name) + continue + } + if expectedNames[sl.Name] { + t.Errorf("RequiredShared[%d]: duplicate Name %q", i, sl.Name) + } + expectedNames[sl.Name] = true + + if sl.Target == "" { + t.Errorf("RequiredShared[%d] (%q): empty Target", i, sl.Name) + continue + } + if !filepath.IsAbs(sl.Target) { + t.Errorf("RequiredShared[%d] (%q): Target %q must be absolute", i, sl.Name, sl.Target) + continue + } + + parent := filepath.Dir(sl.Target) + if i == 0 { + sharedParent = parent + continue + } + if parent != sharedParent { + t.Errorf("RequiredShared[%d] (%q): parent dir %q diverges from %q — all shared targets must live under the same root", + i, sl.Name, parent, sharedParent) + } + } + + for name, seen := range expectedNames { + if !seen { + t.Errorf("RequiredShared is missing the required %q entry", name) + } + } +}