fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10)

Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency
- Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount
  still fired → daemon declared target active while shared symlinks were
  absent/divergent → transcripts silently duplicated, resume broken.
- After: ensure failure triggers rollback flip to previous account home;
  if rollback succeeds → explicit error, ActiveAccount stays on previous.
  If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all
  further swaps refused until operator intervention (daemon restart).
- New public IsPartialSwap() for watchdog / health-check integration.

Bug #10 (MOYENNE) — requiredShared contract never exercised
- All existing tests override a.sharedSymlinks with tmpdir-scoped lists,
  so symlinks.RequiredShared itself was never tested. A rename or drop
  would pass every test but silently break prod failover.
- TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with
  the exact required names, absolute targets, and a single shared parent
  directory (invariant EnsureForAccount depends on).

Tests:
- go test ./... PASS
- go test -race ./... PASS (no data race)
- 2 new switcher tests: TestFlipEnsureFailureTriggersRollback,
  TestFlipEnsureAndRollbackFailure
- 1 new symlinks test: TestRequiredSharedIsCoherent
- 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap
  (encoded the old buggy best-effort behaviour)
This commit is contained in:
Ubuntu 2026-04-16 19:53:48 +00:00
parent 8eaf0bbd35
commit 20063b1939
4 changed files with 356 additions and 24 deletions

View file

@ -1,6 +1,7 @@
package switcher
import (
"errors"
"os"
"path/filepath"
"strings"
@ -302,25 +303,34 @@ func TestFlipReconcilesSharedSymlinksOnTargetHome(t *testing.T) {
}
}
// TestFlipEnsureSymlinksFailureDoesNotAbortSwap verifies A3 best-effort:
// if EnsureForAccount returns an error (here: a divergent pre-existing link
// that the symlinks package refuses to auto-correct), the flip and the swap
// MUST still complete. The shared symlink reconcile is post-flip cleanup,
// not a gate on the failover itself — aborting here would leave the daemon
// in an inconsistent state (symlink flipped but active account not updated).
func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) {
// TestFlipEnsureFailureTriggersRollback verifies the fix for the A3 bug
// (flip+ensure inconsistency): if EnsureForAccount fails on the target home
// after the ~/.claude flip, the switcher MUST NOT mark the target account
// active. It must instead roll back the ~/.claude symlink to the previous
// account's home, leaving the daemon in the pre-swap state so subsequent
// session work keeps writing to the known-good shared state.
//
// Old (buggy) behaviour: ensure error was WARN-only, SetActiveAccount still
// happened, dedicated sessions were relaunched against a target whose
// /projects, /session-env, /file-history were missing or divergent →
// transcripts duplicated silently, resume broke, undo history diverged.
func TestFlipEnsureFailureTriggersRollback(t *testing.T) {
tc := newMockTmux()
s := state.New("")
s.SetActiveAccount("compte1")
previousHome := filepath.Join(t.TempDir(), "claude-compte1")
targetHome := filepath.Join(t.TempDir(), "claude-compte2")
if err := os.MkdirAll(previousHome, 0700); err != nil {
t.Fatalf("mkdir previous home: %v", err)
}
if err := os.MkdirAll(targetHome, 0700); err != nil {
t.Fatalf("mkdir target home: %v", err)
}
// Plant a divergent link at <targetHome>/session-env. The symlinks
// package refuses to auto-correct this (data-loss safeguard) and will
// return an error — which the switcher must swallow with a WARN log.
// return an error, which must now trigger a rollback.
bogus := filepath.Join(t.TempDir(), "somewhere-else")
if err := os.MkdirAll(bogus, 0700); err != nil {
t.Fatalf("mkdir bogus: %v", err)
@ -331,7 +341,7 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) {
cfg := &config.Config{
Accounts: []config.AccountConfig{
{Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")},
{Name: "compte1", Home: previousHome},
{Name: "compte2", Home: targetHome},
},
Pool: config.PoolConfig{
@ -340,13 +350,108 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) {
}
a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil)
a.homeDir = t.TempDir()
homeDir := t.TempDir()
a.homeDir = homeDir
a.sharedSymlinks = tmpShared(t.TempDir())
a.executeSwitch(quota.SwitchRequest{From: "compte1"})
err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"})
if err == nil {
t.Fatalf("executeSwitchE: expected cancellation error, got nil")
}
// The public symmetric swap-cancelled error must mention ensure and
// wrap the underlying symlinks package message. ErrPartialSwap must
// NOT be set (rollback succeeded → recoverable condition).
if errors.Is(err, ErrPartialSwap) {
t.Errorf("did not expect ErrPartialSwap; rollback succeeded; got %v", err)
}
if a.IsPartialSwap() {
t.Errorf("IsPartialSwap should be false when rollback succeeds")
}
// The swap must have completed despite the divergent-link error.
if got := s.ActiveAccount(); got != "compte2" {
t.Errorf("swap should complete even when ensure fails; active=%q want compte2", got)
// Active account must remain the previous one — SetActiveAccount must
// NOT have been called.
if got := s.ActiveAccount(); got != "compte1" {
t.Errorf("active account should stay compte1 after rollback; got %q", got)
}
// ~/.claude must now point at the previous home (rollback target).
link, rlErr := os.Readlink(filepath.Join(homeDir, ".claude"))
if rlErr != nil {
t.Fatalf("readlink ~/.claude: %v", rlErr)
}
if link != previousHome {
t.Errorf("~/.claude should point at previous home %q after rollback; got %q", previousHome, link)
}
}
// TestFlipEnsureAndRollbackFailure verifies that when BOTH EnsureForAccount
// AND the rollback flip fail, the switcher sets the sticky partial-swap
// flag and returns ErrPartialSwap. The daemon is then in a documented
// degraded state where any further swap is refused until the operator
// restarts it.
func TestFlipEnsureAndRollbackFailure(t *testing.T) {
tc := newMockTmux()
s := state.New("")
s.SetActiveAccount("compte1")
previousHome := filepath.Join(t.TempDir(), "claude-compte1")
targetHome := filepath.Join(t.TempDir(), "claude-compte2")
if err := os.MkdirAll(previousHome, 0700); err != nil {
t.Fatalf("mkdir previous home: %v", err)
}
if err := os.MkdirAll(targetHome, 0700); err != nil {
t.Fatalf("mkdir target home: %v", err)
}
// Plant the divergent link that will cause EnsureForAccount to fail.
bogus := filepath.Join(t.TempDir(), "somewhere-else")
if err := os.MkdirAll(bogus, 0700); err != nil {
t.Fatalf("mkdir bogus: %v", err)
}
if err := os.Symlink(bogus, filepath.Join(targetHome, "session-env")); err != nil {
t.Fatalf("plant divergent link: %v", err)
}
cfg := &config.Config{
Accounts: []config.AccountConfig{
{Name: "compte1", Home: previousHome},
{Name: "compte2", Home: targetHome},
},
Pool: config.PoolConfig{
Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0},
},
}
a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil)
// Force the rollback flip to fail: point homeDir at a file that cannot
// host a .claude symlink. We use a regular file; the flipSymlink
// implementation does os.Remove() then os.Symlink() under homeDir,
// which fails when homeDir is itself a file (ENOTDIR).
badHomeFile := filepath.Join(t.TempDir(), "not-a-dir")
if err := os.WriteFile(badHomeFile, []byte("block"), 0600); err != nil {
t.Fatalf("write bad home: %v", err)
}
a.homeDir = badHomeFile
a.sharedSymlinks = tmpShared(t.TempDir())
err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"})
if err == nil {
t.Fatalf("expected ErrPartialSwap, got nil")
}
if !errors.Is(err, ErrPartialSwap) {
t.Errorf("expected ErrPartialSwap, got %v", err)
}
if !a.IsPartialSwap() {
t.Errorf("IsPartialSwap should be true when both ensure AND rollback fail")
}
// SetActiveAccount must still not have been called.
if got := s.ActiveAccount(); got != "compte1" {
t.Errorf("active account must stay compte1 in partial-swap; got %q", got)
}
// A subsequent swap attempt must be refused while the flag is set.
if err2 := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}); err2 == nil {
t.Errorf("expected subsequent swap to be refused in degraded state")
}
}