fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10)
Bug #1 (CRITIQUE) — A3 flip+ensure inconsistency - Before: EnsureForAccount failure after flip was WARN-only, SetActiveAccount still fired → daemon declared target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken. - After: ensure failure triggers rollback flip to previous account home; if rollback succeeds → explicit error, ActiveAccount stays on previous. If rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps refused until operator intervention (daemon restart). - New public IsPartialSwap() for watchdog / health-check integration. Bug #10 (MOYENNE) — requiredShared contract never exercised - All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover. - TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (invariant EnsureForAccount depends on). Tests: - go test ./... PASS - go test -race ./... PASS (no data race) - 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure - 1 new symlinks test: TestRequiredSharedIsCoherent - 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
This commit is contained in:
parent
8eaf0bbd35
commit
20063b1939
4 changed files with 356 additions and 24 deletions
|
|
@ -4,6 +4,7 @@ package switcher
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
|
|
@ -11,6 +12,7 @@ import (
|
|||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
|
||||
|
|
@ -21,6 +23,16 @@ import (
|
|||
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
|
||||
)
|
||||
|
||||
// ErrPartialSwap is returned (and wrapped) when the switcher flipped
|
||||
// ~/.claude to the target home, EnsureForAccount failed on the target,
|
||||
// and the rollback flip back to the previous home ALSO failed. The daemon
|
||||
// is in a documented degraded state: the active-account setter was NOT
|
||||
// called, but the filesystem symlink may point at an account whose shared
|
||||
// state is inconsistent. Operator intervention is required. Callers can
|
||||
// interrogate AccountSwitcher.IsPartialSwap() to expose the flag to
|
||||
// health-checks / watchdogs.
|
||||
var ErrPartialSwap = errors.New("switcher: partial swap — flip succeeded but ensure + rollback both failed")
|
||||
|
||||
// SwitchState represents the current phase of a failover operation.
|
||||
type SwitchState string
|
||||
|
||||
|
|
@ -58,6 +70,14 @@ type AccountSwitcher struct {
|
|||
// suite never touches the operator's real /home/ubuntu/.claude-*
|
||||
// shared directories. When nil, symlinks.RequiredShared is used.
|
||||
sharedSymlinks []symlinks.SharedSymlink
|
||||
// partialSwap is set to 1 when a flip+ensure+rollback sequence left the
|
||||
// daemon in an inconsistent state (symlink possibly flipped, but active
|
||||
// account NOT updated, and rollback flip ALSO failed). Health-checks /
|
||||
// watchdogs read this flag via IsPartialSwap(). It is sticky: once set,
|
||||
// it stays set until the operator restarts the daemon after fixing the
|
||||
// filesystem state. We use atomic access so watchdog goroutines can read
|
||||
// it without blocking the switcher.
|
||||
partialSwap atomic.Bool
|
||||
}
|
||||
|
||||
// New creates an AccountSwitcher.
|
||||
|
|
@ -94,8 +114,30 @@ func (a *AccountSwitcher) Run(ctx context.Context) {
|
|||
|
||||
// executeSwitch performs the full failover sequence.
|
||||
func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
|
||||
if err := a.executeSwitchE(req); err != nil {
|
||||
// executeSwitchE already logs the detail; we swallow the error here
|
||||
// because the public Run loop has no return channel. The partialSwap
|
||||
// flag (if set) remains visible via IsPartialSwap().
|
||||
a.logger.Printf("[switcher] SWAP aborted: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// executeSwitchE runs the swap and returns an error describing any abort or
|
||||
// partial-swap condition. Split out from executeSwitch so tests can assert
|
||||
// on the error value without routing through a channel.
|
||||
func (a *AccountSwitcher) executeSwitchE(req quota.SwitchRequest) error {
|
||||
a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime)
|
||||
|
||||
// Refuse to proceed if a previous swap left the daemon in an
|
||||
// inconsistent state. The operator must intervene (fix the filesystem,
|
||||
// restart the daemon) before any further failover can be attempted —
|
||||
// otherwise we'd stack symlink flips on top of a broken state.
|
||||
if a.partialSwap.Load() {
|
||||
err := fmt.Errorf("refusing swap: daemon is in partial-swap degraded state (operator intervention required)")
|
||||
a.logger.Printf("[switcher] %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
// 1. SAVING — capture resume UUIDs from all working sessions plus
|
||||
// every dedicated session unconditionally (dedicated sessions are
|
||||
// user-driven and may not be tracked as "working" in state, but their
|
||||
|
|
@ -110,21 +152,51 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
|
|||
if target == nil {
|
||||
a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From)
|
||||
a.currentState = StateNormal
|
||||
return
|
||||
return nil
|
||||
}
|
||||
previous := a.findAccountByName(req.From)
|
||||
|
||||
if err := a.flipSymlink(target.Home); err != nil {
|
||||
a.logger.Printf("[switcher] flipSymlink error: %v", err)
|
||||
}
|
||||
// Best-effort: make sure the target account home exposes the three
|
||||
// shared-state symlinks (session-env, file-history, projects). The main
|
||||
// ~/.claude flip is already done, so an error here must NOT abort the
|
||||
// swap — we just log it so the operator can investigate. Without this
|
||||
// call, a fresh target account with no shared links would silently
|
||||
// start writing into private /projects/session-env/file-history dirs
|
||||
// and diverge from the primary account's transcripts.
|
||||
// Ensure the target account home exposes the three shared-state
|
||||
// symlinks (session-env, file-history, projects). If this fails we
|
||||
// MUST NOT proceed with SetActiveAccount — the daemon would otherwise
|
||||
// declare the target "active" while its shared state is divergent,
|
||||
// silently writing transcripts into private /projects directories and
|
||||
// breaking `claude --resume` across sessions. Instead we attempt to
|
||||
// roll back the ~/.claude flip to the previous account. If the
|
||||
// rollback also fails, the daemon is in a documented degraded state
|
||||
// (ErrPartialSwap) and the operator must intervene.
|
||||
if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil {
|
||||
a.logger.Printf("[switcher] WARN ensure shared symlinks for %q: %v", target.Home, err)
|
||||
a.logger.Printf("[switcher] ensure shared symlinks for %q failed: %v — attempting rollback", target.Home, err)
|
||||
if previous == nil || previous.Home == "" {
|
||||
// No known previous home to roll back to — set the degraded
|
||||
// flag and bail out. This is equivalent to a rollback failure
|
||||
// because the filesystem is pointed at a broken target.
|
||||
a.partialSwap.Store(true)
|
||||
a.currentState = StateNormal
|
||||
return fmt.Errorf("%w: ensure failed (%v) and no previous account home is known for rollback", ErrPartialSwap, err)
|
||||
}
|
||||
if rbErr := a.flipSymlink(previous.Home); rbErr != nil {
|
||||
// Both the ensure AND the rollback failed. The daemon is now
|
||||
// in a documented inconsistent state: ~/.claude may point at
|
||||
// target whose shared-state is divergent, but SetActiveAccount
|
||||
// has NOT been called so state.ActiveAccount is still the
|
||||
// previous account. No further failover can be attempted
|
||||
// until the operator intervenes.
|
||||
a.partialSwap.Store(true)
|
||||
a.logger.Printf("[switcher] CRITICAL partial swap: ensure=%v rollback=%v — daemon in degraded state, operator intervention required", err, rbErr)
|
||||
a.currentState = StateNormal
|
||||
return fmt.Errorf("%w: ensure=%v rollback=%v", ErrPartialSwap, err, rbErr)
|
||||
}
|
||||
// Rollback succeeded — symlink is back on the previous account,
|
||||
// SetActiveAccount was NEVER called, state is consistent with
|
||||
// "no swap happened". Return an explicit error so the caller
|
||||
// knows the swap was cancelled.
|
||||
a.logger.Printf("[switcher] rollback successful: ~/.claude → %s (swap cancelled)", previous.Home)
|
||||
a.currentState = StateNormal
|
||||
return fmt.Errorf("swap cancelled: ensure shared symlinks failed on target %q: %w", target.Home, err)
|
||||
}
|
||||
a.killAllPoolSessions()
|
||||
a.recreatePoolSessions()
|
||||
|
|
@ -151,6 +223,31 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
|
|||
}
|
||||
|
||||
a.currentState = StateNormal
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsPartialSwap reports whether the switcher is in a degraded state after a
|
||||
// flip+ensure+rollback sequence all failed. Health-checks and watchdogs use
|
||||
// this signal to surface an operator-actionable alert. The flag is sticky
|
||||
// for the lifetime of the process: once set, it remains set until the daemon
|
||||
// is restarted (after the operator has fixed the filesystem).
|
||||
func (a *AccountSwitcher) IsPartialSwap() bool {
|
||||
return a.partialSwap.Load()
|
||||
}
|
||||
|
||||
// findAccountByName returns the account config entry matching name, or nil.
|
||||
// Unlike findTargetAccount (which returns the first NON-matching account),
|
||||
// this is used by the rollback path to recover the previous home.
|
||||
func (a *AccountSwitcher) findAccountByName(name string) *config.AccountConfig {
|
||||
if name == "" {
|
||||
return nil
|
||||
}
|
||||
for i := range a.config.Accounts {
|
||||
if a.config.Accounts[i].Name == name {
|
||||
return &a.config.Accounts[i]
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// saveDedicatedUUIDs captures the resume UUID for every configured dedicated
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package switcher
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
|
@ -302,25 +303,34 @@ func TestFlipReconcilesSharedSymlinksOnTargetHome(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// TestFlipEnsureSymlinksFailureDoesNotAbortSwap verifies A3 best-effort:
|
||||
// if EnsureForAccount returns an error (here: a divergent pre-existing link
|
||||
// that the symlinks package refuses to auto-correct), the flip and the swap
|
||||
// MUST still complete. The shared symlink reconcile is post-flip cleanup,
|
||||
// not a gate on the failover itself — aborting here would leave the daemon
|
||||
// in an inconsistent state (symlink flipped but active account not updated).
|
||||
func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) {
|
||||
// TestFlipEnsureFailureTriggersRollback verifies the fix for the A3 bug
|
||||
// (flip+ensure inconsistency): if EnsureForAccount fails on the target home
|
||||
// after the ~/.claude flip, the switcher MUST NOT mark the target account
|
||||
// active. It must instead roll back the ~/.claude symlink to the previous
|
||||
// account's home, leaving the daemon in the pre-swap state so subsequent
|
||||
// session work keeps writing to the known-good shared state.
|
||||
//
|
||||
// Old (buggy) behaviour: ensure error was WARN-only, SetActiveAccount still
|
||||
// happened, dedicated sessions were relaunched against a target whose
|
||||
// /projects, /session-env, /file-history were missing or divergent →
|
||||
// transcripts duplicated silently, resume broke, undo history diverged.
|
||||
func TestFlipEnsureFailureTriggersRollback(t *testing.T) {
|
||||
tc := newMockTmux()
|
||||
|
||||
s := state.New("")
|
||||
s.SetActiveAccount("compte1")
|
||||
|
||||
previousHome := filepath.Join(t.TempDir(), "claude-compte1")
|
||||
targetHome := filepath.Join(t.TempDir(), "claude-compte2")
|
||||
if err := os.MkdirAll(previousHome, 0700); err != nil {
|
||||
t.Fatalf("mkdir previous home: %v", err)
|
||||
}
|
||||
if err := os.MkdirAll(targetHome, 0700); err != nil {
|
||||
t.Fatalf("mkdir target home: %v", err)
|
||||
}
|
||||
// Plant a divergent link at <targetHome>/session-env. The symlinks
|
||||
// package refuses to auto-correct this (data-loss safeguard) and will
|
||||
// return an error — which the switcher must swallow with a WARN log.
|
||||
// return an error, which must now trigger a rollback.
|
||||
bogus := filepath.Join(t.TempDir(), "somewhere-else")
|
||||
if err := os.MkdirAll(bogus, 0700); err != nil {
|
||||
t.Fatalf("mkdir bogus: %v", err)
|
||||
|
|
@ -331,7 +341,7 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) {
|
|||
|
||||
cfg := &config.Config{
|
||||
Accounts: []config.AccountConfig{
|
||||
{Name: "compte1", Home: filepath.Join(t.TempDir(), "claude-compte1")},
|
||||
{Name: "compte1", Home: previousHome},
|
||||
{Name: "compte2", Home: targetHome},
|
||||
},
|
||||
Pool: config.PoolConfig{
|
||||
|
|
@ -340,13 +350,108 @@ func TestFlipEnsureSymlinksFailureDoesNotAbortSwap(t *testing.T) {
|
|||
}
|
||||
|
||||
a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil)
|
||||
a.homeDir = t.TempDir()
|
||||
homeDir := t.TempDir()
|
||||
a.homeDir = homeDir
|
||||
a.sharedSymlinks = tmpShared(t.TempDir())
|
||||
|
||||
a.executeSwitch(quota.SwitchRequest{From: "compte1"})
|
||||
err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"})
|
||||
if err == nil {
|
||||
t.Fatalf("executeSwitchE: expected cancellation error, got nil")
|
||||
}
|
||||
// The public symmetric swap-cancelled error must mention ensure and
|
||||
// wrap the underlying symlinks package message. ErrPartialSwap must
|
||||
// NOT be set (rollback succeeded → recoverable condition).
|
||||
if errors.Is(err, ErrPartialSwap) {
|
||||
t.Errorf("did not expect ErrPartialSwap; rollback succeeded; got %v", err)
|
||||
}
|
||||
if a.IsPartialSwap() {
|
||||
t.Errorf("IsPartialSwap should be false when rollback succeeds")
|
||||
}
|
||||
|
||||
// The swap must have completed despite the divergent-link error.
|
||||
if got := s.ActiveAccount(); got != "compte2" {
|
||||
t.Errorf("swap should complete even when ensure fails; active=%q want compte2", got)
|
||||
// Active account must remain the previous one — SetActiveAccount must
|
||||
// NOT have been called.
|
||||
if got := s.ActiveAccount(); got != "compte1" {
|
||||
t.Errorf("active account should stay compte1 after rollback; got %q", got)
|
||||
}
|
||||
|
||||
// ~/.claude must now point at the previous home (rollback target).
|
||||
link, rlErr := os.Readlink(filepath.Join(homeDir, ".claude"))
|
||||
if rlErr != nil {
|
||||
t.Fatalf("readlink ~/.claude: %v", rlErr)
|
||||
}
|
||||
if link != previousHome {
|
||||
t.Errorf("~/.claude should point at previous home %q after rollback; got %q", previousHome, link)
|
||||
}
|
||||
}
|
||||
|
||||
// TestFlipEnsureAndRollbackFailure verifies that when BOTH EnsureForAccount
|
||||
// AND the rollback flip fail, the switcher sets the sticky partial-swap
|
||||
// flag and returns ErrPartialSwap. The daemon is then in a documented
|
||||
// degraded state where any further swap is refused until the operator
|
||||
// restarts it.
|
||||
func TestFlipEnsureAndRollbackFailure(t *testing.T) {
|
||||
tc := newMockTmux()
|
||||
|
||||
s := state.New("")
|
||||
s.SetActiveAccount("compte1")
|
||||
|
||||
previousHome := filepath.Join(t.TempDir(), "claude-compte1")
|
||||
targetHome := filepath.Join(t.TempDir(), "claude-compte2")
|
||||
if err := os.MkdirAll(previousHome, 0700); err != nil {
|
||||
t.Fatalf("mkdir previous home: %v", err)
|
||||
}
|
||||
if err := os.MkdirAll(targetHome, 0700); err != nil {
|
||||
t.Fatalf("mkdir target home: %v", err)
|
||||
}
|
||||
// Plant the divergent link that will cause EnsureForAccount to fail.
|
||||
bogus := filepath.Join(t.TempDir(), "somewhere-else")
|
||||
if err := os.MkdirAll(bogus, 0700); err != nil {
|
||||
t.Fatalf("mkdir bogus: %v", err)
|
||||
}
|
||||
if err := os.Symlink(bogus, filepath.Join(targetHome, "session-env")); err != nil {
|
||||
t.Fatalf("plant divergent link: %v", err)
|
||||
}
|
||||
|
||||
cfg := &config.Config{
|
||||
Accounts: []config.AccountConfig{
|
||||
{Name: "compte1", Home: previousHome},
|
||||
{Name: "compte2", Home: targetHome},
|
||||
},
|
||||
Pool: config.PoolConfig{
|
||||
Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 0, Max: 0},
|
||||
},
|
||||
}
|
||||
|
||||
a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil)
|
||||
|
||||
// Force the rollback flip to fail: point homeDir at a file that cannot
|
||||
// host a .claude symlink. We use a regular file; the flipSymlink
|
||||
// implementation does os.Remove() then os.Symlink() under homeDir,
|
||||
// which fails when homeDir is itself a file (ENOTDIR).
|
||||
badHomeFile := filepath.Join(t.TempDir(), "not-a-dir")
|
||||
if err := os.WriteFile(badHomeFile, []byte("block"), 0600); err != nil {
|
||||
t.Fatalf("write bad home: %v", err)
|
||||
}
|
||||
a.homeDir = badHomeFile
|
||||
a.sharedSymlinks = tmpShared(t.TempDir())
|
||||
|
||||
err := a.executeSwitchE(quota.SwitchRequest{From: "compte1"})
|
||||
if err == nil {
|
||||
t.Fatalf("expected ErrPartialSwap, got nil")
|
||||
}
|
||||
if !errors.Is(err, ErrPartialSwap) {
|
||||
t.Errorf("expected ErrPartialSwap, got %v", err)
|
||||
}
|
||||
if !a.IsPartialSwap() {
|
||||
t.Errorf("IsPartialSwap should be true when both ensure AND rollback fail")
|
||||
}
|
||||
// SetActiveAccount must still not have been called.
|
||||
if got := s.ActiveAccount(); got != "compte1" {
|
||||
t.Errorf("active account must stay compte1 in partial-swap; got %q", got)
|
||||
}
|
||||
|
||||
// A subsequent swap attempt must be refused while the flag is set.
|
||||
if err2 := a.executeSwitchE(quota.SwitchRequest{From: "compte1"}); err2 == nil {
|
||||
t.Errorf("expected subsequent swap to be refused in degraded state")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -204,3 +204,73 @@ func TestRequiredShared_defaultsAreReasonable(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestRequiredSharedIsCoherent validates the contract of the package-level
|
||||
// RequiredShared constant that the switcher and lifecycle manager consume
|
||||
// in production. The rest of the suite exercises EnsureForAccount /
|
||||
// ValidateAll with tmpdir-scoped override lists, so the actual prod
|
||||
// constant (pointing under /home/ubuntu/.claude-*-shared) is never touched
|
||||
// by those tests — a regression that shrinks or renames RequiredShared
|
||||
// would pass every other test but silently break failover on real VMs
|
||||
// (missing a link → writes to private state → transcripts duplicated).
|
||||
//
|
||||
// The test is filesystem-free: it only asserts the shape of the constant.
|
||||
//
|
||||
// 1. Exactly three entries, one per Name required by the A-failover design.
|
||||
// 2. Every Target is absolute.
|
||||
// 3. All three Targets share the same parent directory — there is no
|
||||
// mode of operation where one shared dir lives elsewhere than the
|
||||
// others. `filepath.Dir(target)` must be identical across entries.
|
||||
//
|
||||
// This encodes the "3 links under one shared root" invariant that
|
||||
// EnsureForAccount relies on. Any future change to RequiredShared that
|
||||
// breaks this invariant should force the author to update the switcher
|
||||
// contract explicitly.
|
||||
func TestRequiredSharedIsCoherent(t *testing.T) {
|
||||
expectedNames := map[string]bool{
|
||||
"session-env": false,
|
||||
"file-history": false,
|
||||
"projects": false,
|
||||
}
|
||||
if len(RequiredShared) != len(expectedNames) {
|
||||
t.Fatalf("RequiredShared must contain exactly %d entries (session-env, file-history, projects); got %d: %+v",
|
||||
len(expectedNames), len(RequiredShared), RequiredShared)
|
||||
}
|
||||
|
||||
var sharedParent string
|
||||
for i, sl := range RequiredShared {
|
||||
if _, ok := expectedNames[sl.Name]; !ok {
|
||||
t.Errorf("RequiredShared[%d]: unexpected Name %q (allowed: session-env / file-history / projects)", i, sl.Name)
|
||||
continue
|
||||
}
|
||||
if expectedNames[sl.Name] {
|
||||
t.Errorf("RequiredShared[%d]: duplicate Name %q", i, sl.Name)
|
||||
}
|
||||
expectedNames[sl.Name] = true
|
||||
|
||||
if sl.Target == "" {
|
||||
t.Errorf("RequiredShared[%d] (%q): empty Target", i, sl.Name)
|
||||
continue
|
||||
}
|
||||
if !filepath.IsAbs(sl.Target) {
|
||||
t.Errorf("RequiredShared[%d] (%q): Target %q must be absolute", i, sl.Name, sl.Target)
|
||||
continue
|
||||
}
|
||||
|
||||
parent := filepath.Dir(sl.Target)
|
||||
if i == 0 {
|
||||
sharedParent = parent
|
||||
continue
|
||||
}
|
||||
if parent != sharedParent {
|
||||
t.Errorf("RequiredShared[%d] (%q): parent dir %q diverges from %q — all shared targets must live under the same root",
|
||||
i, sl.Name, parent, sharedParent)
|
||||
}
|
||||
}
|
||||
|
||||
for name, seen := range expectedNames {
|
||||
if !seen {
|
||||
t.Errorf("RequiredShared is missing the required %q entry", name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue