fix(switcher+symlinks): rollback on ensure failure (Bug #1) + requiredShared contract test (Bug #10)
Bug #1 (CRITICAL) — A3 flip+ensure inconsistency
- Before: an EnsureForAccount failure after the flip was WARN-only and SetActiveAccount still fired → the daemon declared the target active while shared symlinks were absent/divergent → transcripts silently duplicated, resume broken.
- After: an ensure failure triggers a rollback flip to the previous account home. If the rollback succeeds → explicit error, ActiveAccount stays on the previous account. If the rollback ALSO fails → sticky partialSwap flag + ErrPartialSwap; all further swaps are refused until operator intervention (daemon restart).
- New public IsPartialSwap() for watchdog / health-check integration.

Bug #10 (MEDIUM) — requiredShared contract never exercised
- All existing tests override a.sharedSymlinks with tmpdir-scoped lists, so symlinks.RequiredShared itself was never tested. A rename or drop would pass every test but silently break prod failover.
- TestRequiredSharedIsCoherent asserts (no filesystem): 3 entries with the exact required names, absolute targets, and a single shared parent directory (the invariant EnsureForAccount depends on).

Tests:
- go test ./... PASS
- go test -race ./... PASS (no data race)
- 2 new switcher tests: TestFlipEnsureFailureTriggersRollback, TestFlipEnsureAndRollbackFailure
- 1 new symlinks test: TestRequiredSharedIsCoherent
- 1 obsolete test replaced: TestFlipEnsureSymlinksFailureDoesNotAbortSwap (encoded the old buggy best-effort behaviour)
This commit is contained in:
parent
8eaf0bbd35
commit
20063b1939
4 changed files with 356 additions and 24 deletions
|
|
@ -4,6 +4,7 @@ package switcher
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
|
|
@ -11,6 +12,7 @@ import (
|
|||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
|
||||
|
|
@ -21,6 +23,16 @@ import (
|
|||
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
|
||||
)
|
||||
|
||||
// ErrPartialSwap is the sentinel for a swap that could neither be completed
// nor undone: EnsureForAccount failed on the target home, and ~/.claude could
// not be pointed back at the previous account (the rollback flip failed, or
// no previous account home was known). At that point the active-account
// setter has not been called, yet the ~/.claude symlink may still reference
// an account whose shared state is inconsistent, so an operator has to step
// in. executeSwitchE returns it wrapped with the underlying causes; match it
// with errors.Is. AccountSwitcher.IsPartialSwap() exposes the same condition
// to health-checks and watchdogs.
var ErrPartialSwap = errors.New("switcher: partial swap — flip succeeded but ensure + rollback both failed")
|
||||
|
||||
// SwitchState represents the current phase of a failover operation.
|
||||
type SwitchState string
|
||||
|
||||
|
|
@ -58,6 +70,14 @@ type AccountSwitcher struct {
|
|||
// suite never touches the operator's real /home/ubuntu/.claude-*
|
||||
// shared directories. When nil, symlinks.RequiredShared is used.
|
||||
sharedSymlinks []symlinks.SharedSymlink
|
||||
// partialSwap is set to true when a flip+ensure+rollback sequence left the
|
||||
// daemon in an inconsistent state (symlink possibly flipped, but active
|
||||
// account NOT updated, and rollback flip ALSO failed). Health-checks /
|
||||
// watchdogs read this flag via IsPartialSwap(). It is sticky: once set,
|
||||
// it stays set until the operator restarts the daemon after fixing the
|
||||
// filesystem state. We use atomic access so watchdog goroutines can read
|
||||
// it without blocking the switcher.
|
||||
partialSwap atomic.Bool
|
||||
}
|
||||
|
||||
// New creates an AccountSwitcher.
|
||||
|
|
@ -94,8 +114,30 @@ func (a *AccountSwitcher) Run(ctx context.Context) {
|
|||
|
||||
// executeSwitch performs the full failover sequence.
|
||||
func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
|
||||
if err := a.executeSwitchE(req); err != nil {
|
||||
// executeSwitchE already logs the detail; we swallow the error here
|
||||
// because the public Run loop has no return channel. The partialSwap
|
||||
// flag (if set) remains visible via IsPartialSwap().
|
||||
a.logger.Printf("[switcher] SWAP aborted: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// executeSwitchE runs the swap and returns an error describing any abort or
|
||||
// partial-swap condition. Split out from executeSwitch so tests can assert
|
||||
// on the error value without routing through a channel.
|
||||
func (a *AccountSwitcher) executeSwitchE(req quota.SwitchRequest) error {
|
||||
a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime)
|
||||
|
||||
// Refuse to proceed if a previous swap left the daemon in an
|
||||
// inconsistent state. The operator must intervene (fix the filesystem,
|
||||
// restart the daemon) before any further failover can be attempted —
|
||||
// otherwise we'd stack symlink flips on top of a broken state.
|
||||
if a.partialSwap.Load() {
|
||||
err := fmt.Errorf("refusing swap: daemon is in partial-swap degraded state (operator intervention required)")
|
||||
a.logger.Printf("[switcher] %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
// 1. SAVING — capture resume UUIDs from all working sessions plus
|
||||
// every dedicated session unconditionally (dedicated sessions are
|
||||
// user-driven and may not be tracked as "working" in state, but their
|
||||
|
|
@ -110,21 +152,51 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
|
|||
if target == nil {
|
||||
a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From)
|
||||
a.currentState = StateNormal
|
||||
return
|
||||
return nil
|
||||
}
|
||||
previous := a.findAccountByName(req.From)
|
||||
|
||||
if err := a.flipSymlink(target.Home); err != nil {
|
||||
a.logger.Printf("[switcher] flipSymlink error: %v", err)
|
||||
}
|
||||
// Best-effort: make sure the target account home exposes the three
|
||||
// shared-state symlinks (session-env, file-history, projects). The main
|
||||
// ~/.claude flip is already done, so an error here must NOT abort the
|
||||
// swap — we just log it so the operator can investigate. Without this
|
||||
// call, a fresh target account with no shared links would silently
|
||||
// start writing into private /projects/session-env/file-history dirs
|
||||
// and diverge from the primary account's transcripts.
|
||||
// Ensure the target account home exposes the three shared-state
|
||||
// symlinks (session-env, file-history, projects). If this fails we
|
||||
// MUST NOT proceed with SetActiveAccount — the daemon would otherwise
|
||||
// declare the target "active" while its shared state is divergent,
|
||||
// silently writing transcripts into private /projects directories and
|
||||
// breaking `claude --resume` across sessions. Instead we attempt to
|
||||
// roll back the ~/.claude flip to the previous account. If the
|
||||
// rollback also fails, the daemon is in a documented degraded state
|
||||
// (ErrPartialSwap) and the operator must intervene.
|
||||
if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil {
|
||||
a.logger.Printf("[switcher] WARN ensure shared symlinks for %q: %v", target.Home, err)
|
||||
a.logger.Printf("[switcher] ensure shared symlinks for %q failed: %v — attempting rollback", target.Home, err)
|
||||
if previous == nil || previous.Home == "" {
|
||||
// No known previous home to roll back to — set the degraded
|
||||
// flag and bail out. This is equivalent to a rollback failure
|
||||
// because the filesystem is pointed at a broken target.
|
||||
a.partialSwap.Store(true)
|
||||
a.currentState = StateNormal
|
||||
return fmt.Errorf("%w: ensure failed (%v) and no previous account home is known for rollback", ErrPartialSwap, err)
|
||||
}
|
||||
if rbErr := a.flipSymlink(previous.Home); rbErr != nil {
|
||||
// Both the ensure AND the rollback failed. The daemon is now
|
||||
// in a documented inconsistent state: ~/.claude may point at
|
||||
// target whose shared-state is divergent, but SetActiveAccount
|
||||
// has NOT been called so state.ActiveAccount is still the
|
||||
// previous account. No further failover can be attempted
|
||||
// until the operator intervenes.
|
||||
a.partialSwap.Store(true)
|
||||
a.logger.Printf("[switcher] CRITICAL partial swap: ensure=%v rollback=%v — daemon in degraded state, operator intervention required", err, rbErr)
|
||||
a.currentState = StateNormal
|
||||
return fmt.Errorf("%w: ensure=%v rollback=%v", ErrPartialSwap, err, rbErr)
|
||||
}
|
||||
// Rollback succeeded — symlink is back on the previous account,
|
||||
// SetActiveAccount was NEVER called, state is consistent with
|
||||
// "no swap happened". Return an explicit error so the caller
|
||||
// knows the swap was cancelled.
|
||||
a.logger.Printf("[switcher] rollback successful: ~/.claude → %s (swap cancelled)", previous.Home)
|
||||
a.currentState = StateNormal
|
||||
return fmt.Errorf("swap cancelled: ensure shared symlinks failed on target %q: %w", target.Home, err)
|
||||
}
|
||||
a.killAllPoolSessions()
|
||||
a.recreatePoolSessions()
|
||||
|
|
@ -151,6 +223,31 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
|
|||
}
|
||||
|
||||
a.currentState = StateNormal
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsPartialSwap reports whether the switcher is in a degraded state after a
|
||||
// flip+ensure+rollback sequence all failed. Health-checks and watchdogs use
|
||||
// this signal to surface an operator-actionable alert. The flag is sticky
|
||||
// for the lifetime of the process: once set, it remains set until the daemon
|
||||
// is restarted (after the operator has fixed the filesystem).
|
||||
func (a *AccountSwitcher) IsPartialSwap() bool {
|
||||
return a.partialSwap.Load()
|
||||
}
|
||||
|
||||
// findAccountByName returns the account config entry matching name, or nil.
|
||||
// Unlike findTargetAccount (which returns the first NON-matching account),
|
||||
// this is used by the rollback path to recover the previous home.
|
||||
func (a *AccountSwitcher) findAccountByName(name string) *config.AccountConfig {
|
||||
if name == "" {
|
||||
return nil
|
||||
}
|
||||
for i := range a.config.Accounts {
|
||||
if a.config.Accounts[i].Name == name {
|
||||
return &a.config.Accounts[i]
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// saveDedicatedUUIDs captures the resume UUID for every configured dedicated
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue