Wire symlinks.EnsureForAccount into executeSwitch, called immediately after the ~/.claude flip. Guarantees the three shared-state links (session-env, file-history, projects) exist on the target account home even for freshly-provisioned accounts, preventing silent transcript duplication and undo-history divergence on first resume. Best-effort: errors are logged as WARN but never abort the swap. If we returned here the daemon would be left inconsistent (symlink flipped, SetActiveAccount never called). Operator sees the warning in logs and resolves divergent links manually. Tests: - TestFlipReconcilesSharedSymlinksOnTargetHome: empty target home gets all three links pointing at canonical targets after the flip. - TestFlipEnsureSymlinksFailureDoesNotAbortSwap: a planted divergent link triggers the symlinks-package error; the swap completes anyway and the active account is updated. Hermetic: added AccountSwitcher.sharedSymlinks override so tests scope the reconcile inside t.TempDir() and never touch /home/ubuntu/.claude-*-shared. Existing tests migrated to this pattern and hardcoded /tmp/claude-*-xxxx paths replaced with tmpdirs. Phase 1 / Chantier A — task A3.
389 lines
13 KiB
Go
389 lines
13 KiB
Go
// Package switcher implements the account-switcher state machine.
|
|
// It is the only component allowed to flip the active Claude account.
|
|
package switcher
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
|
|
"forge.secuaas.ovh/olivier/claude-failover/internal/notify"
|
|
"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
|
|
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
|
|
"forge.secuaas.ovh/olivier/claude-failover/internal/symlinks"
|
|
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
|
|
)
|
|
|
|
// SwitchState names the phase a failover operation is currently in.
type SwitchState string

// Failover phases, listed in the order executeSwitch moves through them
// before settling back on StateNormal.
const (
	// StateNormal means no swap is in progress.
	StateNormal SwitchState = "normal"
	// StateSaving means resume UUIDs are being captured from live sessions.
	StateSaving SwitchState = "saving"
	// StateSwitching means the account flip and session restart are running.
	StateSwitching SwitchState = "switching"
	// StateResuming means sessions have been recreated after the flip.
	StateResuming SwitchState = "resuming"
)
|
|
|
|
// Package-level patterns, compiled once at init.
var (
	// resumeRe finds `claude --resume <id>` in captured pane output and
	// captures the 36-character id (lowercase hex digits and dashes).
	resumeRe = regexp.MustCompile(`claude\s+--resume\s+([a-f0-9-]{36})`)

	// reMinutes captures N from "in N minute(s)" in a reset-time string.
	reMinutes = regexp.MustCompile(`in\s+(\d+)\s+minute`)

	// reHours captures N from "in N hour(s)" in a reset-time string.
	reHours = regexp.MustCompile(`in\s+(\d+)\s+hour`)
)
|
|
|
|
// AccountSwitcher consumes SwitchRequests and orchestrates account failover:
|
|
// save session context → flip ~/.claude symlink → restart sessions.
|
|
type AccountSwitcher struct {
|
|
tmux tmux.Client
|
|
state *state.State
|
|
config *config.Config
|
|
switchCh <-chan quota.SwitchRequest
|
|
notifier *notify.Notifier
|
|
currentState SwitchState
|
|
logger *log.Logger
|
|
// homeDir is the directory containing the .claude symlink. Overridable for tests.
|
|
// When empty, os.UserHomeDir() is used.
|
|
homeDir string
|
|
// sharedSymlinks is the list of shared-state links reconciled on the
|
|
// target account home after every flip. Overridable for tests so the
|
|
// suite never touches the operator's real /home/ubuntu/.claude-*
|
|
// shared directories. When nil, symlinks.RequiredShared is used.
|
|
sharedSymlinks []symlinks.SharedSymlink
|
|
}
|
|
|
|
// New creates an AccountSwitcher.
|
|
// notifier may be nil; notifications are skipped when absent.
|
|
func New(
|
|
tc tmux.Client,
|
|
s *state.State,
|
|
cfg *config.Config,
|
|
switchCh <-chan quota.SwitchRequest,
|
|
notifier *notify.Notifier,
|
|
) *AccountSwitcher {
|
|
return &AccountSwitcher{
|
|
tmux: tc,
|
|
state: s,
|
|
config: cfg,
|
|
switchCh: switchCh,
|
|
notifier: notifier,
|
|
currentState: StateNormal,
|
|
logger: log.Default(),
|
|
}
|
|
}
|
|
|
|
// Run starts the switcher event loop until ctx is cancelled.
|
|
func (a *AccountSwitcher) Run(ctx context.Context) {
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case req := <-a.switchCh:
|
|
a.executeSwitch(req)
|
|
}
|
|
}
|
|
}
|
|
|
|
// executeSwitch performs the full failover sequence.
|
|
func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
|
|
a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime)
|
|
|
|
// 1. SAVING — capture resume UUIDs from all working sessions plus
|
|
// every dedicated session unconditionally (dedicated sessions are
|
|
// user-driven and may not be tracked as "working" in state, but their
|
|
// UUIDs are the most valuable to preserve across a swap).
|
|
a.currentState = StateSaving
|
|
a.saveAllSessions()
|
|
a.saveDedicatedUUIDs()
|
|
|
|
// 2. SWITCHING — find target, flip symlink, restart sessions.
|
|
a.currentState = StateSwitching
|
|
target := a.findTargetAccount(req.From)
|
|
if target == nil {
|
|
a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From)
|
|
a.currentState = StateNormal
|
|
return
|
|
}
|
|
|
|
if err := a.flipSymlink(target.Home); err != nil {
|
|
a.logger.Printf("[switcher] flipSymlink error: %v", err)
|
|
}
|
|
// Best-effort: make sure the target account home exposes the three
|
|
// shared-state symlinks (session-env, file-history, projects). The main
|
|
// ~/.claude flip is already done, so an error here must NOT abort the
|
|
// swap — we just log it so the operator can investigate. Without this
|
|
// call, a fresh target account with no shared links would silently
|
|
// start writing into private /projects/session-env/file-history dirs
|
|
// and diverge from the primary account's transcripts.
|
|
if err := symlinks.EnsureForAccount(target.Home, a.requiredShared()); err != nil {
|
|
a.logger.Printf("[switcher] WARN ensure shared symlinks for %q: %v", target.Home, err)
|
|
}
|
|
a.killAllPoolSessions()
|
|
a.recreatePoolSessions()
|
|
a.relaunchDedicatedSessions(target.Home)
|
|
|
|
// Update active account and record the swap timestamp so the quota
|
|
// monitor can enforce a cooldown before requesting another one.
|
|
a.state.SetActiveAccount(target.Name)
|
|
a.state.RecordSwap(req.From, target.Name)
|
|
|
|
// 3. RESUMING — sessions are alive, dispatcher will fill them.
|
|
a.currentState = StateResuming
|
|
|
|
// 4. Notify.
|
|
msg := fmt.Sprintf("Switch %s → %s (reset: %s)", req.From, target.Name, req.ResetTime)
|
|
a.logger.Printf("[switcher] SWAP complete: %s", msg)
|
|
if a.notifier != nil {
|
|
a.notifier.Telegram("🔄 " + msg) //nolint:errcheck
|
|
}
|
|
|
|
// 5. Schedule return to primary account if reset time is known.
|
|
if req.ResetTime != "" {
|
|
go a.scheduleReturn(req.From, req.ResetTime)
|
|
}
|
|
|
|
a.currentState = StateNormal
|
|
}
|
|
|
|
// saveDedicatedUUIDs captures the resume UUID for every configured dedicated
|
|
// session, regardless of its tracked state. Dedicated sessions are typically
|
|
// user-driven and not in state="working", but their UUIDs are the most
|
|
// valuable to preserve across a swap so the user's work is not lost.
|
|
func (a *AccountSwitcher) saveDedicatedUUIDs() {
|
|
for _, ds := range a.config.Pool.Dedicated {
|
|
if !a.tmux.HasSession(ds.Name) {
|
|
continue
|
|
}
|
|
tail, err := a.tmux.CapturePaneTail(ds.Name, 200)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
uuid := extractResumeUUID(tail)
|
|
if uuid == "" {
|
|
continue
|
|
}
|
|
dir := a.resumeContextDir()
|
|
if err := os.MkdirAll(dir, 0700); err != nil {
|
|
a.logger.Printf("[switcher] mkdir %s: %v", dir, err)
|
|
continue
|
|
}
|
|
path := filepath.Join(dir, ds.Name+"-resume-id.txt")
|
|
if err := os.WriteFile(path, []byte(uuid), 0600); err != nil {
|
|
a.logger.Printf("[switcher] write %s: %v", path, err)
|
|
continue
|
|
}
|
|
a.logger.Printf("[switcher] saved dedicated resume UUID for %q: %s", ds.Name, uuid)
|
|
}
|
|
}
|
|
|
|
// relaunchDedicatedSessions sends a `claude --resume <uuid>` command to each
|
|
// dedicated session after recreation, using the target account's home via
|
|
// CLAUDE_CONFIG_DIR so the session follows the active account. If no UUID was
|
|
// captured for a session, it is left at the bash prompt for manual restart.
|
|
func (a *AccountSwitcher) relaunchDedicatedSessions(targetHome string) {
|
|
for _, ds := range a.config.Pool.Dedicated {
|
|
path := filepath.Join(a.resumeContextDir(), ds.Name+"-resume-id.txt")
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
a.logger.Printf("[switcher] no saved resume UUID for %q (%v) — leaving at shell", ds.Name, err)
|
|
continue
|
|
}
|
|
uuid := strings.TrimSpace(string(data))
|
|
if !isValidResumeUUID(uuid) {
|
|
a.logger.Printf("[switcher] invalid UUID for %q: %q", ds.Name, uuid)
|
|
continue
|
|
}
|
|
// targetHome is operator-controlled (config file); uuid is regex-validated.
|
|
// Neither is user-supplied runtime input, so shell interpolation is safe.
|
|
cmd := fmt.Sprintf("CLAUDE_CONFIG_DIR=%s claude --dangerously-skip-permissions --resume %s",
|
|
targetHome, uuid)
|
|
if err := a.tmux.SendKeys(ds.Name, cmd); err != nil {
|
|
a.logger.Printf("[switcher] relaunch %q: %v", ds.Name, err)
|
|
continue
|
|
}
|
|
a.logger.Printf("[switcher] relaunched %q on %s (resume=%s)", ds.Name, targetHome, uuid)
|
|
}
|
|
}
|
|
|
|
// isValidResumeUUID reports whether s is a canonical lowercase UUID: 36
// characters laid out as 8-4-4-4-12 hex digits separated by dashes.
//
// It guards against corrupted resume-id files. Checking only the character
// set would also accept strings such as 36 dashes, which would then be
// handed to the shell as a flag-like argument; the dash positions are
// therefore checked explicitly.
func isValidResumeUUID(s string) bool {
	if len(s) != 36 {
		return false
	}
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch i {
		case 8, 13, 18, 23:
			// Group separators.
			if c != '-' {
				return false
			}
		default:
			isHex := (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')
			if !isHex {
				return false
			}
		}
	}
	return true
}
|
|
|
|
// saveAllSessions captures the resume UUID for every working session.
|
|
func (a *AccountSwitcher) saveAllSessions() {
|
|
a.state.ForEachWorking(func(name string, _ *state.SessionState) {
|
|
tail, err := a.tmux.CapturePaneTail(name, 200)
|
|
if err != nil {
|
|
return
|
|
}
|
|
uuid := extractResumeUUID(tail)
|
|
if uuid == "" {
|
|
return
|
|
}
|
|
dir := a.resumeContextDir()
|
|
os.MkdirAll(dir, 0700)
|
|
path := filepath.Join(dir, name+"-resume-id.txt")
|
|
os.WriteFile(path, []byte(uuid), 0600)
|
|
a.logger.Printf("[switcher] saved resume UUID for %q", name)
|
|
})
|
|
}
|
|
|
|
// requiredShared returns the shared-symlink list used when reconciling the
|
|
// target account home after a flip. Tests may set a.sharedSymlinks to a
|
|
// tmpdir-scoped list so they never touch /home/ubuntu/.claude-*-shared.
|
|
func (a *AccountSwitcher) requiredShared() []symlinks.SharedSymlink {
|
|
if a.sharedSymlinks != nil {
|
|
return a.sharedSymlinks
|
|
}
|
|
return symlinks.RequiredShared
|
|
}
|
|
|
|
// resolveHomeDir returns the configured homeDir (test override) or the real
|
|
// user home. Tests MUST set a.homeDir to a tmpdir to avoid clobbering the
|
|
// production ~/.claude symlink.
|
|
func (a *AccountSwitcher) resolveHomeDir() (string, error) {
|
|
if a.homeDir != "" {
|
|
return a.homeDir, nil
|
|
}
|
|
home, err := os.UserHomeDir()
|
|
if err != nil {
|
|
return "", fmt.Errorf("UserHomeDir: %w", err)
|
|
}
|
|
return home, nil
|
|
}
|
|
|
|
// flipSymlink replaces ~/.claude with a symlink to targetHome.
|
|
// All paths come from config — no hardcoded values.
|
|
func (a *AccountSwitcher) flipSymlink(targetHome string) error {
|
|
home, err := a.resolveHomeDir()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
claudeLink := filepath.Join(home, ".claude")
|
|
os.Remove(claudeLink)
|
|
if err := os.Symlink(targetHome, claudeLink); err != nil {
|
|
return fmt.Errorf("symlink %s → %s: %w", claudeLink, targetHome, err)
|
|
}
|
|
a.logger.Printf("[switcher] ~/.claude → %s", targetHome)
|
|
return nil
|
|
}
|
|
|
|
// killAllPoolSessions kills all autonomous and dedicated pool sessions
|
|
// managed by this daemon. Sessions outside the configured
|
|
// StartIndex..StartIndex+Max range (e.g. manual operator sessions
|
|
// `ccl-0..ccl-9`) are left untouched.
|
|
func (a *AccountSwitcher) killAllPoolSessions() {
|
|
prefix := a.config.Pool.Autonomous.Prefix
|
|
if prefix == "" {
|
|
prefix = "ccl-auto-"
|
|
}
|
|
start := a.config.Pool.Autonomous.StartIndex
|
|
for i := start; i < start+a.config.Pool.Autonomous.Max; i++ {
|
|
a.tmux.KillSession(sessionName(prefix, i)) //nolint:errcheck
|
|
}
|
|
for _, ds := range a.config.Pool.Dedicated {
|
|
a.tmux.KillSession(ds.Name) //nolint:errcheck
|
|
}
|
|
}
|
|
|
|
// recreatePoolSessions creates fresh pool sessions after a switch,
|
|
// respecting StartIndex so the pool stays within its configured range.
|
|
func (a *AccountSwitcher) recreatePoolSessions() {
|
|
prefix := a.config.Pool.Autonomous.Prefix
|
|
if prefix == "" {
|
|
prefix = "ccl-auto-"
|
|
}
|
|
start := a.config.Pool.Autonomous.StartIndex
|
|
for i := start; i < start+a.config.Pool.Autonomous.Min; i++ {
|
|
name := sessionName(prefix, i)
|
|
if err := a.tmux.CreateSession(name, ""); err != nil {
|
|
a.logger.Printf("[switcher] recreate autonomous %q: %v", name, err)
|
|
}
|
|
}
|
|
for _, ds := range a.config.Pool.Dedicated {
|
|
if err := a.tmux.CreateSession(ds.Name, ds.Project); err != nil {
|
|
a.logger.Printf("[switcher] recreate dedicated %q: %v", ds.Name, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// findTargetAccount returns the first account that is not currentAccount.
|
|
func (a *AccountSwitcher) findTargetAccount(currentAccount string) *config.AccountConfig {
|
|
for i := range a.config.Accounts {
|
|
if a.config.Accounts[i].Name != currentAccount {
|
|
return &a.config.Accounts[i]
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// scheduleReturn waits for the quota to reset then switches back to primaryAccount.
|
|
func (a *AccountSwitcher) scheduleReturn(primaryAccount, resetTime string) {
|
|
dur := timeUntilReset(resetTime) + 5*time.Minute
|
|
a.logger.Printf("[switcher] return to %q scheduled in %v", primaryAccount, dur.Round(time.Minute))
|
|
time.Sleep(dur)
|
|
a.executeSwitch(quota.SwitchRequest{
|
|
From: a.state.ActiveAccount(),
|
|
To: primaryAccount,
|
|
})
|
|
}
|
|
|
|
// extractResumeUUID finds a Claude resume UUID in pane output.
|
|
func extractResumeUUID(content string) string {
|
|
m := resumeRe.FindStringSubmatch(content)
|
|
if len(m) >= 2 {
|
|
return m[1]
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// resumeContextDir returns the directory for per-session resume UUIDs.
|
|
// Honours a.homeDir override so tests never write to the real ~/.claude-context.
|
|
func (a *AccountSwitcher) resumeContextDir() string {
|
|
home, _ := a.resolveHomeDir()
|
|
return filepath.Join(home, ".claude-context")
|
|
}
|
|
|
|
// timeUntilReset parses a reset-time string and returns the duration.
|
|
// Returns a 2-hour fallback when parsing fails.
|
|
func timeUntilReset(resetTime string) time.Duration {
|
|
lower := strings.ToLower(strings.TrimSpace(resetTime))
|
|
if m := reMinutes.FindStringSubmatch(lower); len(m) >= 2 {
|
|
n, _ := strconv.Atoi(m[1])
|
|
return time.Duration(n) * time.Minute
|
|
}
|
|
if m := reHours.FindStringSubmatch(lower); len(m) >= 2 {
|
|
n, _ := strconv.Atoi(m[1])
|
|
return time.Duration(n) * time.Hour
|
|
}
|
|
return 2 * time.Hour
|
|
}
|
|
|
|
// sessionName builds the tmux session name for pool slot i by appending the
// decimal form of i to prefix, e.g. sessionName("ccl-auto-", 3) is
// "ccl-auto-3". Negative indices keep their sign, so every index yields a
// distinct name.
func sessionName(prefix string, i int) string {
	return prefix + strconv.Itoa(i)
}
|
|
|
|
// itoa returns the decimal string form of n. It delegates to strconv.Itoa,
// so negative values are rendered with a leading minus sign rather than as
// an empty string.
func itoa(n int) string {
	return strconv.Itoa(n)
}
|