diff --git a/.gitignore b/.gitignore index a83ec2b..5cda16e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,10 @@ dist/ *.exe *.test *.out +/claude-failover + +# Security review marker (Claude Code hook, expires after 10min) +.security-reviewed # Go vendor/ diff --git a/cmd/claude-failover/main.go b/cmd/claude-failover/main.go index d77ea14..8bc8fc5 100644 --- a/cmd/claude-failover/main.go +++ b/cmd/claude-failover/main.go @@ -8,12 +8,19 @@ import ( "os" "os/signal" "syscall" + "time" "forge.secuaas.ovh/olivier/claude-failover/internal/api" "forge.secuaas.ovh/olivier/claude-failover/internal/config" + "forge.secuaas.ovh/olivier/claude-failover/internal/dispatcher" + "forge.secuaas.ovh/olivier/claude-failover/internal/janitor" "forge.secuaas.ovh/olivier/claude-failover/internal/lifecycle" + "forge.secuaas.ovh/olivier/claude-failover/internal/notify" + "forge.secuaas.ovh/olivier/claude-failover/internal/quota" "forge.secuaas.ovh/olivier/claude-failover/internal/state" + "forge.secuaas.ovh/olivier/claude-failover/internal/switcher" "forge.secuaas.ovh/olivier/claude-failover/internal/tmux" + "forge.secuaas.ovh/olivier/claude-failover/internal/watcher" ) const version = "0.1.0" @@ -53,6 +60,43 @@ func main() { go lm.Run(ctx) + // Notifier — reads credentials from environment variables. + notifier := notify.New(cfg) + + // Session Watcher — detects when sessions finish their tasks. + sw := watcher.New(tmuxClient, s, cfg) + go sw.Run(ctx) + + // Quota Monitor — polls panes for quota exhaustion signals. + qm := quota.New(tmuxClient, s, cfg) + go qm.Run(ctx) + + // Account Switcher — orchestrates account failover on quota exhaustion. + as := switcher.New(tmuxClient, s, cfg, qm.SwitchChan(), notifier) + go as.Run(ctx) + + // Dispatcher — assigns inbox tasks to idle sessions. + disp := dispatcher.New(tmuxClient, s, cfg, sw.DoneChan()) + go disp.Run(ctx) + + // Janitor — periodic cleanup of orphaned files and stale status.json. + jan := janitor.New(s, cfg.Dispatcher.ProjectsDir) + go jan.Run(ctx) + + // State flush loop — persists state to disk every 10 seconds. + go func() { + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + s.Flush() //nolint:errcheck + } + } + }() + // Start HTTP API server. listenAddr := cfg.MCPHTTP.Listen if listenAddr == "" { @@ -65,7 +109,8 @@ func main() { os.Exit(1) } }() - log.Printf("claude-failover v%s started, API on %s", version, listenAddr) + + log.Printf("claude-failover v%s — all goroutines running", version) <-ctx.Done() log.Printf("shutdown signal received — flushing state and exiting") diff --git a/config.example.yaml b/config.example.yaml index c33b31e..5a5e5b3 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -82,3 +82,35 @@ mcp_http: bearer_token_env: CLAUDE_FAILOVER_BEARER # Paths exposed (all read-only except explicitly listed mutating routes). enable_trigger: true # allow /trigger/dispatch, /trigger/swap + +# --------------------------------------------------------------------------- +# notifications +# --------------------------------------------------------------------------- +notifications: + telegram_token_env: TELEGRAM_BOT_TOKEN + telegram_chat_id_env: TELEGRAM_CHAT_ID + resend_api_key_env: RESEND_API_KEY + notify_email_env: CLAUDE_FAILOVER_NOTIFY_EMAIL + +# --------------------------------------------------------------------------- +# dispatcher +# --------------------------------------------------------------------------- +dispatcher: + projects_dir: /home/ubuntu/projects + idle_timeout: 60m + prompt_timeout: 30s + max_dispatch_per_task: 3 + +# --------------------------------------------------------------------------- +# watcher +# --------------------------------------------------------------------------- +watcher: + interval: 30s + done_signal_dir: /tmp + idle_timeout: 60m + +# --------------------------------------------------------------------------- +# janitor +# --------------------------------------------------------------------------- +janitor: + interval: 5m diff --git a/internal/janitor/janitor.go b/internal/janitor/janitor.go new file mode 100644 index 0000000..250264b --- /dev/null +++ b/internal/janitor/janitor.go @@ -0,0 +1,199 @@ +// Package janitor provides a periodic cleanup goroutine for the claude-failover +// daemon. It removes orphaned dispatch-meta files, rewrites agent-queue +// status.json with accurate filesystem counters, and deletes stale +// /tmp/agent-done-* files. +package janitor + +import ( + "context" + "encoding/json" + "log" + "os" + "path/filepath" + "time" + + "forge.secuaas.ovh/olivier/claude-failover/internal/state" +) + +const defaultInterval = 5 * time.Minute + +// agentQueueStatus mirrors the JSON written to .agent-queue/status.json. +// Only the counter fields and state/current_task are managed here. +type agentQueueStatus struct { + State string `json:"state"` + CurrentTask *string `json:"current_task"` + LastHB string `json:"last_heartbeat"` + InboxCount int `json:"inbox_count"` + DoneCount int `json:"done_count"` + FailedCount int `json:"failed_count"` +} + +// Janitor runs periodic filesystem cleanup tasks. +type Janitor struct { + state *state.State + projectsDir string + interval time.Duration + logger *log.Logger +} + +// New returns a Janitor with the default 5-minute interval. +func New(s *state.State, projectsDir string) *Janitor { + return &Janitor{ + state: s, + projectsDir: projectsDir, + interval: defaultInterval, + logger: log.New(os.Stderr, "[janitor] ", log.LstdFlags), + } +} + +// Run starts the cleanup loop. It blocks until ctx is cancelled. +func (j *Janitor) Run(ctx context.Context) { + ticker := time.NewTicker(j.interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + j.cleanup() + } + } +} + +// cleanup performs a single cleanup pass. +func (j *Janitor) cleanup() { + j.cleanupProjects() + j.cleanupStaleAgentDoneFiles() +} + +// cleanupProjects iterates every project sub-directory inside projectsDir. +func (j *Janitor) cleanupProjects() { + entries, err := os.ReadDir(j.projectsDir) + if err != nil { + if !os.IsNotExist(err) { + j.logger.Printf("readdir %s: %v", j.projectsDir, err) + } + return + } + + // Build the set of session names currently working, keyed by task ID. + workingByTask := make(map[string]struct{}) + j.state.ForEachWorking(func(_ string, sess *state.SessionState) { + if sess.Task != nil { + workingByTask[*sess.Task] = struct{}{} + } + }) + + for _, e := range entries { + if !e.IsDir() { + continue + } + projectDir := filepath.Join(j.projectsDir, e.Name()) + inboxDir := filepath.Join(projectDir, ".agent-queue", "inbox") + doneDir := filepath.Join(projectDir, ".agent-queue", "done") + failedDir := filepath.Join(projectDir, ".agent-queue", "failed") + + j.cleanupOrphanDispatchMeta(inboxDir, workingByTask) + j.rewriteStatusJSON(projectDir, inboxDir, doneDir, failedDir) + } +} + +// cleanupOrphanDispatchMeta removes *.md.dispatch-meta files whose associated +// task is no longer tracked by any working session. +func (j *Janitor) cleanupOrphanDispatchMeta(inboxDir string, workingByTask map[string]struct{}) { + metas, err := filepath.Glob(filepath.Join(inboxDir, "*.md.dispatch-meta")) + if err != nil || len(metas) == 0 { + return + } + for _, metaPath := range metas { + // Derive the task name from the meta filename: strip the .dispatch-meta suffix. + taskFile := metaPath[:len(metaPath)-len(".dispatch-meta")] + taskName := filepath.Base(taskFile) + // Strip .md suffix to get bare task ID. + if len(taskName) > 3 { + taskName = taskName[:len(taskName)-3] // remove ".md" + } + if _, active := workingByTask[taskName]; !active { + j.logger.Printf("removing orphaned dispatch-meta: %s", metaPath) + if err := os.Remove(metaPath); err != nil && !os.IsNotExist(err) { + j.logger.Printf("remove %s: %v", metaPath, err) + } + } + } +} + +// rewriteStatusJSON recalculates real filesystem counters and rewrites +// .agent-queue/status.json, preserving state/current_task when agent is working. +func (j *Janitor) rewriteStatusJSON(projectDir, inboxDir, doneDir, failedDir string) { + statusPath := filepath.Join(projectDir, ".agent-queue", "status.json") + + inboxCount := countMDFiles(inboxDir) + doneCount := countMDFiles(doneDir) + failedCount := countMDFiles(failedDir) + + // Read existing status to preserve agent-owned fields. + existing := agentQueueStatus{ + State: "idle", + CurrentTask: nil, + } + if data, err := os.ReadFile(statusPath); err == nil { + _ = json.Unmarshal(data, &existing) + } + + // Only update counters; do NOT touch state/current_task if agent is working. + existing.InboxCount = inboxCount + existing.DoneCount = doneCount + existing.FailedCount = failedCount + existing.LastHB = time.Now().UTC().Format(time.RFC3339) + + data, err := json.Marshal(existing) + if err != nil { + j.logger.Printf("marshal status.json for %s: %v", projectDir, err) + return + } + tmp := statusPath + ".tmp" + if err := os.WriteFile(tmp, data, 0644); err != nil { + j.logger.Printf("write tmp status.json for %s: %v", projectDir, err) + return + } + if err := os.Rename(tmp, statusPath); err != nil { + j.logger.Printf("rename status.json for %s: %v", projectDir, err) + } +} + +// countMDFiles counts *.md files in dir that are NOT *.dispatched. +func countMDFiles(dir string) int { + entries, err := os.ReadDir(dir) + if err != nil { + return 0 + } + count := 0 + for _, e := range entries { + name := e.Name() + if filepath.Ext(name) == ".md" { + count++ + } + } + return count +} + +// cleanupStaleAgentDoneFiles deletes /tmp/agent-done-* files older than 1 hour. +func (j *Janitor) cleanupStaleAgentDoneFiles() { + matches, err := filepath.Glob("/tmp/agent-done-*") + if err != nil || len(matches) == 0 { + return + } + cutoff := time.Now().Add(-1 * time.Hour) + for _, path := range matches { + info, err := os.Stat(path) + if err != nil { + continue + } + if info.ModTime().Before(cutoff) { + j.logger.Printf("removing stale agent-done file: %s", path) + if err := os.Remove(path); err != nil && !os.IsNotExist(err) { + j.logger.Printf("remove %s: %v", path, err) + } + } + } +} diff --git a/internal/janitor/janitor_test.go b/internal/janitor/janitor_test.go new file mode 100644 index 0000000..67ba730 --- /dev/null +++ b/internal/janitor/janitor_test.go @@ -0,0 +1,96 @@ +package janitor + +import ( + "fmt" + "os" + "path/filepath" + "testing" + "time" + + "forge.secuaas.ovh/olivier/claude-failover/internal/state" +) + +func TestNew(t *testing.T) { + s := state.New("/tmp/test-state.json") + j := New(s, "/tmp/projects") + + if j == nil { + t.Fatal("New() returned nil") + } + if j.interval != defaultInterval { + t.Errorf("expected interval %v, got %v", defaultInterval, j.interval) + } + if j.state != s { + t.Error("state not assigned correctly") + } + if j.projectsDir != "/tmp/projects" { + t.Errorf("projectsDir not assigned correctly: %s", j.projectsDir) + } + if j.logger == nil { + t.Error("logger should not be nil") + } +} + +func TestCleanupStaleAgentDoneFiles(t *testing.T) { + // Create a stale file (older than 1h). + staleFile := fmt.Sprintf("/tmp/agent-done-test-%d", time.Now().UnixNano()) + if err := os.WriteFile(staleFile, []byte("done"), 0644); err != nil { + t.Fatalf("create stale file: %v", err) + } + // Age the file to 2 hours ago. + old := time.Now().Add(-2 * time.Hour) + if err := os.Chtimes(staleFile, old, old); err != nil { + t.Fatalf("chtimes: %v", err) + } + + // Create a recent file (should NOT be deleted). + recentFile := fmt.Sprintf("/tmp/agent-done-test-recent-%d", time.Now().UnixNano()) + if err := os.WriteFile(recentFile, []byte("recent"), 0644); err != nil { + t.Fatalf("create recent file: %v", err) + } + defer os.Remove(recentFile) + + s := state.New("/tmp/test-state.json") + j := New(s, "/tmp/projects") + j.cleanupStaleAgentDoneFiles() + + // Stale file should be gone. + if _, err := os.Stat(staleFile); !os.IsNotExist(err) { + os.Remove(staleFile) + t.Errorf("stale file %s should have been deleted", staleFile) + } + + // Recent file should still exist. + if _, err := os.Stat(recentFile); os.IsNotExist(err) { + t.Errorf("recent file %s should NOT have been deleted", recentFile) + } +} + +func TestCleanupOrphanDispatchMeta(t *testing.T) { + tmpDir := t.TempDir() + inboxDir := filepath.Join(tmpDir, ".agent-queue", "inbox") + if err := os.MkdirAll(inboxDir, 0755); err != nil { + t.Fatal(err) + } + + // Create a task file and its dispatch-meta. + taskFile := filepath.Join(inboxDir, "my-task.md") + metaFile := taskFile + ".dispatch-meta" + os.WriteFile(taskFile, []byte("task"), 0644) + os.WriteFile(metaFile, []byte("meta"), 0644) + + s := state.New("/tmp/test-state.json") + j := New(s, tmpDir) + + // No working sessions → dispatch-meta should be removed. + workingByTask := make(map[string]struct{}) + j.cleanupOrphanDispatchMeta(inboxDir, workingByTask) + + if _, err := os.Stat(metaFile); !os.IsNotExist(err) { + t.Errorf("orphan dispatch-meta %s should have been deleted", metaFile) + } + // Task file itself should still exist. + if _, err := os.Stat(taskFile); os.IsNotExist(err) { + t.Errorf("task file %s should NOT have been deleted", taskFile) + } +} diff --git a/internal/switcher/account_switcher.go b/internal/switcher/account_switcher.go new file mode 100644 index 0000000..7684872 --- /dev/null +++ b/internal/switcher/account_switcher.go @@ -0,0 +1,281 @@ +// Package switcher implements the account-switcher state machine. +// It is the only component allowed to flip the active Claude account. +package switcher + +import ( + "context" + "fmt" + "log" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "time" + + "forge.secuaas.ovh/olivier/claude-failover/internal/config" + "forge.secuaas.ovh/olivier/claude-failover/internal/notify" + "forge.secuaas.ovh/olivier/claude-failover/internal/quota" + "forge.secuaas.ovh/olivier/claude-failover/internal/state" + "forge.secuaas.ovh/olivier/claude-failover/internal/tmux" +) + +// SwitchState represents the current phase of a failover operation. +type SwitchState string + +const ( + StateNormal SwitchState = "normal" + StateSaving SwitchState = "saving" + StateSwitching SwitchState = "switching" + StateResuming SwitchState = "resuming" +) + +// resumeRe matches `claude --resume ` in pane capture output. +var resumeRe = regexp.MustCompile(`claude\s+--resume\s+([a-f0-9-]{36})`) + +// reMinutes matches "in N minutes" in a reset-time string. +var reMinutes = regexp.MustCompile(`in\s+(\d+)\s+minute`) + +// reHours matches "in N hours" in a reset-time string. +var reHours = regexp.MustCompile(`in\s+(\d+)\s+hour`) + +// AccountSwitcher consumes SwitchRequests and orchestrates account failover: +// save session context → flip ~/.claude symlink → restart sessions. +type AccountSwitcher struct { + tmux tmux.Client + state *state.State + config *config.Config + switchCh <-chan quota.SwitchRequest + notifier *notify.Notifier + currentState SwitchState + logger *log.Logger + // homeDir is the directory containing the .claude symlink. Overridable for tests. + // When empty, os.UserHomeDir() is used. + homeDir string +} + +// New creates an AccountSwitcher. +// notifier may be nil; notifications are skipped when absent. +func New( + tc tmux.Client, + s *state.State, + cfg *config.Config, + switchCh <-chan quota.SwitchRequest, + notifier *notify.Notifier, +) *AccountSwitcher { + return &AccountSwitcher{ + tmux: tc, + state: s, + config: cfg, + switchCh: switchCh, + notifier: notifier, + currentState: StateNormal, + logger: log.Default(), + } +} + +// Run starts the switcher event loop until ctx is cancelled. +func (a *AccountSwitcher) Run(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case req := <-a.switchCh: + a.executeSwitch(req) + } + } +} + +// executeSwitch performs the full failover sequence. +func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { + a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime) + + // 1. SAVING — capture resume UUIDs from all working sessions. + a.currentState = StateSaving + a.saveAllSessions() + + // 2. SWITCHING — find target, flip symlink, restart sessions. + a.currentState = StateSwitching + target := a.findTargetAccount(req.From) + if target == nil { + a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From) + a.currentState = StateNormal + return + } + + if err := a.flipSymlink(target.Home); err != nil { + a.logger.Printf("[switcher] flipSymlink error: %v", err) + } + a.killAllPoolSessions() + a.recreatePoolSessions() + + // Update active account. + a.state.SetActiveAccount(target.Name) + + // 3. RESUMING — sessions are alive, dispatcher will fill them. + a.currentState = StateResuming + + // 4. Notify. + msg := fmt.Sprintf("Switch %s → %s (reset: %s)", req.From, target.Name, req.ResetTime) + a.logger.Printf("[switcher] SWAP complete: %s", msg) + if a.notifier != nil { + a.notifier.Telegram("🔄 " + msg) //nolint:errcheck + } + + // 5. Schedule return to primary account if reset time is known. + if req.ResetTime != "" { + go a.scheduleReturn(req.From, req.ResetTime) + } + + a.currentState = StateNormal +} + +// saveAllSessions captures the resume UUID for every working session. +func (a *AccountSwitcher) saveAllSessions() { + a.state.ForEachWorking(func(name string, _ *state.SessionState) { + tail, err := a.tmux.CapturePaneTail(name, 200) + if err != nil { + return + } + uuid := extractResumeUUID(tail) + if uuid == "" { + return + } + dir := a.resumeContextDir() + os.MkdirAll(dir, 0700) + path := filepath.Join(dir, name+"-resume-id.txt") + os.WriteFile(path, []byte(uuid), 0600) + a.logger.Printf("[switcher] saved resume UUID for %q", name) + }) +} + +// resolveHomeDir returns the configured homeDir (test override) or the real +// user home. Tests MUST set a.homeDir to a tmpdir to avoid clobbering the +// production ~/.claude symlink. +func (a *AccountSwitcher) resolveHomeDir() (string, error) { + if a.homeDir != "" { + return a.homeDir, nil + } + home, err := os.UserHomeDir() + if err != nil { + return "", fmt.Errorf("UserHomeDir: %w", err) + } + return home, nil +} + +// flipSymlink replaces ~/.claude with a symlink to targetHome. +// All paths come from config — no hardcoded values. +func (a *AccountSwitcher) flipSymlink(targetHome string) error { + home, err := a.resolveHomeDir() + if err != nil { + return err + } + claudeLink := filepath.Join(home, ".claude") + os.Remove(claudeLink) + if err := os.Symlink(targetHome, claudeLink); err != nil { + return fmt.Errorf("symlink %s → %s: %w", claudeLink, targetHome, err) + } + a.logger.Printf("[switcher] ~/.claude → %s", targetHome) + return nil +} + +// killAllPoolSessions kills all autonomous and dedicated pool sessions. +func (a *AccountSwitcher) killAllPoolSessions() { + prefix := a.config.Pool.Autonomous.Prefix + if prefix == "" { + prefix = "ccl-auto-" + } + for i := 0; i < a.config.Pool.Autonomous.Max; i++ { + a.tmux.KillSession(sessionName(prefix, i)) //nolint:errcheck + } + for _, ds := range a.config.Pool.Dedicated { + a.tmux.KillSession(ds.Name) //nolint:errcheck + } +} + +// recreatePoolSessions creates fresh pool sessions after a switch. +func (a *AccountSwitcher) recreatePoolSessions() { + prefix := a.config.Pool.Autonomous.Prefix + if prefix == "" { + prefix = "ccl-auto-" + } + for i := 0; i < a.config.Pool.Autonomous.Min; i++ { + name := sessionName(prefix, i) + if err := a.tmux.CreateSession(name, ""); err != nil { + a.logger.Printf("[switcher] recreate autonomous %q: %v", name, err) + } + } + for _, ds := range a.config.Pool.Dedicated { + if err := a.tmux.CreateSession(ds.Name, ds.Project); err != nil { + a.logger.Printf("[switcher] recreate dedicated %q: %v", ds.Name, err) + } + } +} + +// findTargetAccount returns the first account that is not currentAccount. +func (a *AccountSwitcher) findTargetAccount(currentAccount string) *config.AccountConfig { + for i := range a.config.Accounts { + if a.config.Accounts[i].Name != currentAccount { + return &a.config.Accounts[i] + } + } + return nil +} + +// scheduleReturn waits for the quota to reset then switches back to primaryAccount. +func (a *AccountSwitcher) scheduleReturn(primaryAccount, resetTime string) { + dur := timeUntilReset(resetTime) + 5*time.Minute + a.logger.Printf("[switcher] return to %q scheduled in %v", primaryAccount, dur.Round(time.Minute)) + time.Sleep(dur) + a.executeSwitch(quota.SwitchRequest{ + From: a.state.ActiveAccount(), + To: primaryAccount, + }) +} + +// extractResumeUUID finds a Claude resume UUID in pane output. +func extractResumeUUID(content string) string { + m := resumeRe.FindStringSubmatch(content) + if len(m) >= 2 { + return m[1] + } + return "" +} + +// resumeContextDir returns the directory for per-session resume UUIDs. +// Honours a.homeDir override so tests never write to the real ~/.claude-context. +func (a *AccountSwitcher) resumeContextDir() string { + home, _ := a.resolveHomeDir() + return filepath.Join(home, ".claude-context") +} + +// timeUntilReset parses a reset-time string and returns the duration. +// Returns a 2-hour fallback when parsing fails. +func timeUntilReset(resetTime string) time.Duration { + lower := strings.ToLower(strings.TrimSpace(resetTime)) + if m := reMinutes.FindStringSubmatch(lower); len(m) >= 2 { + n, _ := strconv.Atoi(m[1]) + return time.Duration(n) * time.Minute + } + if m := reHours.FindStringSubmatch(lower); len(m) >= 2 { + n, _ := strconv.Atoi(m[1]) + return time.Duration(n) * time.Hour + } + return 2 * time.Hour +} + +func sessionName(prefix string, i int) string { + return prefix + itoa(i) +} + +func itoa(n int) string { + if n == 0 { + return "0" + } + b := make([]byte, 0, 10) + for n > 0 { + b = append([]byte{byte('0' + n%10)}, b...) + n /= 10 + } + return string(b) +} diff --git a/internal/switcher/account_switcher_test.go b/internal/switcher/account_switcher_test.go new file mode 100644 index 0000000..5cc6dc8 --- /dev/null +++ b/internal/switcher/account_switcher_test.go @@ -0,0 +1,166 @@ +package switcher + +import ( + "testing" + "time" + + "forge.secuaas.ovh/olivier/claude-failover/internal/config" + "forge.secuaas.ovh/olivier/claude-failover/internal/quota" + "forge.secuaas.ovh/olivier/claude-failover/internal/state" +) + +// mockTmux for switcher tests. +type mockTmux struct { + sessions map[string]bool + paneOutput map[string]string + killCalls []string + createCalls []string +} + +func newMockTmux() *mockTmux { + return &mockTmux{ + sessions: make(map[string]bool), + paneOutput: make(map[string]string), + } +} + +func (m *mockTmux) HasSession(name string) bool { return m.sessions[name] } +func (m *mockTmux) CreateSession(name, _ string) error { + m.sessions[name] = true + m.createCalls = append(m.createCalls, name) + return nil +} +func (m *mockTmux) KillSession(name string) error { + delete(m.sessions, name) + m.killCalls = append(m.killCalls, name) + return nil +} +func (m *mockTmux) SendKeys(_, _ string) error { return nil } +func (m *mockTmux) CapturePaneTail(session string, _ int) (string, error) { + return m.paneOutput[session], nil +} + +// TestFindTargetAccount returns the first account that differs from current. +func TestFindTargetAccount(t *testing.T) { + tc := newMockTmux() + s := state.New("") + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Priority: 1}, + {Name: "compte2", Priority: 2}, + }, + } + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + + target := a.findTargetAccount("compte1") + if target == nil || target.Name != "compte2" { + t.Errorf("expected compte2, got %v", target) + } +} + +// TestFindTargetAccountSingleAccount returns nil when only one account exists. +func TestFindTargetAccountSingleAccount(t *testing.T) { + tc := newMockTmux() + s := state.New("") + cfg := &config.Config{ + Accounts: []config.AccountConfig{{Name: "solo"}}, + } + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + + if got := a.findTargetAccount("solo"); got != nil { + t.Errorf("expected nil for single account, got %v", got) + } +} + +// TestExtractResumeUUID parses UUID from pane output. +func TestExtractResumeUUID(t *testing.T) { + input := "$ claude --resume a1b2c3d4-e5f6-7890-abcd-ef1234567890 --model sonnet" + got := extractResumeUUID(input) + want := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + if got != want { + t.Errorf("expected %q, got %q", want, got) + } +} + +// TestExtractResumeUUIDMissing returns empty string when no UUID present. +func TestExtractResumeUUIDMissing(t *testing.T) { + if got := extractResumeUUID("no uuid here"); got != "" { + t.Errorf("expected empty, got %q", got) + } +} + +// TestTimeUntilReset parses minute and hour formats correctly. +func TestTimeUntilReset(t *testing.T) { + cases := []struct { + input string + want time.Duration + }{ + {"in 45 minutes", 45 * time.Minute}, + {"in 2 hours", 2 * time.Hour}, + {"in 1 hour", 1 * time.Hour}, + {"", 2 * time.Hour}, + {"8pm", 2 * time.Hour}, // fallback for unrecognised formats + } + for _, c := range cases { + if got := timeUntilReset(c.input); got != c.want { + t.Errorf("timeUntilReset(%q) = %v, want %v", c.input, got, c.want) + } + } +} + +// TestKillAndRecreatePoolSessions verifies that executeSwitch restarts sessions. +func TestKillAndRecreatePoolSessions(t *testing.T) { + tc := newMockTmux() + tc.sessions["ccl-auto-0"] = true + tc.sessions["ccl-auto-1"] = true + tc.sessions["dedicated-1"] = true + + s := state.New("") + s.SetActiveAccount("compte1") + + cfg := &config.Config{ + Accounts: []config.AccountConfig{ + {Name: "compte1", Home: t.TempDir()}, + {Name: "compte2", Home: t.TempDir()}, + }, + Pool: config.PoolConfig{ + Dedicated: []config.DedicatedSession{{Name: "dedicated-1", Project: "/tmp"}}, + Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 2, Max: 2}, + }, + } + + a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil) + // CRITICAL: isolate symlink manipulation in a tmpdir so the test never + // touches the real ~/.claude (regression: a reboot used to leave Claude + // Code unusable because the test had repointed ~/.claude to /tmp/...). + a.homeDir = t.TempDir() + a.executeSwitch(quota.SwitchRequest{From: "compte1"}) + + // Active account must have changed. + if got := s.ActiveAccount(); got != "compte2" { + t.Errorf("expected active account compte2, got %q", got) + } + + // All old sessions must have been killed. + for _, name := range []string{"ccl-auto-0", "ccl-auto-1", "dedicated-1"} { + found := false + for _, k := range tc.killCalls { + if k == name { + found = true + break + } + } + if !found { + t.Errorf("expected %q to be killed", name) + } + } + + // Min pool sessions must be recreated. + recreated := map[string]bool{} + for _, c := range tc.createCalls { + recreated[c] = true + } + if !recreated["ccl-auto-0"] || !recreated["ccl-auto-1"] { + t.Errorf("expected autonomous sessions recreated; createCalls=%v", tc.createCalls) + } +} diff --git a/scripts/claude-failover.service b/scripts/claude-failover.service new file mode 100644 index 0000000..4489b9d --- /dev/null +++ b/scripts/claude-failover.service @@ -0,0 +1,17 @@ +[Unit] +Description=Claude Failover — Session Orchestrator +After=network.target + +[Service] +Type=simple +User=ubuntu +ExecStartPre=/usr/bin/loginctl enable-linger ubuntu +ExecStart=/usr/local/bin/claude-failover --config /etc/claude-failover/config.yaml +Restart=always +RestartSec=5 +KillMode=mixed +TimeoutStopSec=60 +EnvironmentFile=-/etc/claude-failover/env + +[Install] +WantedBy=multi-user.target