Compare commits
No commits in common. "28f82a66c8c663bf985661413ffe7387e27f7846" and "133165b4324697640781b0262edf556104fa30f5" have entirely different histories.
28f82a66c8
...
133165b432
8 changed files with 1 additions and 841 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -5,10 +5,6 @@ dist/
|
|||
*.exe
|
||||
*.test
|
||||
*.out
|
||||
/claude-failover
|
||||
|
||||
# Security review marker (Claude Code hook, expires after 10min)
|
||||
.security-reviewed
|
||||
|
||||
# Go
|
||||
vendor/
|
||||
|
|
|
|||
|
|
@ -8,19 +8,12 @@ import (
|
|||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/api"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/dispatcher"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/janitor"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/lifecycle"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/notify"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/switcher"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/watcher"
|
||||
)
|
||||
|
||||
const version = "0.1.0"
|
||||
|
|
@ -60,43 +53,6 @@ func main() {
|
|||
|
||||
go lm.Run(ctx)
|
||||
|
||||
// Notifier — reads credentials from environment variables.
|
||||
notifier := notify.New(cfg)
|
||||
|
||||
// Session Watcher — detects when sessions finish their tasks.
|
||||
sw := watcher.New(tmuxClient, s, cfg)
|
||||
go sw.Run(ctx)
|
||||
|
||||
// Quota Monitor — polls panes for quota exhaustion signals.
|
||||
qm := quota.New(tmuxClient, s, cfg)
|
||||
go qm.Run(ctx)
|
||||
|
||||
// Account Switcher — orchestrates account failover on quota exhaustion.
|
||||
as := switcher.New(tmuxClient, s, cfg, qm.SwitchChan(), notifier)
|
||||
go as.Run(ctx)
|
||||
|
||||
// Dispatcher — assigns inbox tasks to idle sessions.
|
||||
disp := dispatcher.New(tmuxClient, s, cfg, sw.DoneChan())
|
||||
go disp.Run(ctx)
|
||||
|
||||
// Janitor — periodic cleanup of orphaned files and stale status.json.
|
||||
jan := janitor.New(s, cfg.Dispatcher.ProjectsDir)
|
||||
go jan.Run(ctx)
|
||||
|
||||
// State flush loop — persists state to disk every 10 seconds.
|
||||
go func() {
|
||||
ticker := time.NewTicker(10 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
s.Flush() //nolint:errcheck
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Start HTTP API server.
|
||||
listenAddr := cfg.MCPHTTP.Listen
|
||||
if listenAddr == "" {
|
||||
|
|
@ -109,8 +65,7 @@ func main() {
|
|||
os.Exit(1)
|
||||
}
|
||||
}()
|
||||
|
||||
log.Printf("claude-failover v%s — all goroutines running", version)
|
||||
log.Printf("claude-failover v%s started, API on %s", version, listenAddr)
|
||||
|
||||
<-ctx.Done()
|
||||
log.Printf("shutdown signal received — flushing state and exiting")
|
||||
|
|
|
|||
|
|
@ -82,35 +82,3 @@ mcp_http:
|
|||
bearer_token_env: CLAUDE_FAILOVER_BEARER
|
||||
# Paths exposed (all read-only except explicitly listed mutating routes).
|
||||
enable_trigger: true # allow /trigger/dispatch, /trigger/swap
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# notifications
|
||||
# ---------------------------------------------------------------------------
|
||||
notifications:
|
||||
telegram_token_env: TELEGRAM_BOT_TOKEN
|
||||
telegram_chat_id_env: TELEGRAM_CHAT_ID
|
||||
resend_api_key_env: RESEND_API_KEY
|
||||
notify_email_env: CLAUDE_FAILOVER_NOTIFY_EMAIL
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# dispatcher
|
||||
# ---------------------------------------------------------------------------
|
||||
dispatcher:
|
||||
projects_dir: /home/ubuntu/projects
|
||||
idle_timeout: 60m
|
||||
prompt_timeout: 30s
|
||||
max_dispatch_per_task: 3
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# watcher
|
||||
# ---------------------------------------------------------------------------
|
||||
watcher:
|
||||
interval: 30s
|
||||
done_signal_dir: /tmp
|
||||
idle_timeout: 60m
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# janitor
|
||||
# ---------------------------------------------------------------------------
|
||||
janitor:
|
||||
interval: 5m
|
||||
|
|
|
|||
|
|
@ -1,199 +0,0 @@
|
|||
// Package janitor provides a periodic cleanup goroutine for the claude-failover
|
||||
// daemon. It removes orphaned dispatch-meta files, rewrites agent-queue
|
||||
// status.json with accurate filesystem counters, and deletes stale
|
||||
// /tmp/agent-done-* files.
|
||||
package janitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
|
||||
)
|
||||
|
||||
const defaultInterval = 5 * time.Minute
|
||||
|
||||
// agentQueueStatus mirrors the JSON written to .agent-queue/status.json.
|
||||
// Only the counter fields and state/current_task are managed here.
|
||||
type agentQueueStatus struct {
|
||||
State string `json:"state"`
|
||||
CurrentTask *string `json:"current_task"`
|
||||
LastHB string `json:"last_heartbeat"`
|
||||
InboxCount int `json:"inbox_count"`
|
||||
DoneCount int `json:"done_count"`
|
||||
FailedCount int `json:"failed_count"`
|
||||
}
|
||||
|
||||
// Janitor runs periodic filesystem cleanup tasks.
|
||||
type Janitor struct {
|
||||
state *state.State
|
||||
projectsDir string
|
||||
interval time.Duration
|
||||
logger *log.Logger
|
||||
}
|
||||
|
||||
// New returns a Janitor with the default 5-minute interval.
|
||||
func New(s *state.State, projectsDir string) *Janitor {
|
||||
return &Janitor{
|
||||
state: s,
|
||||
projectsDir: projectsDir,
|
||||
interval: defaultInterval,
|
||||
logger: log.New(os.Stderr, "[janitor] ", log.LstdFlags),
|
||||
}
|
||||
}
|
||||
|
||||
// Run starts the cleanup loop. It blocks until ctx is cancelled.
|
||||
func (j *Janitor) Run(ctx context.Context) {
|
||||
ticker := time.NewTicker(j.interval)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
j.cleanup()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup performs a single cleanup pass.
|
||||
func (j *Janitor) cleanup() {
|
||||
j.cleanupProjects()
|
||||
j.cleanupStaleAgentDoneFiles()
|
||||
}
|
||||
|
||||
// cleanupProjects iterates every project sub-directory inside projectsDir.
|
||||
func (j *Janitor) cleanupProjects() {
|
||||
entries, err := os.ReadDir(j.projectsDir)
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
j.logger.Printf("readdir %s: %v", j.projectsDir, err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Build the set of session names currently working, keyed by task ID.
|
||||
workingByTask := make(map[string]struct{})
|
||||
j.state.ForEachWorking(func(_ string, sess *state.SessionState) {
|
||||
if sess.Task != nil {
|
||||
workingByTask[*sess.Task] = struct{}{}
|
||||
}
|
||||
})
|
||||
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
projectDir := filepath.Join(j.projectsDir, e.Name())
|
||||
inboxDir := filepath.Join(projectDir, ".agent-queue", "inbox")
|
||||
doneDir := filepath.Join(projectDir, ".agent-queue", "done")
|
||||
failedDir := filepath.Join(projectDir, ".agent-queue", "failed")
|
||||
|
||||
j.cleanupOrphanDispatchMeta(inboxDir, workingByTask)
|
||||
j.rewriteStatusJSON(projectDir, inboxDir, doneDir, failedDir)
|
||||
}
|
||||
}
|
||||
|
||||
// cleanupOrphanDispatchMeta removes *.md.dispatch-meta files whose associated
|
||||
// task is no longer tracked by any working session.
|
||||
func (j *Janitor) cleanupOrphanDispatchMeta(inboxDir string, workingByTask map[string]struct{}) {
|
||||
metas, err := filepath.Glob(filepath.Join(inboxDir, "*.md.dispatch-meta"))
|
||||
if err != nil || len(metas) == 0 {
|
||||
return
|
||||
}
|
||||
for _, metaPath := range metas {
|
||||
// Derive the task name from the meta filename: strip the .dispatch-meta suffix.
|
||||
taskFile := metaPath[:len(metaPath)-len(".dispatch-meta")]
|
||||
taskName := filepath.Base(taskFile)
|
||||
// Strip .md suffix to get bare task ID.
|
||||
if len(taskName) > 3 {
|
||||
taskName = taskName[:len(taskName)-3] // remove ".md"
|
||||
}
|
||||
if _, active := workingByTask[taskName]; !active {
|
||||
j.logger.Printf("removing orphaned dispatch-meta: %s", metaPath)
|
||||
if err := os.Remove(metaPath); err != nil && !os.IsNotExist(err) {
|
||||
j.logger.Printf("remove %s: %v", metaPath, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// rewriteStatusJSON recalculates real filesystem counters and rewrites
|
||||
// .agent-queue/status.json, preserving state/current_task when agent is working.
|
||||
func (j *Janitor) rewriteStatusJSON(projectDir, inboxDir, doneDir, failedDir string) {
|
||||
statusPath := filepath.Join(projectDir, ".agent-queue", "status.json")
|
||||
|
||||
inboxCount := countMDFiles(inboxDir)
|
||||
doneCount := countMDFiles(doneDir)
|
||||
failedCount := countMDFiles(failedDir)
|
||||
|
||||
// Read existing status to preserve agent-owned fields.
|
||||
existing := agentQueueStatus{
|
||||
State: "idle",
|
||||
CurrentTask: nil,
|
||||
}
|
||||
if data, err := os.ReadFile(statusPath); err == nil {
|
||||
_ = json.Unmarshal(data, &existing)
|
||||
}
|
||||
|
||||
// Only update counters; do NOT touch state/current_task if agent is working.
|
||||
existing.InboxCount = inboxCount
|
||||
existing.DoneCount = doneCount
|
||||
existing.FailedCount = failedCount
|
||||
existing.LastHB = time.Now().UTC().Format(time.RFC3339)
|
||||
|
||||
data, err := json.Marshal(existing)
|
||||
if err != nil {
|
||||
j.logger.Printf("marshal status.json for %s: %v", projectDir, err)
|
||||
return
|
||||
}
|
||||
tmp := statusPath + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
j.logger.Printf("write tmp status.json for %s: %v", projectDir, err)
|
||||
return
|
||||
}
|
||||
if err := os.Rename(tmp, statusPath); err != nil {
|
||||
j.logger.Printf("rename status.json for %s: %v", projectDir, err)
|
||||
}
|
||||
}
|
||||
|
||||
// countMDFiles counts *.md files in dir that are NOT *.dispatched.
|
||||
func countMDFiles(dir string) int {
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
count := 0
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if filepath.Ext(name) == ".md" {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// cleanupStaleAgentDoneFiles deletes /tmp/agent-done-* files older than 1 hour.
|
||||
func (j *Janitor) cleanupStaleAgentDoneFiles() {
|
||||
matches, err := filepath.Glob("/tmp/agent-done-*")
|
||||
if err != nil || len(matches) == 0 {
|
||||
return
|
||||
}
|
||||
cutoff := time.Now().Add(-1 * time.Hour)
|
||||
for _, path := range matches {
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if info.ModTime().Before(cutoff) {
|
||||
j.logger.Printf("removing stale agent-done file: %s", path)
|
||||
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
|
||||
j.logger.Printf("remove %s: %v", path, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,96 +0,0 @@
|
|||
package janitor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
|
||||
)
|
||||
|
||||
func TestNew(t *testing.T) {
|
||||
s := state.New("/tmp/test-state.json")
|
||||
j := New(s, "/tmp/projects")
|
||||
|
||||
if j == nil {
|
||||
t.Fatal("New() returned nil")
|
||||
}
|
||||
if j.interval != defaultInterval {
|
||||
t.Errorf("expected interval %v, got %v", defaultInterval, j.interval)
|
||||
}
|
||||
if j.state != s {
|
||||
t.Error("state not assigned correctly")
|
||||
}
|
||||
if j.projectsDir != "/tmp/projects" {
|
||||
t.Errorf("projectsDir not assigned correctly: %s", j.projectsDir)
|
||||
}
|
||||
if j.logger == nil {
|
||||
t.Error("logger should not be nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCleanupStaleAgentDoneFiles(t *testing.T) {
|
||||
// Create a stale file (older than 1h).
|
||||
staleFile := fmt.Sprintf("/tmp/agent-done-test-%d", time.Now().UnixNano())
|
||||
if err := os.WriteFile(staleFile, []byte("done"), 0644); err != nil {
|
||||
t.Fatalf("create stale file: %v", err)
|
||||
}
|
||||
// Age the file to 2 hours ago.
|
||||
old := time.Now().Add(-2 * time.Hour)
|
||||
if err := os.Chtimes(staleFile, old, old); err != nil {
|
||||
t.Fatalf("chtimes: %v", err)
|
||||
}
|
||||
|
||||
// Create a recent file (should NOT be deleted).
|
||||
recentFile := fmt.Sprintf("/tmp/agent-done-test-recent-%d", time.Now().UnixNano())
|
||||
if err := os.WriteFile(recentFile, []byte("recent"), 0644); err != nil {
|
||||
t.Fatalf("create recent file: %v", err)
|
||||
}
|
||||
defer os.Remove(recentFile)
|
||||
|
||||
s := state.New("/tmp/test-state.json")
|
||||
j := New(s, "/tmp/projects")
|
||||
j.cleanupStaleAgentDoneFiles()
|
||||
|
||||
// Stale file should be gone.
|
||||
if _, err := os.Stat(staleFile); !os.IsNotExist(err) {
|
||||
os.Remove(staleFile)
|
||||
t.Errorf("stale file %s should have been deleted", staleFile)
|
||||
}
|
||||
|
||||
// Recent file should still exist.
|
||||
if _, err := os.Stat(recentFile); os.IsNotExist(err) {
|
||||
t.Errorf("recent file %s should NOT have been deleted", recentFile)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCleanupOrphanDispatchMeta(t *testing.T) {
|
||||
tmpDir := t.TempDir()
|
||||
inboxDir := filepath.Join(tmpDir, ".agent-queue", "inbox")
|
||||
if err := os.MkdirAll(inboxDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create a task file and its dispatch-meta.
|
||||
taskFile := filepath.Join(inboxDir, "my-task.md")
|
||||
metaFile := taskFile + ".dispatch-meta"
|
||||
os.WriteFile(taskFile, []byte("task"), 0644)
|
||||
os.WriteFile(metaFile, []byte("meta"), 0644)
|
||||
|
||||
s := state.New("/tmp/test-state.json")
|
||||
j := New(s, tmpDir)
|
||||
|
||||
// No working sessions → dispatch-meta should be removed.
|
||||
workingByTask := make(map[string]struct{})
|
||||
j.cleanupOrphanDispatchMeta(inboxDir, workingByTask)
|
||||
|
||||
if _, err := os.Stat(metaFile); !os.IsNotExist(err) {
|
||||
t.Errorf("orphan dispatch-meta %s should have been deleted", metaFile)
|
||||
}
|
||||
// Task file itself should still exist.
|
||||
if _, err := os.Stat(taskFile); os.IsNotExist(err) {
|
||||
t.Errorf("task file %s should NOT have been deleted", taskFile)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,281 +0,0 @@
|
|||
// Package switcher implements the account-switcher state machine.
|
||||
// It is the only component allowed to flip the active Claude account.
|
||||
package switcher
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/notify"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
|
||||
)
|
||||
|
||||
// SwitchState represents the current phase of a failover operation.
|
||||
type SwitchState string
|
||||
|
||||
const (
|
||||
StateNormal SwitchState = "normal"
|
||||
StateSaving SwitchState = "saving"
|
||||
StateSwitching SwitchState = "switching"
|
||||
StateResuming SwitchState = "resuming"
|
||||
)
|
||||
|
||||
// resumeRe matches `claude --resume <uuid>` in pane capture output.
|
||||
var resumeRe = regexp.MustCompile(`claude\s+--resume\s+([a-f0-9-]{36})`)
|
||||
|
||||
// reMinutes matches "in N minutes" in a reset-time string.
|
||||
var reMinutes = regexp.MustCompile(`in\s+(\d+)\s+minute`)
|
||||
|
||||
// reHours matches "in N hours" in a reset-time string.
|
||||
var reHours = regexp.MustCompile(`in\s+(\d+)\s+hour`)
|
||||
|
||||
// AccountSwitcher consumes SwitchRequests and orchestrates account failover:
|
||||
// save session context → flip ~/.claude symlink → restart sessions.
|
||||
type AccountSwitcher struct {
|
||||
tmux tmux.Client
|
||||
state *state.State
|
||||
config *config.Config
|
||||
switchCh <-chan quota.SwitchRequest
|
||||
notifier *notify.Notifier
|
||||
currentState SwitchState
|
||||
logger *log.Logger
|
||||
// homeDir is the directory containing the .claude symlink. Overridable for tests.
|
||||
// When empty, os.UserHomeDir() is used.
|
||||
homeDir string
|
||||
}
|
||||
|
||||
// New creates an AccountSwitcher.
|
||||
// notifier may be nil; notifications are skipped when absent.
|
||||
func New(
|
||||
tc tmux.Client,
|
||||
s *state.State,
|
||||
cfg *config.Config,
|
||||
switchCh <-chan quota.SwitchRequest,
|
||||
notifier *notify.Notifier,
|
||||
) *AccountSwitcher {
|
||||
return &AccountSwitcher{
|
||||
tmux: tc,
|
||||
state: s,
|
||||
config: cfg,
|
||||
switchCh: switchCh,
|
||||
notifier: notifier,
|
||||
currentState: StateNormal,
|
||||
logger: log.Default(),
|
||||
}
|
||||
}
|
||||
|
||||
// Run starts the switcher event loop until ctx is cancelled.
|
||||
func (a *AccountSwitcher) Run(ctx context.Context) {
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case req := <-a.switchCh:
|
||||
a.executeSwitch(req)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// executeSwitch performs the full failover sequence.
|
||||
func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) {
|
||||
a.logger.Printf("[switcher] SWAP initiated from=%q reset=%q", req.From, req.ResetTime)
|
||||
|
||||
// 1. SAVING — capture resume UUIDs from all working sessions.
|
||||
a.currentState = StateSaving
|
||||
a.saveAllSessions()
|
||||
|
||||
// 2. SWITCHING — find target, flip symlink, restart sessions.
|
||||
a.currentState = StateSwitching
|
||||
target := a.findTargetAccount(req.From)
|
||||
if target == nil {
|
||||
a.logger.Printf("[switcher] no alternate account found for %q — aborting swap", req.From)
|
||||
a.currentState = StateNormal
|
||||
return
|
||||
}
|
||||
|
||||
if err := a.flipSymlink(target.Home); err != nil {
|
||||
a.logger.Printf("[switcher] flipSymlink error: %v", err)
|
||||
}
|
||||
a.killAllPoolSessions()
|
||||
a.recreatePoolSessions()
|
||||
|
||||
// Update active account.
|
||||
a.state.SetActiveAccount(target.Name)
|
||||
|
||||
// 3. RESUMING — sessions are alive, dispatcher will fill them.
|
||||
a.currentState = StateResuming
|
||||
|
||||
// 4. Notify.
|
||||
msg := fmt.Sprintf("Switch %s → %s (reset: %s)", req.From, target.Name, req.ResetTime)
|
||||
a.logger.Printf("[switcher] SWAP complete: %s", msg)
|
||||
if a.notifier != nil {
|
||||
a.notifier.Telegram("🔄 " + msg) //nolint:errcheck
|
||||
}
|
||||
|
||||
// 5. Schedule return to primary account if reset time is known.
|
||||
if req.ResetTime != "" {
|
||||
go a.scheduleReturn(req.From, req.ResetTime)
|
||||
}
|
||||
|
||||
a.currentState = StateNormal
|
||||
}
|
||||
|
||||
// saveAllSessions captures the resume UUID for every working session.
|
||||
func (a *AccountSwitcher) saveAllSessions() {
|
||||
a.state.ForEachWorking(func(name string, _ *state.SessionState) {
|
||||
tail, err := a.tmux.CapturePaneTail(name, 200)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
uuid := extractResumeUUID(tail)
|
||||
if uuid == "" {
|
||||
return
|
||||
}
|
||||
dir := a.resumeContextDir()
|
||||
os.MkdirAll(dir, 0700)
|
||||
path := filepath.Join(dir, name+"-resume-id.txt")
|
||||
os.WriteFile(path, []byte(uuid), 0600)
|
||||
a.logger.Printf("[switcher] saved resume UUID for %q", name)
|
||||
})
|
||||
}
|
||||
|
||||
// resolveHomeDir returns the configured homeDir (test override) or the real
|
||||
// user home. Tests MUST set a.homeDir to a tmpdir to avoid clobbering the
|
||||
// production ~/.claude symlink.
|
||||
func (a *AccountSwitcher) resolveHomeDir() (string, error) {
|
||||
if a.homeDir != "" {
|
||||
return a.homeDir, nil
|
||||
}
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("UserHomeDir: %w", err)
|
||||
}
|
||||
return home, nil
|
||||
}
|
||||
|
||||
// flipSymlink replaces ~/.claude with a symlink to targetHome.
|
||||
// All paths come from config — no hardcoded values.
|
||||
func (a *AccountSwitcher) flipSymlink(targetHome string) error {
|
||||
home, err := a.resolveHomeDir()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
claudeLink := filepath.Join(home, ".claude")
|
||||
os.Remove(claudeLink)
|
||||
if err := os.Symlink(targetHome, claudeLink); err != nil {
|
||||
return fmt.Errorf("symlink %s → %s: %w", claudeLink, targetHome, err)
|
||||
}
|
||||
a.logger.Printf("[switcher] ~/.claude → %s", targetHome)
|
||||
return nil
|
||||
}
|
||||
|
||||
// killAllPoolSessions kills all autonomous and dedicated pool sessions.
|
||||
func (a *AccountSwitcher) killAllPoolSessions() {
|
||||
prefix := a.config.Pool.Autonomous.Prefix
|
||||
if prefix == "" {
|
||||
prefix = "ccl-auto-"
|
||||
}
|
||||
for i := 0; i < a.config.Pool.Autonomous.Max; i++ {
|
||||
a.tmux.KillSession(sessionName(prefix, i)) //nolint:errcheck
|
||||
}
|
||||
for _, ds := range a.config.Pool.Dedicated {
|
||||
a.tmux.KillSession(ds.Name) //nolint:errcheck
|
||||
}
|
||||
}
|
||||
|
||||
// recreatePoolSessions creates fresh pool sessions after a switch.
|
||||
func (a *AccountSwitcher) recreatePoolSessions() {
|
||||
prefix := a.config.Pool.Autonomous.Prefix
|
||||
if prefix == "" {
|
||||
prefix = "ccl-auto-"
|
||||
}
|
||||
for i := 0; i < a.config.Pool.Autonomous.Min; i++ {
|
||||
name := sessionName(prefix, i)
|
||||
if err := a.tmux.CreateSession(name, ""); err != nil {
|
||||
a.logger.Printf("[switcher] recreate autonomous %q: %v", name, err)
|
||||
}
|
||||
}
|
||||
for _, ds := range a.config.Pool.Dedicated {
|
||||
if err := a.tmux.CreateSession(ds.Name, ds.Project); err != nil {
|
||||
a.logger.Printf("[switcher] recreate dedicated %q: %v", ds.Name, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// findTargetAccount returns the first account that is not currentAccount.
|
||||
func (a *AccountSwitcher) findTargetAccount(currentAccount string) *config.AccountConfig {
|
||||
for i := range a.config.Accounts {
|
||||
if a.config.Accounts[i].Name != currentAccount {
|
||||
return &a.config.Accounts[i]
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// scheduleReturn waits for the quota to reset then switches back to primaryAccount.
|
||||
func (a *AccountSwitcher) scheduleReturn(primaryAccount, resetTime string) {
|
||||
dur := timeUntilReset(resetTime) + 5*time.Minute
|
||||
a.logger.Printf("[switcher] return to %q scheduled in %v", primaryAccount, dur.Round(time.Minute))
|
||||
time.Sleep(dur)
|
||||
a.executeSwitch(quota.SwitchRequest{
|
||||
From: a.state.ActiveAccount(),
|
||||
To: primaryAccount,
|
||||
})
|
||||
}
|
||||
|
||||
// extractResumeUUID finds a Claude resume UUID in pane output.
|
||||
func extractResumeUUID(content string) string {
|
||||
m := resumeRe.FindStringSubmatch(content)
|
||||
if len(m) >= 2 {
|
||||
return m[1]
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// resumeContextDir returns the directory for per-session resume UUIDs.
|
||||
// Honours a.homeDir override so tests never write to the real ~/.claude-context.
|
||||
func (a *AccountSwitcher) resumeContextDir() string {
|
||||
home, _ := a.resolveHomeDir()
|
||||
return filepath.Join(home, ".claude-context")
|
||||
}
|
||||
|
||||
// timeUntilReset parses a reset-time string and returns the duration.
|
||||
// Returns a 2-hour fallback when parsing fails.
|
||||
func timeUntilReset(resetTime string) time.Duration {
|
||||
lower := strings.ToLower(strings.TrimSpace(resetTime))
|
||||
if m := reMinutes.FindStringSubmatch(lower); len(m) >= 2 {
|
||||
n, _ := strconv.Atoi(m[1])
|
||||
return time.Duration(n) * time.Minute
|
||||
}
|
||||
if m := reHours.FindStringSubmatch(lower); len(m) >= 2 {
|
||||
n, _ := strconv.Atoi(m[1])
|
||||
return time.Duration(n) * time.Hour
|
||||
}
|
||||
return 2 * time.Hour
|
||||
}
|
||||
|
||||
func sessionName(prefix string, i int) string {
|
||||
return prefix + itoa(i)
|
||||
}
|
||||
|
||||
func itoa(n int) string {
|
||||
if n == 0 {
|
||||
return "0"
|
||||
}
|
||||
b := make([]byte, 0, 10)
|
||||
for n > 0 {
|
||||
b = append([]byte{byte('0' + n%10)}, b...)
|
||||
n /= 10
|
||||
}
|
||||
return string(b)
|
||||
}
|
||||
|
|
@ -1,166 +0,0 @@
|
|||
package switcher
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
|
||||
)
|
||||
|
||||
// mockTmux for switcher tests.
|
||||
type mockTmux struct {
|
||||
sessions map[string]bool
|
||||
paneOutput map[string]string
|
||||
killCalls []string
|
||||
createCalls []string
|
||||
}
|
||||
|
||||
func newMockTmux() *mockTmux {
|
||||
return &mockTmux{
|
||||
sessions: make(map[string]bool),
|
||||
paneOutput: make(map[string]string),
|
||||
}
|
||||
}
|
||||
|
||||
func (m *mockTmux) HasSession(name string) bool { return m.sessions[name] }
|
||||
func (m *mockTmux) CreateSession(name, _ string) error {
|
||||
m.sessions[name] = true
|
||||
m.createCalls = append(m.createCalls, name)
|
||||
return nil
|
||||
}
|
||||
func (m *mockTmux) KillSession(name string) error {
|
||||
delete(m.sessions, name)
|
||||
m.killCalls = append(m.killCalls, name)
|
||||
return nil
|
||||
}
|
||||
func (m *mockTmux) SendKeys(_, _ string) error { return nil }
|
||||
func (m *mockTmux) CapturePaneTail(session string, _ int) (string, error) {
|
||||
return m.paneOutput[session], nil
|
||||
}
|
||||
|
||||
// TestFindTargetAccount returns the first account that differs from current.
|
||||
func TestFindTargetAccount(t *testing.T) {
|
||||
tc := newMockTmux()
|
||||
s := state.New("")
|
||||
cfg := &config.Config{
|
||||
Accounts: []config.AccountConfig{
|
||||
{Name: "compte1", Priority: 1},
|
||||
{Name: "compte2", Priority: 2},
|
||||
},
|
||||
}
|
||||
a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil)
|
||||
|
||||
target := a.findTargetAccount("compte1")
|
||||
if target == nil || target.Name != "compte2" {
|
||||
t.Errorf("expected compte2, got %v", target)
|
||||
}
|
||||
}
|
||||
|
||||
// TestFindTargetAccountSingleAccount returns nil when only one account exists.
|
||||
func TestFindTargetAccountSingleAccount(t *testing.T) {
|
||||
tc := newMockTmux()
|
||||
s := state.New("")
|
||||
cfg := &config.Config{
|
||||
Accounts: []config.AccountConfig{{Name: "solo"}},
|
||||
}
|
||||
a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil)
|
||||
|
||||
if got := a.findTargetAccount("solo"); got != nil {
|
||||
t.Errorf("expected nil for single account, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractResumeUUID parses UUID from pane output.
|
||||
func TestExtractResumeUUID(t *testing.T) {
|
||||
input := "$ claude --resume a1b2c3d4-e5f6-7890-abcd-ef1234567890 --model sonnet"
|
||||
got := extractResumeUUID(input)
|
||||
want := "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
|
||||
if got != want {
|
||||
t.Errorf("expected %q, got %q", want, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestExtractResumeUUIDMissing returns empty string when no UUID present.
|
||||
func TestExtractResumeUUIDMissing(t *testing.T) {
|
||||
if got := extractResumeUUID("no uuid here"); got != "" {
|
||||
t.Errorf("expected empty, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestTimeUntilReset parses minute and hour formats correctly.
|
||||
func TestTimeUntilReset(t *testing.T) {
|
||||
cases := []struct {
|
||||
input string
|
||||
want time.Duration
|
||||
}{
|
||||
{"in 45 minutes", 45 * time.Minute},
|
||||
{"in 2 hours", 2 * time.Hour},
|
||||
{"in 1 hour", 1 * time.Hour},
|
||||
{"", 2 * time.Hour},
|
||||
{"8pm", 2 * time.Hour}, // fallback for unrecognised formats
|
||||
}
|
||||
for _, c := range cases {
|
||||
if got := timeUntilReset(c.input); got != c.want {
|
||||
t.Errorf("timeUntilReset(%q) = %v, want %v", c.input, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestKillAndRecreatePoolSessions verifies that executeSwitch restarts sessions.
|
||||
func TestKillAndRecreatePoolSessions(t *testing.T) {
|
||||
tc := newMockTmux()
|
||||
tc.sessions["ccl-auto-0"] = true
|
||||
tc.sessions["ccl-auto-1"] = true
|
||||
tc.sessions["dedicated-1"] = true
|
||||
|
||||
s := state.New("")
|
||||
s.SetActiveAccount("compte1")
|
||||
|
||||
cfg := &config.Config{
|
||||
Accounts: []config.AccountConfig{
|
||||
{Name: "compte1", Home: t.TempDir()},
|
||||
{Name: "compte2", Home: t.TempDir()},
|
||||
},
|
||||
Pool: config.PoolConfig{
|
||||
Dedicated: []config.DedicatedSession{{Name: "dedicated-1", Project: "/tmp"}},
|
||||
Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Min: 2, Max: 2},
|
||||
},
|
||||
}
|
||||
|
||||
a := New(tc, s, cfg, make(chan quota.SwitchRequest), nil)
|
||||
// CRITICAL: isolate symlink manipulation in a tmpdir so the test never
|
||||
// touches the real ~/.claude (regression: a reboot used to leave Claude
|
||||
// Code unusable because the test had repointed ~/.claude to /tmp/...).
|
||||
a.homeDir = t.TempDir()
|
||||
a.executeSwitch(quota.SwitchRequest{From: "compte1"})
|
||||
|
||||
// Active account must have changed.
|
||||
if got := s.ActiveAccount(); got != "compte2" {
|
||||
t.Errorf("expected active account compte2, got %q", got)
|
||||
}
|
||||
|
||||
// All old sessions must have been killed.
|
||||
for _, name := range []string{"ccl-auto-0", "ccl-auto-1", "dedicated-1"} {
|
||||
found := false
|
||||
for _, k := range tc.killCalls {
|
||||
if k == name {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("expected %q to be killed", name)
|
||||
}
|
||||
}
|
||||
|
||||
// Min pool sessions must be recreated.
|
||||
recreated := map[string]bool{}
|
||||
for _, c := range tc.createCalls {
|
||||
recreated[c] = true
|
||||
}
|
||||
if !recreated["ccl-auto-0"] || !recreated["ccl-auto-1"] {
|
||||
t.Errorf("expected autonomous sessions recreated; createCalls=%v", tc.createCalls)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
[Unit]
|
||||
Description=Claude Failover — Session Orchestrator
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=ubuntu
|
||||
ExecStartPre=/usr/bin/loginctl enable-linger ubuntu
|
||||
ExecStart=/usr/local/bin/claude-failover --config /etc/claude-failover/config.yaml
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
KillMode=mixed
|
||||
TimeoutStopSec=60
|
||||
EnvironmentFile=-/etc/claude-failover/env
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
Loading…
Add table
Add a link
Reference in a new issue