Wire symlinks.ValidateAll into the lifecycle manager so the daemon refuses to start if any configured account is missing one of the shared-state symlinks or if a link diverges from the canonical target. Previously, a missing link on a freshly deployed VM would silently create a divergent state tree per account (duplicate JSONL transcripts, broken undo history) — exactly the failure mode the symlinks package (A1) was introduced to prevent. The check runs once at startup before EnsureAllSessions, guarding a single well-defined invariant: "every account home shares the same projects/, file-history/ and session-env/ roots". No auto-heal on divergence — we fail fast with an explicit error so the operator fixes it manually rather than one account's state being overwritten. Part of Phase 1 Chantier A — Failover robuste. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
129 lines
3.8 KiB
Go
129 lines
3.8 KiB
Go
// Package main is the entrypoint for the claude-failover daemon.
|
|
package main
|
|
|
|
import (
	"context"
	"flag"
	"log"
	"os"
	"os/signal"
	"path/filepath"
	"syscall"
	"time"

	"forge.secuaas.ovh/olivier/claude-failover/internal/api"
	"forge.secuaas.ovh/olivier/claude-failover/internal/config"
	"forge.secuaas.ovh/olivier/claude-failover/internal/dispatcher"
	"forge.secuaas.ovh/olivier/claude-failover/internal/janitor"
	"forge.secuaas.ovh/olivier/claude-failover/internal/lifecycle"
	"forge.secuaas.ovh/olivier/claude-failover/internal/notify"
	"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
	"forge.secuaas.ovh/olivier/claude-failover/internal/state"
	"forge.secuaas.ovh/olivier/claude-failover/internal/switcher"
	"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
	"forge.secuaas.ovh/olivier/claude-failover/internal/watcher"
)
|
|
|
|
// version is the daemon version string included in the startup and
// "all goroutines running" log lines.
const version = "0.1.0"
|
|
|
|
func main() {
|
|
var cfgPath string
|
|
flag.StringVar(&cfgPath, "config", "config.yaml", "path to YAML config file")
|
|
flag.Parse()
|
|
|
|
log.SetFlags(log.LstdFlags | log.Lmicroseconds | log.LUTC)
|
|
log.Printf("claude-failover v%s starting (config=%s)", version, cfgPath)
|
|
|
|
cfg, err := config.Load(cfgPath)
|
|
if err != nil {
|
|
log.Fatalf("config load failed: %v", err)
|
|
}
|
|
log.Printf("config loaded: %d account(s), pool min=%d max=%d",
|
|
len(cfg.Accounts), cfg.Pool.Autonomous.Min, cfg.Pool.Autonomous.Max)
|
|
|
|
// Initialise state — reload from disk if a snapshot exists.
|
|
stateFile := cfg.Checkpoint.Dir + "/state.json"
|
|
s, err := state.LoadFromFile(stateFile)
|
|
if err != nil {
|
|
log.Fatalf("state init failed: %v", err)
|
|
}
|
|
log.Printf("state loaded (%d sessions tracked)", len(s.Sessions))
|
|
|
|
// Initialise tmux client and lifecycle manager.
|
|
tmuxClient := tmux.NewExecClient()
|
|
lm := lifecycle.New(tmuxClient, s, cfg)
|
|
|
|
// Validate (and self-heal) the shared-state symlinks BEFORE spawning
|
|
// any sessions. A divergent link would silently fork transcripts
|
|
// between accounts and make failover destructive, so we fail fast here
|
|
// rather than after work is in flight.
|
|
if err := lm.ValidateSharedSymlinks(); err != nil {
|
|
log.Fatalf("shared symlinks validation failed: %v", err)
|
|
}
|
|
|
|
lm.EnsureAllSessions()
|
|
|
|
// Block until SIGINT or SIGTERM.
|
|
ctx, cancel := signal.NotifyContext(context.Background(),
|
|
syscall.SIGINT, syscall.SIGTERM)
|
|
defer cancel()
|
|
|
|
go lm.Run(ctx)
|
|
|
|
// Notifier — reads credentials from environment variables.
|
|
notifier := notify.New(cfg)
|
|
|
|
// Session Watcher — detects when sessions finish their tasks.
|
|
sw := watcher.New(tmuxClient, s, cfg)
|
|
go sw.Run(ctx)
|
|
|
|
// Quota Monitor — polls panes for quota exhaustion signals.
|
|
qm := quota.New(tmuxClient, s, cfg)
|
|
go qm.Run(ctx)
|
|
|
|
// Account Switcher — orchestrates account failover on quota exhaustion.
|
|
as := switcher.New(tmuxClient, s, cfg, qm.SwitchChan(), notifier)
|
|
go as.Run(ctx)
|
|
|
|
// Dispatcher — assigns inbox tasks to idle sessions.
|
|
disp := dispatcher.New(tmuxClient, s, cfg, sw.DoneChan())
|
|
go disp.Run(ctx)
|
|
|
|
// Janitor — periodic cleanup of orphaned files and stale status.json.
|
|
jan := janitor.New(s, cfg.Dispatcher.ProjectsDir)
|
|
go jan.Run(ctx)
|
|
|
|
// State flush loop — persists state to disk every 10 seconds.
|
|
go func() {
|
|
ticker := time.NewTicker(10 * time.Second)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
s.Flush() //nolint:errcheck
|
|
}
|
|
}
|
|
}()
|
|
|
|
// Start HTTP API server.
|
|
listenAddr := cfg.MCPHTTP.Listen
|
|
if listenAddr == "" {
|
|
listenAddr = "127.0.0.1:9090"
|
|
}
|
|
srv := api.New(listenAddr, s)
|
|
go func() {
|
|
if err := srv.Start(); err != nil {
|
|
log.Printf("API server error: %v", err)
|
|
os.Exit(1)
|
|
}
|
|
}()
|
|
|
|
log.Printf("claude-failover v%s — all goroutines running", version)
|
|
|
|
<-ctx.Done()
|
|
log.Printf("shutdown signal received — flushing state and exiting")
|
|
if err := s.Flush(); err != nil {
|
|
log.Printf("state flush warning: %v", err)
|
|
}
|
|
}
|