claude-failover/cmd/claude-failover/main.go
Ubuntu e16e3526a0 feat(lifecycle): validate shared symlinks at daemon startup (A2)
Wire symlinks.ValidateAll into the lifecycle manager so the daemon
refuses to start if any configured account is missing one of the
shared-state symlinks or if a link diverges from the canonical target.

Previously, a missing link on a freshly deployed VM would silently
create a divergent state tree per account (duplicate JSONL transcripts,
broken undo history) — exactly the failure mode the symlinks package
(A1) was introduced to prevent.

The check runs once at startup before EnsureAllSessions, guarding a
single well-defined invariant: "every account home shares the same
projects/, file-history/ and session-env/ roots". No auto-heal on
divergence — we fail fast with an explicit error so the operator fixes
it manually rather than one account's state being overwritten.

Part of Phase 1 Chantier A — Failover robuste.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-16 19:03:43 +00:00

129 lines
3.8 KiB
Go

// Package main is the entrypoint for the claude-failover daemon.
package main
import (
	"context"
	"flag"
	"log"
	"os"
	"os/signal"
	"path/filepath"
	"syscall"
	"time"

	"forge.secuaas.ovh/olivier/claude-failover/internal/api"
	"forge.secuaas.ovh/olivier/claude-failover/internal/config"
	"forge.secuaas.ovh/olivier/claude-failover/internal/dispatcher"
	"forge.secuaas.ovh/olivier/claude-failover/internal/janitor"
	"forge.secuaas.ovh/olivier/claude-failover/internal/lifecycle"
	"forge.secuaas.ovh/olivier/claude-failover/internal/notify"
	"forge.secuaas.ovh/olivier/claude-failover/internal/quota"
	"forge.secuaas.ovh/olivier/claude-failover/internal/state"
	"forge.secuaas.ovh/olivier/claude-failover/internal/switcher"
	"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
	"forge.secuaas.ovh/olivier/claude-failover/internal/watcher"
)
// version is the daemon's version string; main prints it in the startup and
// "all goroutines running" log lines.
const version = "0.1.0"
// main is the daemon entrypoint.
//
// It parses the -config flag, loads the YAML config and the persisted state
// snapshot, validates the shared symlinks, ensures all sessions exist, and
// then starts the background components (lifecycle manager, session watcher,
// quota monitor, account switcher, dispatcher, janitor, periodic state flush
// and HTTP API server).
//
// It then waits for SIGINT/SIGTERM or for the API server to fail. In both
// cases state is flushed one last time before the process ends; an API server
// failure additionally makes the process exit with status 1.
func main() {
	// Address the HTTP API listens on when the config leaves it empty.
	const defaultListenAddr = "127.0.0.1:9090"
	// Interval between periodic state flushes.
	const flushInterval = 10 * time.Second

	var cfgPath string
	flag.StringVar(&cfgPath, "config", "config.yaml", "path to YAML config file")
	flag.Parse()

	log.SetFlags(log.LstdFlags | log.Lmicroseconds | log.LUTC)
	log.Printf("claude-failover v%s starting (config=%s)", version, cfgPath)

	cfg, err := config.Load(cfgPath)
	if err != nil {
		log.Fatalf("config load failed: %v", err)
	}
	log.Printf("config loaded: %d account(s), pool min=%d max=%d",
		len(cfg.Accounts), cfg.Pool.Autonomous.Min, cfg.Pool.Autonomous.Max)

	// Initialise state — reload from disk if a snapshot exists.
	// filepath.Join (rather than string concatenation) keeps an empty
	// checkpoint dir from resolving to "/state.json" at the filesystem root.
	stateFile := filepath.Join(cfg.Checkpoint.Dir, "state.json")
	s, err := state.LoadFromFile(stateFile)
	if err != nil {
		log.Fatalf("state init failed: %v", err)
	}
	log.Printf("state loaded (%d sessions tracked)", len(s.Sessions))

	// Initialise tmux client and lifecycle manager.
	tmuxClient := tmux.NewExecClient()
	lm := lifecycle.New(tmuxClient, s, cfg)

	// Validate the shared-state symlinks BEFORE spawning any sessions. A
	// divergent link would silently fork transcripts between accounts and
	// make failover destructive, so we fail fast here rather than after work
	// is in flight.
	if err := lm.ValidateSharedSymlinks(); err != nil {
		log.Fatalf("shared symlinks validation failed: %v", err)
	}
	lm.EnsureAllSessions()

	// ctx is cancelled on SIGINT or SIGTERM; every background goroutine
	// below uses it as its stop signal.
	ctx, cancel := signal.NotifyContext(context.Background(),
		syscall.SIGINT, syscall.SIGTERM)
	defer cancel()

	go lm.Run(ctx)

	// Notifier — reads credentials from environment variables.
	notifier := notify.New(cfg)

	// Session Watcher — detects when sessions finish their tasks.
	sw := watcher.New(tmuxClient, s, cfg)
	go sw.Run(ctx)

	// Quota Monitor — polls panes for quota exhaustion signals.
	qm := quota.New(tmuxClient, s, cfg)
	go qm.Run(ctx)

	// Account Switcher — orchestrates account failover on quota exhaustion.
	as := switcher.New(tmuxClient, s, cfg, qm.SwitchChan(), notifier)
	go as.Run(ctx)

	// Dispatcher — assigns inbox tasks to idle sessions.
	disp := dispatcher.New(tmuxClient, s, cfg, sw.DoneChan())
	go disp.Run(ctx)

	// Janitor — periodic cleanup of orphaned files and stale status.json.
	jan := janitor.New(s, cfg.Dispatcher.ProjectsDir)
	go jan.Run(ctx)

	// State flush loop — persists state to disk every 10 seconds.
	go func() {
		ticker := time.NewTicker(flushInterval)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				// Best-effort: a failed periodic flush must not stop the
				// daemon, but it is logged so a persistently failing
				// snapshot does not go unnoticed.
				if err := s.Flush(); err != nil {
					log.Printf("state flush warning: %v", err)
				}
			}
		}
	}()

	// Start HTTP API server.
	listenAddr := cfg.MCPHTTP.Listen
	if listenAddr == "" {
		listenAddr = defaultListenAddr
	}
	srv := api.New(listenAddr, s)

	// The server error is handed back to the main goroutine instead of
	// calling os.Exit from the server goroutine, so the final state flush
	// still runs. The channel is buffered so the goroutine never blocks if
	// main has already moved on to shutdown.
	apiErr := make(chan error, 1)
	go func() {
		if err := srv.Start(); err != nil {
			apiErr <- err
		}
	}()

	log.Printf("claude-failover v%s — all goroutines running", version)

	exitCode := 0
	select {
	case <-ctx.Done():
		log.Printf("shutdown signal received — flushing state and exiting")
	case err := <-apiErr:
		log.Printf("API server error: %v", err)
		exitCode = 1
		// os.Exit below skips deferred calls, so signal the background
		// goroutines to stop explicitly.
		cancel()
	}

	if err := s.Flush(); err != nil {
		log.Printf("state flush warning: %v", err)
	}
	if exitCode != 0 {
		os.Exit(exitCode)
	}
}