feat: SessionLifecycleManager — auto-detect and repair dead tmux sessions

- Add internal/lifecycle/manager.go with Manager struct, Run() ticker loop
  (15s interval), EnsureAllSessions() for boot-time session creation, and
  reconcile() that recreates idle sessions and recovers working ones via
  SetFailed + CreateSession
- Add state.SetFailed() to record crash timestamp on SessionState
- Add internal/lifecycle/manager_test.go with mock tmux client and 3 tests:
  TestReconcileCreatesDeadSession, TestReconcileRecoversCrashedSession,
  TestEnsureAllSessions — all pass
- Wire lifecycle.Manager into cmd/claude-failover/main.go after state init

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Ubuntu 2026-04-14 18:02:25 +00:00
parent 2d43580c18
commit 978b60ccf7
10 changed files with 810 additions and 32 deletions

View file

@ -1,10 +1,4 @@
// Package main is the entrypoint for the claude-failover daemon.
//
// Scope of this stub: load the YAML config from disk, log startup
// information, and block until a termination signal. The real runtime
// (dispatcher, quota-monitor, session-watcher, checkpoint, janitor,
// notifier, account-switcher goroutines) is not implemented yet — see
// docs/architecture.md.
package main
import (
@ -14,45 +8,68 @@ import (
"os"
"os/signal"
"syscall"
"forge.secuaas.ovh/olivier/claude-failover/internal/api"
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
"forge.secuaas.ovh/olivier/claude-failover/internal/lifecycle"
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
)
// Config mirrors config.example.yaml at a high level. We keep it loose
// here because this stub does not wire real YAML parsing yet; the full
// schema will live in internal/config once implementation starts.
type Config struct {
// Path is the location of the config file; loadConfig fills it with the
// path it was given once that path has been found on disk.
Path string
}
// loadConfig checks that something exists at path and, if so, hands back a
// Config that simply remembers that path. The file contents are not read or
// parsed here. When os.Stat fails, its error is returned unchanged together
// with a nil Config.
func loadConfig(path string) (*Config, error) {
	// TODO(claude-failover): parse YAML via gopkg.in/yaml.v3 and validate.
	_, statErr := os.Stat(path)
	if statErr != nil {
		return nil, statErr
	}

	loaded := &Config{Path: path}
	return loaded, nil
}
// version is the daemon's version string; main interpolates it into the
// startup log messages.
const version = "0.1.0"
// main is the daemon entrypoint. In the order visible below it: parses the
// -config flag, configures the logger, loads the configuration, loads the
// persisted state, builds the tmux client and lifecycle manager, starts the
// lifecycle loop and the HTTP API in background goroutines, then waits for
// SIGINT/SIGTERM and flushes state before returning.
//
// NOTE(review): several adjacent statements below are near-duplicates (two
// flag.StringVar calls for "config", two `cfg, err :=` lines, paired log
// lines). This looks like the removed and added sides of a diff rendered
// together; only one of each pair can be live in a compiling file — confirm
// against the real source before editing.
func main() {
var cfgPath string
// Register -config (default "config.yaml"). Registering the same flag name
// twice on the default FlagSet panics, so only one of these two lines can
// be present in the real file.
flag.StringVar(&cfgPath, "config", "config.yaml", "path to YAML config")
flag.StringVar(&cfgPath, "config", "config.yaml", "path to YAML config file")
flag.Parse()
// Timestamps carry microseconds and are reported in UTC.
log.SetFlags(log.LstdFlags | log.Lmicroseconds | log.LUTC)
log.Printf("claude-failover starting (config=%s)", cfgPath)
log.Printf("claude-failover v%s starting (config=%s)", version, cfgPath)
// Load the configuration; any load error is fatal (log.Fatalf exits the
// process). The loadConfig line is the local stub, the config.Load line the
// internal/config variant — see the NOTE(review) on main.
cfg, err := loadConfig(cfgPath)
cfg, err := config.Load(cfgPath)
if err != nil {
log.Fatalf("config load failed: %v", err)
}
log.Printf("config loaded: %s", cfg.Path)
log.Printf("config loaded: %d account(s), pool min=%d max=%d",
len(cfg.Accounts), cfg.Pool.Autonomous.Min, cfg.Pool.Autonomous.Max)
// TODO: spawn goroutines — dispatcher, quota-monitor, session-watcher,
// checkpoint, janitor, notifier, account-switcher.
// Initialise state — reload from disk if a snapshot exists.
// NOTE(review): the state path is built by string concatenation with "/";
// filepath.Join would be the portable form — confirm whether that matters
// for the platforms this daemon targets.
stateFile := cfg.Checkpoint.Dir + "/state.json"
s, err := state.LoadFromFile(stateFile)
if err != nil {
log.Fatalf("state init failed: %v", err)
}
log.Printf("state loaded (%d sessions tracked)", len(s.Sessions))
// Initialise tmux client and lifecycle manager.
tmuxClient := tmux.NewExecClient()
lm := lifecycle.New(tmuxClient, s, cfg)
// Called once, synchronously, before the signal context and the background
// loop are set up. Presumably this creates any missing sessions at boot —
// verify against internal/lifecycle.
lm.EnsureAllSessions()
// Block until SIGINT or SIGTERM.
// ctx is cancelled when one of these signals arrives; the deferred cancel
// releases the signal registration when main returns normally.
ctx, cancel := signal.NotifyContext(context.Background(),
syscall.SIGINT, syscall.SIGTERM)
defer cancel()
log.Printf("claude-failover ready (stub — no workers running)")
// Run the lifecycle manager in the background, bound to ctx.
// NOTE(review): nothing below waits for this goroutine to finish before
// main returns — confirm that Run tolerates being abandoned at shutdown.
go lm.Run(ctx)
// Start HTTP API server.
// Fall back to a loopback address when the config leaves the listen
// address empty.
listenAddr := cfg.MCPHTTP.Listen
if listenAddr == "" {
listenAddr = "127.0.0.1:9090"
}
srv := api.New(listenAddr, s)
go func() {
// NOTE(review): os.Exit terminates immediately without running deferred
// functions, so on this path neither the deferred cancel nor the
// s.Flush() at the end of main is executed. Presumably Start blocks
// while serving — confirm, and confirm that losing the final flush here
// is acceptable.
if err := srv.Start(); err != nil {
log.Printf("API server error: %v", err)
os.Exit(1)
}
}()
log.Printf("claude-failover v%s started, API on %s", version, listenAddr)
// Park the main goroutine until a termination signal cancels ctx.
<-ctx.Done()
log.Printf("shutdown signal received, exiting")
log.Printf("shutdown signal received — flushing state and exiting")
// A flush failure is only logged as a warning; main still returns normally.
if err := s.Flush(); err != nil {
log.Printf("state flush warning: %v", err)
}
}