feat: SessionLifecycleManager — auto-detect and repair dead tmux sessions
- Add internal/lifecycle/manager.go with Manager struct, Run() ticker loop (15s interval), EnsureAllSessions() for boot-time session creation, and reconcile(), which recreates idle sessions and recovers working ones via SetFailed + CreateSession
- Add state.SetFailed() to record crash timestamp on SessionState
- Add internal/lifecycle/manager_test.go with mock tmux client and 3 tests: TestReconcileCreatesDeadSession, TestReconcileRecoversCrashedSession, TestEnsureAllSessions — all pass
- Wire lifecycle.Manager into cmd/claude-failover/main.go after state init

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2d43580c18
commit
978b60ccf7
10 changed files with 810 additions and 32 deletions
|
|
@ -1,10 +1,4 @@
|
|||
// Package main is the entrypoint for the claude-failover daemon.
|
||||
//
|
||||
// Scope of this stub: load the YAML config from disk, log startup
|
||||
// information, and block until a termination signal. The real runtime
|
||||
// (dispatcher, quota-monitor, session-watcher, checkpoint, janitor,
|
||||
// notifier, account-switcher goroutines) is not implemented yet — see
|
||||
// docs/architecture.md.
|
||||
package main
|
||||
|
||||
import (
|
||||
|
|
@ -14,45 +8,68 @@ import (
|
|||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/api"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/lifecycle"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
|
||||
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
|
||||
)
|
||||
|
||||
// Config mirrors config.example.yaml at a high level. We keep it loose
|
||||
// here because this stub does not wire real YAML parsing yet; the full
|
||||
// schema will live in internal/config once implementation starts.
|
||||
type Config struct {
|
||||
Path string
|
||||
}
|
||||
|
||||
func loadConfig(path string) (*Config, error) {
|
||||
// TODO(claude-failover): parse YAML via gopkg.in/yaml.v3 and validate.
|
||||
if _, err := os.Stat(path); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &Config{Path: path}, nil
|
||||
}
|
||||
const version = "0.1.0"
|
||||
|
||||
func main() {
|
||||
var cfgPath string
|
||||
flag.StringVar(&cfgPath, "config", "config.yaml", "path to YAML config")
|
||||
flag.StringVar(&cfgPath, "config", "config.yaml", "path to YAML config file")
|
||||
flag.Parse()
|
||||
|
||||
log.SetFlags(log.LstdFlags | log.Lmicroseconds | log.LUTC)
|
||||
log.Printf("claude-failover starting (config=%s)", cfgPath)
|
||||
log.Printf("claude-failover v%s starting (config=%s)", version, cfgPath)
|
||||
|
||||
cfg, err := loadConfig(cfgPath)
|
||||
cfg, err := config.Load(cfgPath)
|
||||
if err != nil {
|
||||
log.Fatalf("config load failed: %v", err)
|
||||
}
|
||||
log.Printf("config loaded: %s", cfg.Path)
|
||||
log.Printf("config loaded: %d account(s), pool min=%d max=%d",
|
||||
len(cfg.Accounts), cfg.Pool.Autonomous.Min, cfg.Pool.Autonomous.Max)
|
||||
|
||||
// TODO: spawn goroutines — dispatcher, quota-monitor, session-watcher,
|
||||
// checkpoint, janitor, notifier, account-switcher.
|
||||
// Initialise state — reload from disk if a snapshot exists.
|
||||
stateFile := cfg.Checkpoint.Dir + "/state.json"
|
||||
s, err := state.LoadFromFile(stateFile)
|
||||
if err != nil {
|
||||
log.Fatalf("state init failed: %v", err)
|
||||
}
|
||||
log.Printf("state loaded (%d sessions tracked)", len(s.Sessions))
|
||||
|
||||
// Initialise tmux client and lifecycle manager.
|
||||
tmuxClient := tmux.NewExecClient()
|
||||
lm := lifecycle.New(tmuxClient, s, cfg)
|
||||
lm.EnsureAllSessions()
|
||||
|
||||
// Block until SIGINT or SIGTERM.
|
||||
ctx, cancel := signal.NotifyContext(context.Background(),
|
||||
syscall.SIGINT, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
log.Printf("claude-failover ready (stub — no workers running)")
|
||||
go lm.Run(ctx)
|
||||
|
||||
// Start HTTP API server.
|
||||
listenAddr := cfg.MCPHTTP.Listen
|
||||
if listenAddr == "" {
|
||||
listenAddr = "127.0.0.1:9090"
|
||||
}
|
||||
srv := api.New(listenAddr, s)
|
||||
go func() {
|
||||
if err := srv.Start(); err != nil {
|
||||
log.Printf("API server error: %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}()
|
||||
log.Printf("claude-failover v%s started, API on %s", version, listenAddr)
|
||||
|
||||
<-ctx.Done()
|
||||
log.Printf("shutdown signal received, exiting")
|
||||
log.Printf("shutdown signal received — flushing state and exiting")
|
||||
if err := s.Flush(); err != nil {
|
||||
log.Printf("state flush warning: %v", err)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue