feat: SessionLifecycleManager — auto-detect and repair dead tmux sessions

- Add internal/lifecycle/manager.go with Manager struct, Run() ticker loop
  (15s interval), EnsureAllSessions() for boot-time session creation, and
  reconcile() that recreates idle sessions and recovers working ones via
  SetFailed + CreateSession
- Add state.SetFailed() to record crash timestamp on SessionState
- Add internal/lifecycle/manager_test.go with mock tmux client and 3 tests:
  TestReconcileCreatesDeadSession, TestReconcileRecoversCrashedSession,
  TestEnsureAllSessions — all pass
- Wire lifecycle.Manager into cmd/claude-failover/main.go after state init

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Ubuntu 2026-04-14 18:02:25 +00:00
parent 2d43580c18
commit 978b60ccf7
10 changed files with 810 additions and 32 deletions

View file

@ -0,0 +1,164 @@
// Package lifecycle provides the SessionLifecycleManager, which continuously
// monitors tmux sessions and recreates any that have died unexpectedly.
package lifecycle
import (
"context"
"log"
"time"
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
)
// Manager reconciles the desired pool state (from config) against the actual
// tmux sessions, recreating any that have disappeared.
//
// A Manager is built with New, primed once with EnsureAllSessions, and then
// driven periodically by Run.
type Manager struct {
// tmux is the client used to probe, create, and inspect tmux sessions.
tmux tmux.Client
// state is the store of recorded per-session status that reconciliation
// reads (GetSession) and updates (SetIdle, SetFailed).
state *state.State
// config supplies the dedicated sessions and the autonomous pool settings.
config *config.Config
// logger receives every lifecycle log line; New sets it to log.Default().
logger *log.Logger
// interval is the delay between two reconciliation passes in Run.
interval time.Duration
}
// New builds a Manager around the given tmux client, state store, and
// configuration. The returned manager logs through the standard library's
// default logger and reconciles every 15 seconds.
func New(tc tmux.Client, s *state.State, cfg *config.Config) *Manager {
	mgr := new(Manager)
	mgr.tmux = tc
	mgr.state = s
	mgr.config = cfg
	mgr.logger = log.Default()
	mgr.interval = 15 * time.Second
	return mgr
}
// Run drives the reconciliation loop: one reconcile pass per tick of
// m.interval. It blocks until ctx is cancelled, at which point the ticker is
// stopped and Run returns.
func (m *Manager) Run(ctx context.Context) {
	tick := time.NewTicker(m.interval)
	defer tick.Stop()

	done := ctx.Done()
	for {
		select {
		case <-tick.C:
			m.reconcile()
		case <-done:
			return
		}
	}
}
// EnsureAllSessions walks the configured pool and creates every session that
// tmux does not already have: first each dedicated session (in its project
// directory), then the first Autonomous.Min autonomous sessions, named
// prefix+index. Each freshly created session is recorded as idle; creation
// failures are logged and skipped.
//
// It is meant to be called once at daemon startup, before Run is launched.
func (m *Manager) EnsureAllSessions() {
	for _, ded := range m.config.Pool.Dedicated {
		if m.tmux.HasSession(ded.Name) {
			continue
		}
		err := m.tmux.CreateSession(ded.Name, ded.Project)
		if err != nil {
			m.logger.Printf("[lifecycle] EnsureAllSessions: failed to create session %q: %v", ded.Name, err)
			continue
		}
		m.logger.Printf("[lifecycle] EnsureAllSessions: created session %q (workdir=%s)", ded.Name, ded.Project)
		m.state.SetIdle(ded.Name)
	}

	// Autonomous pool: sessions are named prefix + zero-based index, with a
	// built-in prefix when the config leaves it empty.
	autoPrefix := m.config.Pool.Autonomous.Prefix
	if autoPrefix == "" {
		autoPrefix = "ccl-auto-"
	}
	for idx := 0; idx < m.config.Pool.Autonomous.Min; idx++ {
		autoName := sessionName(autoPrefix, idx)
		if m.tmux.HasSession(autoName) {
			continue
		}
		err := m.tmux.CreateSession(autoName, "")
		if err != nil {
			m.logger.Printf("[lifecycle] EnsureAllSessions: failed to create autonomous session %q: %v", autoName, err)
			continue
		}
		m.logger.Printf("[lifecycle] EnsureAllSessions: created autonomous session %q", autoName)
		m.state.SetIdle(autoName)
	}
}
// reconcile runs one repair pass over every configured session: all dedicated
// sessions (with their project directory as workdir), followed by the first
// Autonomous.Min sessions of the autonomous pool (with an empty workdir).
func (m *Manager) reconcile() {
	for _, ded := range m.config.Pool.Dedicated {
		m.reconcileSession(ded.Name, ded.Project)
	}

	// Autonomous sessions are named prefix + zero-based index; an empty
	// configured prefix falls back to the built-in one.
	autoPrefix := m.config.Pool.Autonomous.Prefix
	if autoPrefix == "" {
		autoPrefix = "ccl-auto-"
	}
	for idx := 0; idx < m.config.Pool.Autonomous.Min; idx++ {
		m.reconcileSession(sessionName(autoPrefix, idx), "")
	}
}
// reconcileSession compares one configured session against tmux and repairs
// it when the tmux session is missing.
//
// name is the tmux session name; workdir is handed to CreateSession when the
// session has to be recreated.
//
// Cases:
//   - Session present: nothing is repaired. If it is recorded as "working",
//     the pane tail is captured so that capture failures get logged; the
//     captured text itself is not analysed yet.
//   - Session missing and recorded as "working": the crash is recorded via
//     SetFailed, then the session is recreated and marked idle.
//   - Session missing with any other recorded state, or no record at all:
//     the session is recreated and marked idle.
//
// When CreateSession fails the error is logged and the function returns; the
// next reconcile pass tries again.
func (m *Manager) reconcileSession(name, workdir string) {
	has := m.tmux.HasSession(name)
	st := m.state.GetSession(name)

	if has {
		if st != nil && st.State == "working" {
			// A session whose Claude process crashed would show a shell
			// prompt here. Detecting that is left to a future phase; for now
			// only a failure to capture the pane is reported.
			if _, err := m.tmux.CapturePaneTail(name, 5); err != nil {
				m.logger.Printf("[lifecycle] reconcile: cannot capture pane for %q: %v", name, err)
			}
		}
		return
	}

	// From here on the tmux session is missing.
	if st != nil && st.State == "working" {
		m.logger.Printf("[lifecycle] RECOVERED: session %q crashed while working (task=%v) — marking failed and recreating",
			name, deref(st.Task))
		m.state.SetFailed(name)
		if err := m.tmux.CreateSession(name, workdir); err != nil {
			m.logger.Printf("[lifecycle] reconcile: failed to recreate %q after recovery: %v", name, err)
			return
		}
		m.state.SetIdle(name)
		return
	}

	// Every non-working state is repaired, not just idle/unknown. Otherwise a
	// session whose recreation failed right after SetFailed above would sit in
	// whatever state SetFailed leaves behind (presumably "failed" — confirm
	// against the state package) and never be retried.
	recorded := "idle"
	if st != nil && st.State != "" {
		recorded = string(st.State)
	}
	m.logger.Printf("[lifecycle] RECREATED: session %q was absent (state=%s) — creating", name, recorded)
	if err := m.tmux.CreateSession(name, workdir); err != nil {
		m.logger.Printf("[lifecycle] reconcile: failed to recreate %q: %v", name, err)
		return
	}
	m.state.SetIdle(name)
}
// sessionName returns the name of the i-th (zero-based) session of a pool:
// the prefix immediately followed by the decimal index.
func sessionName(prefix string, i int) string {
	suffix := itoa(i)
	return prefix + suffix
}
// itoa returns the decimal string representation of n without importing
// strconv.
//
// Negative values are rendered with a leading '-'. Digits are written
// right-to-left into a fixed-size buffer, so the conversion performs a single
// allocation (the resulting string).
func itoa(n int) string {
	if n == 0 {
		return "0"
	}

	negative := n < 0
	// Work on the magnitude as an unsigned value: negating the most negative
	// int overflows in signed arithmetic, but wraps to the right magnitude
	// when done on the unsigned bit pattern.
	mag := uint(n)
	if negative {
		mag = -mag
	}

	// 20 bytes hold the sign plus the 19 digits of the largest 64-bit magnitude.
	var buf [20]byte
	pos := len(buf)
	for mag > 0 {
		pos--
		buf[pos] = byte('0' + mag%10)
		mag /= 10
	}
	if negative {
		pos--
		buf[pos] = '-'
	}
	return string(buf[pos:])
}
// deref returns the string s points to, or the placeholder "<nil>" when s is
// a nil pointer, so optional strings can be logged without a nil check.
func deref(s *string) string {
	if s != nil {
		return *s
	}
	return "<nil>"
}

View file

@ -0,0 +1,150 @@
package lifecycle
import (
"testing"
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
)
// mockTmux is a minimal in-memory implementation of tmux.Client for tests.
// It tracks which sessions "exist" and records the session name of every
// mutating call so tests can assert on what the code under test did.
type mockTmux struct {
// sessions maps a session name to whether it currently exists.
sessions map[string]bool
// createCalls lists the names passed to CreateSession, in call order.
createCalls []string
// killCalls lists the names passed to KillSession, in call order.
killCalls []string
// sendKeysCalls lists the session names passed to SendKeys, in call order.
sendKeysCalls []string
}
// newMockTmux returns a mock with an initialised, empty session table and no
// recorded calls.
func newMockTmux() *mockTmux {
	mock := &mockTmux{}
	mock.sessions = map[string]bool{}
	return mock
}
// HasSession reports whether the named session is marked as existing.
func (m *mockTmux) HasSession(name string) bool {
// A missing key yields false, so unknown sessions count as absent.
return m.sessions[name]
}
// CreateSession records the call and marks the named session as existing.
// The workdir argument is ignored and the call always succeeds.
func (m *mockTmux) CreateSession(name, workdir string) error {
	m.createCalls = append(m.createCalls, name)
	m.sessions[name] = true
	return nil
}
// KillSession records the call and removes the named session from the
// session table. It always succeeds, even for unknown sessions.
func (m *mockTmux) KillSession(name string) error {
	m.killCalls = append(m.killCalls, name)
	delete(m.sessions, name)
	return nil
}
// SendKeys records which session keys were sent to and always succeeds.
// The keys themselves are not recorded.
func (m *mockTmux) SendKeys(session, keys string) error {
m.sendKeysCalls = append(m.sendKeysCalls, session)
return nil
}
// CapturePaneTail is a stub: it ignores its arguments and returns an empty
// pane tail with no error.
func (m *mockTmux) CapturePaneTail(session string, lines int) (string, error) {
return "", nil
}
// minimalConfig builds a configuration containing exactly one dedicated
// session (the given name and project directory) and an autonomous pool that
// is sized to zero, so only the dedicated session is ever reconciled.
func minimalConfig(sessionName, project string) *config.Config {
	cfg := &config.Config{}
	cfg.Pool.Dedicated = []config.DedicatedSession{
		{Name: sessionName, Project: project},
	}
	cfg.Pool.Autonomous = config.AutonomousConfig{
		Prefix: "ccl-auto-",
		Min:    0,
		Max:    0,
	}
	return cfg
}
// TestReconcileCreatesDeadSession verifies that when a session is absent from
// tmux and its recorded state is idle, reconcile recreates it and leaves it
// recorded as idle.
func TestReconcileCreatesDeadSession(t *testing.T) {
	tc := newMockTmux()
	// Session does NOT exist in tmux.
	tc.sessions["my-session"] = false

	// Use a per-test directory instead of a fixed /tmp path so that runs
	// cannot see or leave behind each other's state file.
	s := state.New(t.TempDir() + "/state.json")
	s.SetIdle("my-session")

	cfg := minimalConfig("my-session", "/tmp/project")
	m := New(tc, s, cfg)
	m.reconcile()

	if len(tc.createCalls) != 1 || tc.createCalls[0] != "my-session" {
		t.Errorf("expected CreateSession(my-session) to be called once; createCalls=%v", tc.createCalls)
	}
	if got := s.GetSession("my-session"); got == nil || got.State != "idle" {
		t.Errorf("expected session state idle after recreate, got %v", got)
	}
}
// TestReconcileRecoversCrashedSession verifies that a session missing from
// tmux while recorded as "working" is marked failed (LastFail set) and then
// recreated, ending up recorded as idle.
func TestReconcileRecoversCrashedSession(t *testing.T) {
	tc := newMockTmux()
	// Session does NOT exist in tmux but was working.
	tc.sessions["worker"] = false

	// Use a per-test directory instead of a fixed /tmp path so that runs
	// cannot see or leave behind each other's state file.
	s := state.New(t.TempDir() + "/state.json")
	s.SetWorking("worker", "task-abc")

	cfg := minimalConfig("worker", "")
	m := New(tc, s, cfg)
	m.reconcile()

	if len(tc.createCalls) != 1 || tc.createCalls[0] != "worker" {
		t.Errorf("expected CreateSession(worker) once after recovery; createCalls=%v", tc.createCalls)
	}

	// State should transition: working -> failed -> idle (SetIdle called after recreate).
	got := s.GetSession("worker")
	if got == nil {
		t.Fatal("session state is nil after recovery")
	}
	if got.State != "idle" {
		t.Errorf("expected session state idle after recovery, got %q", got.State)
	}
	if got.LastFail == nil {
		t.Error("expected LastFail to be set after SetFailed was called")
	}
}
// TestEnsureAllSessions verifies that EnsureAllSessions creates every
// configured session that is missing from tmux — each dedicated session and
// the first Autonomous.Min autonomous sessions — exactly once, and records
// each of them as idle.
func TestEnsureAllSessions(t *testing.T) {
	tc := newMockTmux()
	// None of the sessions exist yet.

	// Use a per-test directory instead of a fixed /tmp path so that runs
	// cannot see or leave behind each other's state file.
	s := state.New(t.TempDir() + "/state.json")
	cfg := &config.Config{
		Pool: config.PoolConfig{
			Dedicated: []config.DedicatedSession{
				{Name: "sess-a", Project: "/tmp/a"},
				{Name: "sess-b", Project: "/tmp/b"},
			},
			Autonomous: config.AutonomousConfig{
				Prefix: "auto-",
				Min:    2,
				Max:    5,
			},
		},
	}

	m := New(tc, s, cfg)
	m.EnsureAllSessions()

	// Expect: sess-a, sess-b, auto-0, auto-1 = 4 sessions created.
	want := []string{"sess-a", "sess-b", "auto-0", "auto-1"}
	if len(tc.createCalls) != len(want) {
		t.Errorf("expected 4 CreateSession calls, got %d: %v", len(tc.createCalls), tc.createCalls)
	}

	// Count per name: a bare length check would also pass if one session
	// were created several times while another was never created.
	created := make(map[string]int, len(tc.createCalls))
	for _, name := range tc.createCalls {
		created[name]++
	}
	for _, name := range want {
		if created[name] != 1 {
			t.Errorf("expected session %q to be created exactly once, got %d", name, created[name])
		}
		delete(created, name)

		if got := s.GetSession(name); got == nil || got.State != "idle" {
			t.Errorf("expected session %q to be recorded idle after creation, got %v", name, got)
		}
	}
	for name := range created {
		t.Errorf("unexpected session created: %q", name)
	}
}