feat: SessionLifecycleManager — auto-detect and repair dead tmux sessions
- Add internal/lifecycle/manager.go with Manager struct, Run() ticker loop (15s interval), EnsureAllSessions() for boot-time session creation, and reconcile() that recreates idle sessions and recovers working ones via SetFailed + CreateSession - Add state.SetFailed() to record crash timestamp on SessionState - Add internal/lifecycle/manager_test.go with mock tmux client and 3 tests: TestReconcileCreatesDeadSession, TestReconcileRecoversCrashedSession, TestEnsureAllSessions — all pass - Wire lifecycle.Manager into cmd/claude-failover/main.go after state init Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2d43580c18
commit
978b60ccf7
10 changed files with 810 additions and 32 deletions
150
internal/lifecycle/manager_test.go
Normal file
150
internal/lifecycle/manager_test.go
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
package lifecycle
|
||||
|
||||
import (
	"path/filepath"
	"testing"

	"forge.secuaas.ovh/olivier/claude-failover/internal/config"
	"forge.secuaas.ovh/olivier/claude-failover/internal/state"
)
|
||||
|
||||
// mockTmux is a minimal in-memory implementation of tmux.Client for tests.
// It keeps a table of live sessions and records the session name passed to
// every mutating call so tests can assert on what the code under test did.
type mockTmux struct {
	// sessions maps a session name to whether it is currently alive.
	// A missing key and an explicit false are both treated as "not alive".
	sessions map[string]bool

	// Session names received by CreateSession, KillSession and SendKeys
	// respectively, in call order.
	createCalls, killCalls, sendKeysCalls []string
}
|
||||
|
||||
func newMockTmux() *mockTmux {
|
||||
return &mockTmux{sessions: make(map[string]bool)}
|
||||
}
|
||||
|
||||
func (m *mockTmux) HasSession(name string) bool {
|
||||
return m.sessions[name]
|
||||
}
|
||||
|
||||
func (m *mockTmux) CreateSession(name, workdir string) error {
|
||||
m.sessions[name] = true
|
||||
m.createCalls = append(m.createCalls, name)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockTmux) KillSession(name string) error {
|
||||
delete(m.sessions, name)
|
||||
m.killCalls = append(m.killCalls, name)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockTmux) SendKeys(session, keys string) error {
|
||||
m.sendKeysCalls = append(m.sendKeysCalls, session)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockTmux) CapturePaneTail(session string, lines int) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
// minimalConfig returns a config with one dedicated session and no autonomous pool.
|
||||
func minimalConfig(sessionName, project string) *config.Config {
|
||||
return &config.Config{
|
||||
Pool: config.PoolConfig{
|
||||
Dedicated: []config.DedicatedSession{
|
||||
{Name: sessionName, Project: project},
|
||||
},
|
||||
Autonomous: config.AutonomousConfig{
|
||||
Prefix: "ccl-auto-",
|
||||
Min: 0,
|
||||
Max: 0,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileCreatesDeadSession verifies that when a session is absent and
|
||||
// its state is idle, reconcile recreates it.
|
||||
func TestReconcileCreatesDeadSession(t *testing.T) {
|
||||
tc := newMockTmux()
|
||||
// Session does NOT exist in tmux.
|
||||
tc.sessions["my-session"] = false
|
||||
|
||||
s := state.New("/tmp/test-state-idle.json")
|
||||
s.SetIdle("my-session")
|
||||
|
||||
cfg := minimalConfig("my-session", "/tmp/project")
|
||||
m := New(tc, s, cfg)
|
||||
m.reconcile()
|
||||
|
||||
if len(tc.createCalls) != 1 || tc.createCalls[0] != "my-session" {
|
||||
t.Errorf("expected CreateSession(my-session) to be called once; createCalls=%v", tc.createCalls)
|
||||
}
|
||||
if got := s.GetSession("my-session"); got == nil || got.State != "idle" {
|
||||
t.Errorf("expected session state idle after recreate, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestReconcileRecoversCrashedSession verifies that a missing session whose
|
||||
// state is "working" gets marked failed before being recreated.
|
||||
func TestReconcileRecoversCrashedSession(t *testing.T) {
|
||||
tc := newMockTmux()
|
||||
// Session does NOT exist in tmux but was working.
|
||||
tc.sessions["worker"] = false
|
||||
|
||||
s := state.New("/tmp/test-state-working.json")
|
||||
s.SetWorking("worker", "task-abc")
|
||||
|
||||
cfg := minimalConfig("worker", "")
|
||||
m := New(tc, s, cfg)
|
||||
m.reconcile()
|
||||
|
||||
if len(tc.createCalls) != 1 || tc.createCalls[0] != "worker" {
|
||||
t.Errorf("expected CreateSession(worker) once after recovery; createCalls=%v", tc.createCalls)
|
||||
}
|
||||
// State should transition: working -> failed -> idle (SetIdle called after recreate).
|
||||
got := s.GetSession("worker")
|
||||
if got == nil {
|
||||
t.Fatal("session state is nil after recovery")
|
||||
}
|
||||
if got.State != "idle" {
|
||||
t.Errorf("expected session state idle after recovery, got %q", got.State)
|
||||
}
|
||||
if got.LastFail == nil {
|
||||
t.Error("expected LastFail to be set after SetFailed was called")
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnsureAllSessions verifies that EnsureAllSessions creates all sessions
|
||||
// that are missing from tmux.
|
||||
func TestEnsureAllSessions(t *testing.T) {
|
||||
tc := newMockTmux()
|
||||
// None of the sessions exist yet.
|
||||
|
||||
s := state.New("/tmp/test-state-ensure.json")
|
||||
cfg := &config.Config{
|
||||
Pool: config.PoolConfig{
|
||||
Dedicated: []config.DedicatedSession{
|
||||
{Name: "sess-a", Project: "/tmp/a"},
|
||||
{Name: "sess-b", Project: "/tmp/b"},
|
||||
},
|
||||
Autonomous: config.AutonomousConfig{
|
||||
Prefix: "auto-",
|
||||
Min: 2,
|
||||
Max: 5,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
m := New(tc, s, cfg)
|
||||
m.EnsureAllSessions()
|
||||
|
||||
// Expect: sess-a, sess-b, auto-0, auto-1 = 4 sessions created.
|
||||
if len(tc.createCalls) != 4 {
|
||||
t.Errorf("expected 4 CreateSession calls, got %d: %v", len(tc.createCalls), tc.createCalls)
|
||||
}
|
||||
|
||||
want := map[string]bool{"sess-a": true, "sess-b": true, "auto-0": true, "auto-1": true}
|
||||
for _, name := range tc.createCalls {
|
||||
if !want[name] {
|
||||
t.Errorf("unexpected session created: %q", name)
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue