2026-04-14 18:02:25 +00:00
|
|
|
// Package lifecycle provides the SessionLifecycleManager, which continuously
|
|
|
|
|
// monitors tmux sessions and recreates any that have died unexpectedly.
|
|
|
|
|
package lifecycle
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"log"
|
|
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
|
|
|
|
|
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
|
|
|
|
|
"forge.secuaas.ovh/olivier/claude-failover/internal/tmux"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// Manager reconciles the desired pool state (from config) against the actual
|
|
|
|
|
// tmux sessions, recreating any that have disappeared.
|
|
|
|
|
type Manager struct {
|
|
|
|
|
tmux tmux.Client
|
|
|
|
|
state *state.State
|
|
|
|
|
config *config.Config
|
|
|
|
|
logger *log.Logger
|
|
|
|
|
interval time.Duration
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// New creates a Manager with a default reconciliation interval of 15 seconds.
|
|
|
|
|
func New(tc tmux.Client, s *state.State, cfg *config.Config) *Manager {
|
|
|
|
|
return &Manager{
|
|
|
|
|
tmux: tc,
|
|
|
|
|
state: s,
|
|
|
|
|
config: cfg,
|
|
|
|
|
logger: log.Default(),
|
|
|
|
|
interval: 15 * time.Second,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Run starts the reconciliation loop, ticking every m.interval until ctx is cancelled.
|
|
|
|
|
func (m *Manager) Run(ctx context.Context) {
|
|
|
|
|
ticker := time.NewTicker(m.interval)
|
|
|
|
|
defer ticker.Stop()
|
|
|
|
|
for {
|
|
|
|
|
select {
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
return
|
|
|
|
|
case <-ticker.C:
|
|
|
|
|
m.reconcile()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// EnsureAllSessions creates all configured sessions that are not yet present in tmux.
|
|
|
|
|
// It is intended to be called once at daemon startup before Run is launched.
|
|
|
|
|
func (m *Manager) EnsureAllSessions() {
|
|
|
|
|
for _, ds := range m.config.Pool.Dedicated {
|
|
|
|
|
if !m.tmux.HasSession(ds.Name) {
|
|
|
|
|
if err := m.tmux.CreateSession(ds.Name, ds.Project); err != nil {
|
|
|
|
|
m.logger.Printf("[lifecycle] EnsureAllSessions: failed to create session %q: %v", ds.Name, err)
|
|
|
|
|
} else {
|
|
|
|
|
m.logger.Printf("[lifecycle] EnsureAllSessions: created session %q (workdir=%s)", ds.Name, ds.Project)
|
|
|
|
|
m.state.SetIdle(ds.Name)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
feat(pool): add start_index so manual and auto pools can coexist
Production had two disjoint tmux pools named alike but for different
purposes:
ccl-0..ccl-9 — manual/interactive sessions (operator)
ccl-auto-11..ccl-auto-20 — autonomous dispatcher pool
Until now the daemon's loops iterated prefix + 0..Max, so with the
deployed config ("prefix: ccl-auto", min=2, max=10) the dispatcher
looked for sessions "ccl-auto0..ccl-auto9" that never existed, while
the real auto pool ccl-auto-11..20 was invisible. Net effect: no task
was ever dispatched, and killAllPoolSessions fabricated phantom
"ccl-auto0/1" sessions on each swap.
- AutonomousConfig gains StartIndex (yaml start_index, default 0).
Behaviour is unchanged when StartIndex is 0.
- Monitor, switcher (kill + recreate), dispatcher (findFreeSession),
and lifecycle (EnsureAll + reconcile) all iterate
[StartIndex, StartIndex+Max) so the daemon only touches its own
range and leaves ccl-0..ccl-9 alone.
- Production config updated to prefix: "ccl-auto-", start_index: 11,
min: 10, max: 10 — covering the 10 real ccl-auto-11..20 sessions.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 20:39:57 +00:00
|
|
|
// Ensure autonomous pool sessions (prefix + index, starting at StartIndex).
|
2026-04-14 18:02:25 +00:00
|
|
|
prefix := m.config.Pool.Autonomous.Prefix
|
|
|
|
|
if prefix == "" {
|
|
|
|
|
prefix = "ccl-auto-"
|
|
|
|
|
}
|
feat(pool): add start_index so manual and auto pools can coexist
Production had two disjoint tmux pools named alike but for different
purposes:
ccl-0..ccl-9 — manual/interactive sessions (operator)
ccl-auto-11..ccl-auto-20 — autonomous dispatcher pool
Until now the daemon's loops iterated prefix + 0..Max, so with the
deployed config ("prefix: ccl-auto", min=2, max=10) the dispatcher
looked for sessions "ccl-auto0..ccl-auto9" that never existed, while
the real auto pool ccl-auto-11..20 was invisible. Net effect: no task
was ever dispatched, and killAllPoolSessions fabricated phantom
"ccl-auto0/1" sessions on each swap.
- AutonomousConfig gains StartIndex (yaml start_index, default 0).
Behaviour is unchanged when StartIndex is 0.
- Monitor, switcher (kill + recreate), dispatcher (findFreeSession),
and lifecycle (EnsureAll + reconcile) all iterate
[StartIndex, StartIndex+Max) so the daemon only touches its own
range and leaves ccl-0..ccl-9 alone.
- Production config updated to prefix: "ccl-auto-", start_index: 11,
min: 10, max: 10 — covering the 10 real ccl-auto-11..20 sessions.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 20:39:57 +00:00
|
|
|
start := m.config.Pool.Autonomous.StartIndex
|
|
|
|
|
for i := start; i < start+m.config.Pool.Autonomous.Min; i++ {
|
2026-04-14 18:02:25 +00:00
|
|
|
name := sessionName(prefix, i)
|
|
|
|
|
if !m.tmux.HasSession(name) {
|
|
|
|
|
if err := m.tmux.CreateSession(name, ""); err != nil {
|
|
|
|
|
m.logger.Printf("[lifecycle] EnsureAllSessions: failed to create autonomous session %q: %v", name, err)
|
|
|
|
|
} else {
|
|
|
|
|
m.logger.Printf("[lifecycle] EnsureAllSessions: created autonomous session %q", name)
|
|
|
|
|
m.state.SetIdle(name)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// reconcile checks every configured session and repairs missing ones.
|
|
|
|
|
func (m *Manager) reconcile() {
|
|
|
|
|
// Reconcile dedicated sessions.
|
|
|
|
|
for _, ds := range m.config.Pool.Dedicated {
|
|
|
|
|
m.reconcileSession(ds.Name, ds.Project)
|
|
|
|
|
}
|
|
|
|
|
|
feat(pool): add start_index so manual and auto pools can coexist
Production had two disjoint tmux pools named alike but for different
purposes:
ccl-0..ccl-9 — manual/interactive sessions (operator)
ccl-auto-11..ccl-auto-20 — autonomous dispatcher pool
Until now the daemon's loops iterated prefix + 0..Max, so with the
deployed config ("prefix: ccl-auto", min=2, max=10) the dispatcher
looked for sessions "ccl-auto0..ccl-auto9" that never existed, while
the real auto pool ccl-auto-11..20 was invisible. Net effect: no task
was ever dispatched, and killAllPoolSessions fabricated phantom
"ccl-auto0/1" sessions on each swap.
- AutonomousConfig gains StartIndex (yaml start_index, default 0).
Behaviour is unchanged when StartIndex is 0.
- Monitor, switcher (kill + recreate), dispatcher (findFreeSession),
and lifecycle (EnsureAll + reconcile) all iterate
[StartIndex, StartIndex+Max) so the daemon only touches its own
range and leaves ccl-0..ccl-9 alone.
- Production config updated to prefix: "ccl-auto-", start_index: 11,
min: 10, max: 10 — covering the 10 real ccl-auto-11..20 sessions.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 20:39:57 +00:00
|
|
|
// Reconcile the autonomous pool (min sessions, starting at StartIndex).
|
2026-04-14 18:02:25 +00:00
|
|
|
prefix := m.config.Pool.Autonomous.Prefix
|
|
|
|
|
if prefix == "" {
|
|
|
|
|
prefix = "ccl-auto-"
|
|
|
|
|
}
|
feat(pool): add start_index so manual and auto pools can coexist
Production had two disjoint tmux pools named alike but for different
purposes:
ccl-0..ccl-9 — manual/interactive sessions (operator)
ccl-auto-11..ccl-auto-20 — autonomous dispatcher pool
Until now the daemon's loops iterated prefix + 0..Max, so with the
deployed config ("prefix: ccl-auto", min=2, max=10) the dispatcher
looked for sessions "ccl-auto0..ccl-auto9" that never existed, while
the real auto pool ccl-auto-11..20 was invisible. Net effect: no task
was ever dispatched, and killAllPoolSessions fabricated phantom
"ccl-auto0/1" sessions on each swap.
- AutonomousConfig gains StartIndex (yaml start_index, default 0).
Behaviour is unchanged when StartIndex is 0.
- Monitor, switcher (kill + recreate), dispatcher (findFreeSession),
and lifecycle (EnsureAll + reconcile) all iterate
[StartIndex, StartIndex+Max) so the daemon only touches its own
range and leaves ccl-0..ccl-9 alone.
- Production config updated to prefix: "ccl-auto-", start_index: 11,
min: 10, max: 10 — covering the 10 real ccl-auto-11..20 sessions.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 20:39:57 +00:00
|
|
|
start := m.config.Pool.Autonomous.StartIndex
|
|
|
|
|
for i := start; i < start+m.config.Pool.Autonomous.Min; i++ {
|
2026-04-14 18:02:25 +00:00
|
|
|
name := sessionName(prefix, i)
|
|
|
|
|
m.reconcileSession(name, "")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// reconcileSession handles a single named session.
|
|
|
|
|
func (m *Manager) reconcileSession(name, workdir string) {
|
|
|
|
|
has := m.tmux.HasSession(name)
|
|
|
|
|
st := m.state.GetSession(name)
|
|
|
|
|
|
|
|
|
|
if has {
|
|
|
|
|
// Session exists — if it's supposed to be working, verify it still looks active.
|
|
|
|
|
if st != nil && st.State == "working" {
|
|
|
|
|
tail, err := m.tmux.CapturePaneTail(name, 5)
|
|
|
|
|
if err != nil {
|
|
|
|
|
m.logger.Printf("[lifecycle] reconcile: cannot capture pane for %q: %v", name, err)
|
|
|
|
|
}
|
|
|
|
|
// A session that has exited to the shell prompt after a Claude process crash
|
|
|
|
|
// will show a shell prompt. We just log a warning here; deeper heuristics can
|
|
|
|
|
// be added in future phases.
|
|
|
|
|
_ = tail
|
|
|
|
|
}
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Session is missing.
|
|
|
|
|
if st == nil || st.State == "idle" || st.State == "" {
|
|
|
|
|
m.logger.Printf("[lifecycle] RECREATED: session %q was absent (state=idle) — creating", name)
|
|
|
|
|
if err := m.tmux.CreateSession(name, workdir); err != nil {
|
|
|
|
|
m.logger.Printf("[lifecycle] reconcile: failed to recreate %q: %v", name, err)
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
m.state.SetIdle(name)
|
|
|
|
|
} else if st.State == "working" {
|
|
|
|
|
m.logger.Printf("[lifecycle] RECOVERED: session %q crashed while working (task=%v) — marking failed and recreating",
|
|
|
|
|
name, deref(st.Task))
|
|
|
|
|
m.state.SetFailed(name)
|
|
|
|
|
if err := m.tmux.CreateSession(name, workdir); err != nil {
|
|
|
|
|
m.logger.Printf("[lifecycle] reconcile: failed to recreate %q after recovery: %v", name, err)
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
m.state.SetIdle(name)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// sessionName builds a session name from a prefix and a zero-based index.
|
|
|
|
|
func sessionName(prefix string, i int) string {
|
|
|
|
|
return prefix + itoa(i)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// itoa converts an integer to its decimal string representation without
// importing strconv. Negative values are rendered with a leading '-'
// (they previously produced an empty string).
func itoa(n int) string {
	if n == 0 {
		return "0"
	}

	// Work on the magnitude as an unsigned value so that the most negative
	// int, whose negation does not fit in int, is handled too.
	negative := n < 0
	magnitude := uint(n)
	if negative {
		magnitude = -magnitude
	}

	// 20 bytes hold the 19 digits of a 64-bit magnitude plus the sign.
	var buf [20]byte
	pos := len(buf)
	// Fill from the end so no digit ever has to be shifted or reallocated.
	for magnitude > 0 {
		pos--
		buf[pos] = byte('0' + magnitude%10)
		magnitude /= 10
	}
	if negative {
		pos--
		buf[pos] = '-'
	}
	return string(buf[pos:])
}
|
|
|
|
|
|
|
|
|
|
// deref returns the string s points to, or the placeholder "<nil>" when s is
// a nil pointer, so it can be passed straight to a log call.
func deref(s *string) string {
	if s != nil {
		return *s
	}
	return "<nil>"
}
|