feat: SessionLifecycleManager — auto-detect and repair dead tmux sessions
- Add internal/lifecycle/manager.go with Manager struct, Run() ticker loop (15s interval), EnsureAllSessions() for boot-time session creation, and reconcile(), which recreates idle sessions and recovers working ones via SetFailed + CreateSession.
- Add state.SetFailed() to record the crash timestamp on SessionState.
- Add internal/lifecycle/manager_test.go with a mock tmux client and 3 tests (TestReconcileCreatesDeadSession, TestReconcileRecoversCrashedSession, TestEnsureAllSessions); all pass.
- Wire lifecycle.Manager into cmd/claude-failover/main.go after state init.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2d43580c18
commit
978b60ccf7
10 changed files with 810 additions and 32 deletions
150
internal/state/state.go
Normal file
150
internal/state/state.go
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
// Package state manages the in-memory + on-disk representation of all
|
||||
// session and quota state for the claude-failover daemon.
|
||||
package state
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// SessionState is the per-session record kept for one tmux session. Every
// optional field is a pointer tagged omitempty, so it is left out of the
// JSON encoding while nil.
type SessionState struct {
	// State is the lifecycle label. The setters on State in this package
	// write "idle", "working" or "failed".
	State string `json:"state"`

	// Project is optional.
	// NOTE(review): nothing in this file assigns it — confirm which caller owns it.
	Project *string `json:"project,omitempty"`

	// AssignedAt is stamped by State.SetWorking and cleared by State.SetIdle.
	AssignedAt *time.Time `json:"assigned_at,omitempty"`

	// Task is the task ID recorded by State.SetWorking and cleared by
	// State.SetIdle.
	Task *string `json:"task,omitempty"`

	// LastFail is stamped by State.SetFailed.
	LastFail *time.Time `json:"last_fail,omitempty"`
}
|
||||
|
||||
// QuotaState records which account is currently active and whether
// dispatching is paused.
type QuotaState struct {
	// Paused reports whether dispatching is paused.
	Paused bool `json:"paused"`

	// ActiveAccount identifies the account currently in use.
	ActiveAccount string `json:"active_account"`

	// ResumeAt is optional and omitted from JSON while nil.
	// NOTE(review): presumably the time at which a pause ends — nothing in
	// this file reads or writes it, so confirm against the caller.
	ResumeAt *time.Time `json:"resume_at,omitempty"`
}
|
||||
|
||||
// State is the thread-safe runtime state persisted to a JSON file.
|
||||
type State struct {
|
||||
mu sync.RWMutex
|
||||
Sessions map[string]*SessionState `json:"sessions"`
|
||||
Quota QuotaState `json:"quota"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
filePath string
|
||||
}
|
||||
|
||||
// New creates an empty State that will be flushed to filePath.
|
||||
func New(filePath string) *State {
|
||||
return &State{
|
||||
Sessions: make(map[string]*SessionState),
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
filePath: filePath,
|
||||
}
|
||||
}
|
||||
|
||||
// LoadFromFile reads an existing state JSON file. Returns a new empty
|
||||
// State if the file does not exist.
|
||||
func LoadFromFile(path string) (*State, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if os.IsNotExist(err) {
|
||||
return New(path), nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading state file %s: %w", path, err)
|
||||
}
|
||||
|
||||
s := New(path)
|
||||
if err := json.Unmarshal(data, s); err != nil {
|
||||
return nil, fmt.Errorf("parsing state file %s: %w", path, err)
|
||||
}
|
||||
s.filePath = path
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Flush serialises the state to disk atomically (write to tmp then rename).
|
||||
func (s *State) Flush() error {
|
||||
if s.filePath == "" {
|
||||
return nil
|
||||
}
|
||||
s.mu.RLock()
|
||||
data, err := json.MarshalIndent(s, "", " ")
|
||||
s.mu.RUnlock()
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshalling state: %w", err)
|
||||
}
|
||||
|
||||
tmp := s.filePath + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0600); err != nil {
|
||||
return fmt.Errorf("writing state tmp: %w", err)
|
||||
}
|
||||
return os.Rename(tmp, s.filePath)
|
||||
}
|
||||
|
||||
// JSON returns the current state as a JSON byte slice (for HTTP /status).
|
||||
func (s *State) JSON() []byte {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
data, _ := json.Marshal(s)
|
||||
return data
|
||||
}
|
||||
|
||||
// GetSession returns the state for the named session, or nil.
|
||||
func (s *State) GetSession(name string) *SessionState {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
return s.Sessions[name]
|
||||
}
|
||||
|
||||
// touch updates the UpdatedAt timestamp. Must be called with write lock held.
|
||||
func (s *State) touch() {
|
||||
s.UpdatedAt = time.Now().UTC()
|
||||
}
|
||||
|
||||
// SetIdle marks the named session as idle and clears task metadata.
|
||||
func (s *State) SetIdle(name string) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
sess, ok := s.Sessions[name]
|
||||
if !ok {
|
||||
sess = &SessionState{}
|
||||
s.Sessions[name] = sess
|
||||
}
|
||||
sess.State = "idle"
|
||||
sess.Task = nil
|
||||
sess.AssignedAt = nil
|
||||
s.touch()
|
||||
}
|
||||
|
||||
// SetWorking marks the named session as busy with the given task ID.
|
||||
func (s *State) SetWorking(name, task string) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
now := time.Now().UTC()
|
||||
sess, ok := s.Sessions[name]
|
||||
if !ok {
|
||||
sess = &SessionState{}
|
||||
s.Sessions[name] = sess
|
||||
}
|
||||
sess.State = "working"
|
||||
sess.Task = &task
|
||||
sess.AssignedAt = &now
|
||||
s.touch()
|
||||
}
|
||||
|
||||
// SetFailed marks the named session as failed and records the failure timestamp.
|
||||
// The task is preserved for potential requeue by the caller.
|
||||
func (s *State) SetFailed(name string) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
now := time.Now().UTC()
|
||||
sess, ok := s.Sessions[name]
|
||||
if !ok {
|
||||
sess = &SessionState{}
|
||||
s.Sessions[name] = sess
|
||||
}
|
||||
sess.State = "failed"
|
||||
sess.LastFail = &now
|
||||
s.touch()
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue