commit cf4957010f5631f125fbba92085222c3f2e696f5 Author: Olivier Date: Tue Apr 14 13:29:24 2026 +0000 feat: initial project structure diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d3dd611 --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# Binaries +bin/ +build/ +dist/ +*.exe +*.test +*.out + +# Go +vendor/ +*.prof +coverage.txt +coverage.html + +# Logs +*.log +logs/ + +# Local config & secrets +config.yaml +config.local.yaml +.env +.env.local +*.pem +*.key + +# IDE / editor +.idea/ +.vscode/ +*.swp +*.swo +.DS_Store + +# Runtime / state +state/ +checkpoints/ +tmp/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f644ffb --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 SecuAAS / 9463-7220 Québec Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..4720a5c --- /dev/null +++ b/README.md @@ -0,0 +1,68 @@ +# claude-failover + +Go daemon for Claude Code multi-account session orchestration with automatic +quota-based failover. + +## Overview + +`claude-failover` orchestrates a pool of Claude Code sessions running under +multiple Anthropic accounts. When the active account reaches its quota +threshold (5-hour usage window or weekly cap), the daemon transparently fails +over the workload to a backup account without losing in-flight session state. + +It is the runtime glue behind the SecuAAS agent pool (`ccl-0`..`ccl-9`, +`ccl-auto-11`..`ccl-auto-20`) and is engineered to hold sessions warm across +account swaps by sharing the `~/.claude/projects/` transcript tree via +symlinks. + +## Architecture (goroutines) + +The daemon is a single Go binary composed of cooperating goroutines: + +- **dispatcher** — reads `.agent-queue/inbox/*.md` across registered projects + and assigns tasks to idle sessions. +- **quota-monitor** — polls each configured Anthropic account's usage window + and triggers a failover when the active account crosses its threshold. +- **session-watcher** — tracks tmux session liveness (`ccl-*`), heartbeats, + and `.agent-queue/status.json` transitions (idle / working). +- **checkpoint** — periodically snapshots session context (current task, + last tool call, working dir) so an interrupted session can resume on a + different account. +- **janitor** — cleans stale `.dispatched` markers, archives old + `done/` tasks, prunes expired checkpoints. +- **notifier** — pushes state changes (failover fired, session degraded, + task failed) to Telegram / MCP dashboard / log aggregator. +- **account-switcher** — performs the actual swap: stop sessions on account + A, rehome symlinks, relaunch sessions on account B, replay last + checkpoint. Serialized via a single mutex so only one swap can happen at + a time. 
+ +All goroutines communicate through typed channels plus a shared state struct +behind a `sync.RWMutex`. The daemon exposes an HTTP control plane for the +MCP server to query status and force-trigger operations. + +## Relationship to SecuAAS agent-orchestrator + +This project extracts the session-management and failover logic that +currently lives in `dev-management/agent-orchestrator/` (shell scripts: +`launch-agent.sh`, `graceful-switch.sh`, `watchdog.sh`, +`checkpoint-daemon.sh`, `start-dedicated-agents.sh`) and reimplements it +as a single Go service. See the orchestrator docs for the operational +context this daemon is designed to replace. + +## Repository layout + +``` +cmd/claude-failover/ Main entrypoint +docs/ Architecture, configuration, analysis notes +scripts/ Setup helpers (shared-projects symlink, etc.) +config.example.yaml Annotated example config +``` + +## Status + +Pre-alpha. Design and scaffolding only — no working binary yet. + +## License + +MIT — see `LICENSE`. diff --git a/cmd/claude-failover/main.go b/cmd/claude-failover/main.go new file mode 100644 index 0000000..31c5f3d --- /dev/null +++ b/cmd/claude-failover/main.go @@ -0,0 +1,58 @@ +// Package main is the entrypoint for the claude-failover daemon. +// +// Scope of this stub: load the YAML config from disk, log startup +// information, and block until a termination signal. The real runtime +// (dispatcher, quota-monitor, session-watcher, checkpoint, janitor, +// notifier, account-switcher goroutines) is not implemented yet — see +// docs/architecture.md. +package main + +import ( + "context" + "flag" + "log" + "os" + "os/signal" + "syscall" +) + +// Config mirrors config.example.yaml at a high level. We keep it loose +// here because this stub does not wire real YAML parsing yet; the full +// schema will live in internal/config once implementation starts. 
+type Config struct { + Path string +} + +func loadConfig(path string) (*Config, error) { + // TODO(claude-failover): parse YAML via gopkg.in/yaml.v3 and validate. + if _, err := os.Stat(path); err != nil { + return nil, err + } + return &Config{Path: path}, nil +} + +func main() { + var cfgPath string + flag.StringVar(&cfgPath, "config", "config.yaml", "path to YAML config") + flag.Parse() + + log.SetFlags(log.LstdFlags | log.Lmicroseconds | log.LUTC) + log.Printf("claude-failover starting (config=%s)", cfgPath) + + cfg, err := loadConfig(cfgPath) + if err != nil { + log.Fatalf("config load failed: %v", err) + } + log.Printf("config loaded: %s", cfg.Path) + + // TODO: spawn goroutines — dispatcher, quota-monitor, session-watcher, + // checkpoint, janitor, notifier, account-switcher. + + ctx, cancel := signal.NotifyContext(context.Background(), + syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + log.Printf("claude-failover ready (stub — no workers running)") + <-ctx.Done() + log.Printf("shutdown signal received, exiting") +} diff --git a/config.example.yaml b/config.example.yaml new file mode 100644 index 0000000..c33b31e --- /dev/null +++ b/config.example.yaml @@ -0,0 +1,84 @@ +# claude-failover — example configuration +# +# Copy to config.yaml and adjust. Do NOT commit the real config: it is +# gitignored because it contains account identifiers and may reference +# local paths holding Claude Code session tokens. + +# --------------------------------------------------------------------------- +# accounts +# --------------------------------------------------------------------------- +# Declare every Anthropic account the daemon is allowed to use. Ordering +# matters: the first active account is the default primary, subsequent ones +# are tried in order during failover. +accounts: + - name: compte1 + # Directory holding this account's ~/.claude profile. The daemon + # swaps HOME-like state by rotating symlinks pointing at these dirs. 
+ home: /home/ubuntu/.claude-compte1 + # Soft limits at which failover is preferred (not a hard cap — + # Anthropic enforces the real ceiling). + limits: + hourly_msgs: 0 # 0 disables local limit + weekly_msgs: 0 + priority: 1 # lower = preferred + + - name: compte2 + home: /home/ubuntu/.claude-compte2 + limits: + hourly_msgs: 0 + weekly_msgs: 0 + priority: 2 + +# --------------------------------------------------------------------------- +# pool +# --------------------------------------------------------------------------- +# Session pool configuration. Sessions are named ccl-<name> and live in tmux. +pool: + # Persistent sessions dedicated to named projects. + dedicated: + - name: ccl-0 + project: /home/ubuntu/projects/dev-management + - name: ccl-1-conformvault + project: /home/ubuntu/projects/filesecure + # Autoscaling sessions for the inbox dispatcher. + autonomous: + prefix: ccl-auto- + min: 2 + max: 10 + # Shared Claude Code project tree (symlinked from every account home). + shared_projects_dir: /home/ubuntu/.claude-projects-shared + +# --------------------------------------------------------------------------- +# quota +# --------------------------------------------------------------------------- +# Thresholds at which the quota-monitor triggers a graceful swap. +quota: + # Poll interval for usage scraping. + poll_interval: 30s + # Trigger failover when 5h window consumption exceeds this ratio. + window_5h_threshold: 0.85 + # Trigger failover when weekly window exceeds this ratio. + window_week_threshold: 0.90 + # Cooldown before the same account can be re-activated. + reactivate_cooldown: 1h + +# --------------------------------------------------------------------------- +# checkpoint +# --------------------------------------------------------------------------- +# The checkpoint goroutine snapshots per-session context so a failover can +# resume on a different account. 
+checkpoint: + dir: /var/lib/claude-failover/checkpoints + interval: 60s + keep: 20 # per-session retention + +# --------------------------------------------------------------------------- +# mcp_http +# --------------------------------------------------------------------------- +# HTTP control plane consumed by the SecuAAS MCP gateway. +mcp_http: + listen: 127.0.0.1:7777 + # Bearer required on every request. Rotate via systemd drop-in. + bearer_token_env: CLAUDE_FAILOVER_BEARER + # Paths exposed (all read-only except explicitly listed mutating routes). + enable_trigger: true # allow /trigger/dispatch, /trigger/swap diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..4782ddf --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,125 @@ +# Architecture + +`claude-failover` is a single Go binary structured as a set of cooperating +goroutines. Each goroutine owns a narrow responsibility and communicates +through typed channels and a shared `State` struct guarded by a +`sync.RWMutex`. A single-writer discipline is enforced: only the +**account-switcher** may mutate the active-account field. + +## Goroutines + +### dispatcher + +Watches `.agent-queue/inbox/` for every registered project (inotify on +Linux) and pairs each incoming task with an idle session from the pool. +It respects: + +- per-project priority +- agent capability tags declared in the task frontmatter +- the `needs_claude_code: true` bypass flag +- dispatcher-level cooldowns to avoid flooding a freshly-launched session + +On successful assignment it renames `<task>.md` to `<task>.md.dispatched` +and writes a pointer into the target session's tmux prompt. + +### quota-monitor + +Polls Anthropic usage counters for every configured account. Sources: + +1. Claude Code's local telemetry files under `~/.claude/statsig/` and + `~/.claude/projects/*.jsonl` (message timestamps). +2. Optional: a reverse-engineered `/api/quota` endpoint if available. 
+ +It computes two sliding windows (5h, 1 week) and emits a `swap-requested` +event once thresholds in the config are crossed. + +### session-watcher + +Keeps a table of tmux sessions (`ccl-*`). For each one it tracks: + +- process liveness (via `tmux has-session`) +- heartbeat timestamp from `.agent-queue/status.json` +- current `state` field (idle / working / stalled) + +Stalled sessions (heartbeat older than N minutes while `state=working`) +raise an alert on the notifier channel and become candidates for a +forced restart. + +### checkpoint + +Every `checkpoint.interval`, serializes per-session context: + +- current task id +- last recorded tool call (name + truncated args) +- cwd as reported by the session +- the last N lines of the session's scrollback + +Files are written atomically (`*.tmp` + rename) to +`checkpoint.dir/<session>/<timestamp>.json` and pruned to +`checkpoint.keep` entries. + +### janitor + +Periodic housekeeping: + +- removes stale `.md.dispatched` markers whose source task is gone +- archives `done/` older than a configurable horizon +- prunes expired checkpoints +- rotates the daemon's own log file when it exceeds a size threshold + +### notifier + +Fan-out of typed events (`SwapFired`, `SessionStalled`, `TaskFailed`, +`QuotaWarning`) to configured sinks: + +- Telegram bot (alerts channel) +- MCP control-plane push +- stdout / structured log aggregator + +### account-switcher + +Serializes all account swaps behind a single mutex. Swap protocol: + +1. mark active account as `draining` +2. tell each session to flush its current tool call and checkpoint +3. stop tmux sessions in reverse launch order +4. repoint the `~/.claude` symlink (or equivalent per-session HOME) to + the target account's home directory +5. relaunch sessions; replay the latest checkpoint so each session + reopens the same project and task pointer +6. 
mark the new account `active`, start the cooldown timer on the old one + +See [`session-switch-analysis.md`](./session-switch-analysis.md) for why +the shared-symlink approach is required (Claude Code bug #16103). + +## Shared state + +```go +type State struct { + mu sync.RWMutex + ActiveAccount string + Accounts map[string]*AccountState + Sessions map[string]*SessionState + LastSwap time.Time + PendingSwap bool +} +``` + +Readers take `RLock`; the account-switcher takes `Lock` for the duration +of a swap. All other writers go through a single-writer channel owned by +the switcher, which guarantees swap atomicity. + +## HTTP control plane + +The daemon exposes a small HTTP server (`mcp_http.listen`) consumed by +the SecuAAS MCP gateway. Routes: + +| Method | Path | Purpose | +|--------|-----------------------|--------------------------------| +| GET | `/status` | Full state snapshot | +| GET | `/accounts` | Account usage + limits | +| GET | `/sessions` | Session table | +| POST | `/trigger/swap` | Force failover (requires bearer) | +| POST | `/trigger/dispatch` | Force inbox scan | + +All routes require the bearer token from `mcp_http.bearer_token_env`. diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 0000000..da8b140 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,71 @@ +# Configuration + +Configuration is a single YAML file (`config.yaml`) read at startup. +`config.example.yaml` at the repository root is the annotated template — +copy and edit. + +## Sections + +### `accounts` + +List of Anthropic accounts the daemon may rotate through. + +| Field | Type | Description | +|------------------|----------|-------------| +| `name` | string | Short identifier used in logs and metrics. | +| `home` | path | Per-account home directory (houses the private `~/.claude` credentials). | +| `limits.hourly_msgs` | int | Local soft cap on messages per hour. `0` disables. | +| `limits.weekly_msgs` | int | Local soft cap per 7-day window. 
`0` disables. | +| `priority` | int | Lower numbers preferred when selecting a primary. | + +### `pool` + +Session pool layout. + +- `dedicated` — long-lived sessions bound to a specific project path. + Each entry has `name` (tmux session) and `project` (absolute path). +- `autonomous` — autoscaling pool for the inbox dispatcher. + `prefix` + serial → session names; `min`/`max` bound the pool size. +- `shared_projects_dir` — canonical Claude Code `projects/` tree, + symlinked from every account home (see + `docs/session-switch-analysis.md`). + +### `quota` + +Failover thresholds. + +- `poll_interval` — how often the quota-monitor samples usage. +- `window_5h_threshold` — fraction (0..1) of the 5h cap above which a + swap is requested. +- `window_week_threshold` — same idea for the weekly cap. +- `reactivate_cooldown` — minimum time before a drained account is + eligible again. + +### `checkpoint` + +- `dir` — where per-session snapshots are written. +- `interval` — time between snapshots. +- `keep` — retention per session (older files are pruned). + +### `mcp_http` + +HTTP control plane for the MCP gateway. + +- `listen` — bind address (keep it loopback unless fronted by a reverse + proxy). +- `bearer_token_env` — name of the env var from which to read the + bearer token required on every request. +- `enable_trigger` — gate for mutating routes (`/trigger/*`). + +## Secrets + +The config file itself holds no secrets. Bearer tokens and account +credentials are read from the environment. In production deploy via a +systemd unit with a drop-in that sources `/run/secrets/claude-failover.env` +(populated by `secuops` / OVH KMS at boot). + +## Reload + +Config reload is not supported in the initial version. A SIGHUP handler +is planned — the account-switcher makes a clean live reload feasible +(drain, reconfigure, resume). 
diff --git a/docs/session-switch-analysis.md b/docs/session-switch-analysis.md new file mode 100644 index 0000000..fd743d6 --- /dev/null +++ b/docs/session-switch-analysis.md @@ -0,0 +1,61 @@ +# Cross-account session resume — bug analysis + +## Symptom + +When Claude Code is relaunched under a different Anthropic account +(different `~/.claude/` directory), invoking `claude --resume <session-id>` +fails with a "session not found" error even though the session +transcript JSONL still exists on disk. The new account has no record of +the session id because Claude Code tracks resumable sessions per-account +in its local state store. + +Upstream report: Claude Code issue **#16103** (cross-account resume). + +## Consequence for claude-failover + +Naïve account swap (stop sessions on account A, start on account B +pointing at a different `~/.claude/projects/` tree) loses every running +session. That would defeat the purpose of failover — we would be forced +to kill in-flight tasks on every quota boundary. + +## Solution — shared projects tree via symlink + +The `projects/` subtree under `~/.claude/` is where the session +transcripts live. We keep a single canonical copy at +`/home/ubuntu/.claude-projects-shared/` and symlink each account's +`~/.claude/projects` to it: + +``` +/home/ubuntu/.claude-compte1/projects -> /home/ubuntu/.claude-projects-shared/ +/home/ubuntu/.claude-compte2/projects -> /home/ubuntu/.claude-projects-shared/ +``` + +With this layout: + +- Account A records session `S1` while running. Its transcript lands in + the shared directory. +- On swap, account B's Claude Code process starts with its own + credentials but sees the same `projects/` tree. +- `claude --resume S1` finds the transcript and replays it. + +The per-account state that is **not** shared — credentials, telemetry +cache, statsig flags — stays isolated because only `projects/` is +symlinked, not the whole `~/.claude`. 
+ +## Validation checklist (for implementers) + +- [ ] `claude --resume` succeeds across accounts when `projects/` is + symlinked +- [ ] No transcript corruption when both accounts write concurrently + (they cannot — only one account is active at a time) +- [ ] Permissions on the shared dir allow the daemon user to read/write +- [ ] Backup strategy in place before first production failover + +## Open questions + +- Does Claude Code cache the session id in a per-account index that + needs to be pre-populated? If yes, the account-switcher must write a + small stub entry there on swap. +- Does a running session survive the HOME symlink flip, or must it be + restarted? Current assumption: restart is required, hence the + checkpoint goroutine. diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..aef4b20 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module forge.secuaas.ovh/secuaas/claude-failover + +go 1.24 diff --git a/scripts/setup-shared-projects.sh b/scripts/setup-shared-projects.sh new file mode 100755 index 0000000..fb81aa4 --- /dev/null +++ b/scripts/setup-shared-projects.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +# +# setup-shared-projects.sh +# +# Prepare the shared Claude Code projects/ tree used by claude-failover +# to preserve session resume across account swaps. +# +# Creates /home/ubuntu/.claude-projects-shared (if missing), then rewires +# each account's ~/.claude/projects to point at it. Existing per-account +# transcripts are merged into the shared tree (rsync -a, no overwrite of +# newer files). +# +# Usage: +# scripts/setup-shared-projects.sh [--shared DIR] [--dry-run] \ +# account_home_1 account_home_2 [account_home_N...] +# +# Example: +# scripts/setup-shared-projects.sh \ +# /home/ubuntu/.claude-compte1 \ +# /home/ubuntu/.claude-compte2 +# +# Safe to rerun: it never deletes account-private files (credentials, +# statsig cache, settings). It only touches the `projects/` subdir. 

set -euo pipefail

# Defaults. SHARED is the canonical projects/ tree; --shared overrides it.
SHARED="/home/ubuntu/.claude-projects-shared"
DRY_RUN=0    # 1 = print the commands instead of executing them
ACCOUNTS=()  # account home directories, in command-line order

# Argument parsing: options may appear anywhere on the command line; every
# other argument is collected as an account home.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --shared)
      # Report a usable error instead of tripping `set -u` on a missing "$2".
      if [[ -z "${2:-}" ]]; then
        echo "error: --shared requires a directory argument" >&2
        exit 2
      fi
      SHARED="$2"
      shift 2
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      # Print the header comment block: from line 2 up to the first blank
      # line, comment lines only. The previous fixed range (2,22) dropped
      # the last header line and would drift again on any header edit.
      sed -n '2,/^$/{/^#/p;}' "$0"
      exit 0
      ;;
    --)
      # Explicit end of options: everything that follows is an account home.
      shift
      ACCOUNTS+=("$@")
      break
      ;;
    -*)
      # Reject typos such as --dryrun instead of treating them as a path.
      echo "error: unknown option: $1" >&2
      exit 2
      ;;
    *)
      ACCOUNTS+=("$1")
      shift
      ;;
  esac
done

if [[ ${#ACCOUNTS[@]} -lt 1 ]]; then
  echo "error: at least one account home must be provided" >&2
  exit 2
fi

# run CMD [ARGS...]
# Echo the command, then execute it unless --dry-run was given. Under
# `set -e` a failing command aborts the script, so destructive follow-up
# steps (rm) never run after a failed merge.
run() {
  if [[ $DRY_RUN -eq 1 ]]; then
    echo "DRY: $*"
  else
    echo "+ $*"
    "$@"
  fi
}

run mkdir -p "$SHARED"

for home in "${ACCOUNTS[@]}"; do
  if [[ ! -d "$home" ]]; then
    echo "warn: account home not found: $home (skipping)" >&2
    continue
  fi
  # NOTE(review): docs/session-switch-analysis.md shows the link at
  # <account_home>/projects, while this script uses
  # <account_home>/.claude/projects — confirm which layout the daemon expects.
  proj="$home/.claude/projects"
  run mkdir -p "$home/.claude"

  if [[ -L "$proj" ]]; then
    current="$(readlink "$proj")"
    if [[ "$current" == "$SHARED" ]]; then
      echo "ok: $proj already -> $SHARED"
      continue
    fi
    echo "replacing symlink $proj ($current -> $SHARED)"
    run rm "$proj"
  elif [[ -d "$proj" ]]; then
    echo "merging $proj -> $SHARED"
    # --update skips only files that are newer in the shared tree, which is
    # the documented contract ("no overwrite of newer files"). The previous
    # --ignore-existing kept a stale shared copy and the newer per-account
    # transcript was then lost to the rm -rf below.
    run rsync -a --update "$proj/" "$SHARED/"
    run rm -rf "$proj"
  elif [[ -e "$proj" ]]; then
    # A regular file (or other node) here would make ln -s fail obscurely.
    echo "error: $proj exists but is neither a symlink nor a directory" >&2
    exit 1
  fi

  run ln -s "$SHARED" "$proj"
done

echo "done. shared projects tree: $SHARED"