feat: initial project structure

This commit is contained in:
Olivier 2026-04-14 13:29:24 +00:00
commit cf4957010f
10 changed files with 621 additions and 0 deletions

37
.gitignore vendored Normal file
View file

@ -0,0 +1,37 @@
# Binaries
bin/
build/
dist/
*.exe
*.test
*.out
# Go
vendor/
*.prof
coverage.txt
coverage.html
# Logs
*.log
logs/
# Local config & secrets
config.yaml
config.local.yaml
.env
.env.local
*.pem
*.key
# IDE / editor
.idea/
.vscode/
*.swp
*.swo
.DS_Store
# Runtime / state
state/
checkpoints/
tmp/

21
LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 SecuAAS / 9463-7220 Québec Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

68
README.md Normal file
View file

@ -0,0 +1,68 @@
# claude-failover
Go daemon for Claude Code multi-account session orchestration with automatic
quota-based failover.
## Overview
`claude-failover` orchestrates a pool of Claude Code sessions running under
multiple Anthropic accounts. When the active account reaches its quota
threshold (5-hour usage window or weekly cap), the daemon transparently fails
over the workload to a backup account without losing in-flight session state.
It is the runtime glue behind the SecuAAS agent pool (`ccl-0`..`ccl-9`,
`ccl-auto-11`..`ccl-auto-20`) and is engineered to hold sessions warm across
account swaps by sharing the `~/.claude/projects/` transcript tree via
symlinks.
## Architecture (goroutines)
The daemon is a single Go binary composed of cooperating goroutines:
- **dispatcher** — reads `.agent-queue/inbox/*.md` across registered projects
and assigns tasks to idle sessions.
- **quota-monitor** — polls each configured Anthropic account's usage window
and triggers a failover when the active account crosses its threshold.
- **session-watcher** — tracks tmux session liveness (`ccl-*`), heartbeats,
and `.agent-queue/status.json` transitions (idle / working).
- **checkpoint** — periodically snapshots session context (current task,
last tool call, working dir) so an interrupted session can resume on a
different account.
- **janitor** — cleans stale `.dispatched` markers, archives old
`done/` tasks, prunes expired checkpoints.
- **notifier** — pushes state changes (failover fired, session degraded,
task failed) to Telegram / MCP dashboard / log aggregator.
- **account-switcher** — performs the actual swap: stop sessions on account
A, rehome symlinks, relaunch sessions on account B, replay last
checkpoint. Serialized via a single mutex so only one swap can happen at
a time.
All goroutines communicate through typed channels plus a shared state struct
behind a `sync.RWMutex`. The daemon exposes an HTTP control plane for the
MCP server to query status and force-trigger operations.
## Relationship to SecuAAS agent-orchestrator
This project extracts the session-management and failover logic that
currently lives in `dev-management/agent-orchestrator/` (shell scripts:
`launch-agent.sh`, `graceful-switch.sh`, `watchdog.sh`,
`checkpoint-daemon.sh`, `start-dedicated-agents.sh`) and reimplements it
as a single Go service. See the orchestrator docs for the operational
context this daemon is designed to replace.
## Repository layout
```
cmd/claude-failover/ Main entrypoint
docs/ Architecture, configuration, analysis notes
scripts/ Setup helpers (shared-projects symlink, etc.)
config.example.yaml Annotated example config
```
## Status
Pre-alpha. Design and scaffolding only — no working binary yet.
## License
MIT — see `LICENSE`.

View file

@ -0,0 +1,58 @@
// Package main is the entrypoint for the claude-failover daemon.
//
// Scope of this stub: check that the config file exists on disk (YAML
// parsing is not wired yet), log startup information, and block until a
// termination signal. The real runtime
// (dispatcher, quota-monitor, session-watcher, checkpoint, janitor,
// notifier, account-switcher goroutines) is not implemented yet — see
// docs/architecture.md.
package main
import (
"context"
"flag"
"log"
"os"
"os/signal"
"syscall"
)
// Config mirrors config.example.yaml at a high level. It is deliberately
// loose: this stub does not wire real YAML parsing yet, and the full schema
// will live in internal/config once implementation starts.
type Config struct {
	// Path is the config file location exactly as it was passed to loadConfig.
	Path string
}

// loadConfig checks that path refers to an existing, non-directory file and
// returns a Config recording it. The file contents are not read yet.
//
// When path cannot be inspected the os.Stat error is returned unchanged, so
// errors.Is(err, fs.ErrNotExist) works for a missing file. When path is a
// directory the error is an *os.PathError wrapping syscall.EISDIR.
func loadConfig(path string) (*Config, error) {
	// TODO(claude-failover): parse YAML via gopkg.in/yaml.v3 and validate.
	info, err := os.Stat(path)
	if err != nil {
		return nil, err
	}
	// os.Stat succeeds on directories; reject them here so a mistyped
	// -config value fails at startup instead of being reported as loaded.
	if info.IsDir() {
		return nil, &os.PathError{Op: "load config", Path: path, Err: syscall.EISDIR}
	}
	return &Config{Path: path}, nil
}
// main parses the -config flag, hands the path to loadConfig, and then idles
// until SIGINT or SIGTERM arrives. No worker goroutines are started yet.
func main() {
	configFlag := flag.String("config", "config.yaml", "path to YAML config")
	flag.Parse()

	// Log timestamps in UTC with microsecond precision.
	log.SetFlags(log.LstdFlags | log.Lmicroseconds | log.LUTC)
	log.Printf("claude-failover starting (config=%s)", *configFlag)

	loaded, loadErr := loadConfig(*configFlag)
	if loadErr != nil {
		log.Fatalf("config load failed: %v", loadErr)
	}
	log.Printf("config loaded: %s", loaded.Path)

	// TODO: spawn goroutines — dispatcher, quota-monitor, session-watcher,
	// checkpoint, janitor, notifier, account-switcher.

	// sigCtx is cancelled by the first SIGINT or SIGTERM; stop releases the
	// signal registration when main returns.
	sigCtx, stop := signal.NotifyContext(
		context.Background(),
		syscall.SIGINT,
		syscall.SIGTERM,
	)
	defer stop()

	log.Printf("claude-failover ready (stub — no workers running)")
	<-sigCtx.Done()
	log.Printf("shutdown signal received, exiting")
}

84
config.example.yaml Normal file
View file

@ -0,0 +1,84 @@
# claude-failover — example configuration
#
# Copy to config.yaml and adjust. Do NOT commit the real config: it is
# gitignored because it contains account identifiers and may reference
# local paths holding Claude Code session tokens.
# ---------------------------------------------------------------------------
# accounts
# ---------------------------------------------------------------------------
# Declare every Anthropic account the daemon is allowed to use. Ordering
# matters: the first active account is the default primary, subsequent ones
# are tried in order during failover.
accounts:
- name: compte1
# Directory holding this account's ~/.claude profile. The daemon
# swaps HOME-like state by rotating symlinks pointing at these dirs.
home: /home/ubuntu/.claude-compte1
# Soft limits at which failover is preferred (not a hard cap —
# Anthropic enforces the real ceiling).
limits:
hourly_msgs: 0 # 0 disables local limit
weekly_msgs: 0
priority: 1 # lower = preferred
- name: compte2
home: /home/ubuntu/.claude-compte2
limits:
hourly_msgs: 0
weekly_msgs: 0
priority: 2
# ---------------------------------------------------------------------------
# pool
# ---------------------------------------------------------------------------
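# Session pool configuration. Sessions live in tmux under `ccl-`-prefixed
# names: dedicated sessions use the names listed below, autonomous ones are
# named <prefix><serial>.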
pool:
# Persistent sessions dedicated to named projects.
dedicated:
- name: ccl-0
project: /home/ubuntu/projects/dev-management
- name: ccl-1-conformvault
project: /home/ubuntu/projects/filesecure
# Autoscaling sessions for the inbox dispatcher.
autonomous:
prefix: ccl-auto-
min: 2
max: 10
# Shared Claude Code project tree (symlinked from every account home).
shared_projects_dir: /home/ubuntu/.claude-projects-shared
# ---------------------------------------------------------------------------
# quota
# ---------------------------------------------------------------------------
# Thresholds at which the quota-monitor triggers a graceful swap.
quota:
# Poll interval for usage scraping.
poll_interval: 30s
# Trigger failover when 5h window consumption exceeds this ratio.
window_5h_threshold: 0.85
# Trigger failover when weekly window exceeds this ratio.
window_week_threshold: 0.90
# Cooldown before the same account can be re-activated.
reactivate_cooldown: 1h
# ---------------------------------------------------------------------------
# checkpoint
# ---------------------------------------------------------------------------
# The checkpoint goroutine snapshots per-session context so a failover can
# resume on a different account.
checkpoint:
dir: /var/lib/claude-failover/checkpoints
interval: 60s
keep: 20 # per-session retention
# ---------------------------------------------------------------------------
# mcp_http
# ---------------------------------------------------------------------------
# HTTP control plane consumed by the SecuAAS MCP gateway.
mcp_http:
listen: 127.0.0.1:7777
# Bearer required on every request. Rotate via systemd drop-in.
bearer_token_env: CLAUDE_FAILOVER_BEARER
# Paths exposed (all read-only except explicitly listed mutating routes).
enable_trigger: true # allow /trigger/dispatch, /trigger/swap

125
docs/architecture.md Normal file
View file

@ -0,0 +1,125 @@
# Architecture
`claude-failover` is a single Go binary structured as a set of cooperating
goroutines. Each goroutine owns a narrow responsibility and communicates
through typed channels and a shared `State` struct guarded by a
`sync.RWMutex`. A single-writer discipline is enforced: only the
**account-switcher** may mutate the active-account field.
## Goroutines
### dispatcher
Watches `.agent-queue/inbox/` for every registered project (inotify on
Linux) and pairs each incoming task with an idle session from the pool.
It respects:
- per-project priority
- agent capability tags declared in the task frontmatter
- the `needs_claude_code: true` bypass flag
- dispatcher-level cooldowns to avoid flooding a freshly-launched session
On successful assignment it renames `<task>.md` to `<task>.md.dispatched`
and writes a pointer into the target session's tmux prompt.
### quota-monitor
Polls Anthropic usage counters for every configured account. Sources:
1. Claude Code's local telemetry files under `~/.claude/statsig/` and
`~/.claude/projects/*.jsonl` (message timestamps).
2. Optional: a reverse-engineered `/api/quota` endpoint if available.
It computes two sliding windows (5h, 1 week) and emits a `swap-requested`
event once thresholds in the config are crossed.
### session-watcher
Keeps a table of tmux sessions (`ccl-*`). For each one it tracks:
- process liveness (via `tmux has-session`)
- heartbeat timestamp from `.agent-queue/status.json`
- current `state` field (idle / working / stalled)
Stalled sessions (heartbeat older than N minutes while `state=working`)
raise an alert on the notifier channel and become candidates for a
forced restart.
### checkpoint
Every `checkpoint.interval`, serializes per-session context:
- current task id
- last recorded tool call (name + truncated args)
- cwd as reported by the session
- the last N lines of the session's scrollback
Files are written atomically (`*.tmp` + rename) to
`checkpoint.dir/<session>/<timestamp>.json` and pruned to
`checkpoint.keep` entries.
### janitor
Periodic housekeeping:
- removes stale `.md.dispatched` markers whose source task is gone
- archives `done/` older than a configurable horizon
- prunes expired checkpoints
- rotates the daemon's own log file when it exceeds a size threshold
### notifier
Fan-out of typed events (`SwapFired`, `SessionStalled`, `TaskFailed`,
`QuotaWarning`) to configured sinks:
- Telegram bot (alerts channel)
- MCP control-plane push
- stdout / structured log aggregator
### account-switcher
Serializes all account swaps behind a single mutex. Swap protocol:
1. mark active account as `draining`
2. tell each session to flush its current tool call and checkpoint
3. stop tmux sessions in reverse launch order
4. repoint the `~/.claude` symlink (or equivalent per-session HOME) to
the target account's home directory
5. relaunch sessions; replay the latest checkpoint so each session
reopens the same project and task pointer
6. mark the new account `active`, start the cooldown timer on the old one
See [`session-switch-analysis.md`](./session-switch-analysis.md) for why
the shared-symlink approach is required (Claude Code bug #16103).
## Shared state
```go
// State is the struct shared by the daemon's goroutines.
type State struct {
// mu guards the fields below: readers take RLock, the account-switcher
// takes Lock for the duration of a swap.
mu sync.RWMutex
// ActiveAccount may only be mutated by the account-switcher.
ActiveAccount string
// Accounts holds per-account state (presumably keyed by account name — TODO confirm).
Accounts map[string]*AccountState
// Sessions holds per-session state (presumably keyed by tmux session name — TODO confirm).
Sessions map[string]*SessionState
// LastSwap is presumably the time of the most recent account swap — TODO confirm.
LastSwap time.Time
// PendingSwap presumably marks a swap that was requested but has not completed — TODO confirm.
PendingSwap bool
}
```
Readers take `RLock`; the account-switcher takes `Lock` for the duration
of a swap. All other writers go through a single-writer channel owned by
the switcher, which guarantees swap atomicity.
## HTTP control plane
The daemon exposes a small HTTP server (`mcp_http.listen`) consumed by
the SecuAAS MCP gateway. Routes:
| Method | Path | Purpose |
|--------|-----------------------|--------------------------------|
| GET | `/status` | Full state snapshot |
| GET | `/accounts` | Account usage + limits |
| GET | `/sessions` | Session table |
| POST | `/trigger/swap` | Force failover |
| POST | `/trigger/dispatch` | Force inbox scan |
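All routes require the bearer token from `mcp_http.bearer_token_env`. The
mutating `POST /trigger/*` routes are additionally served only when
`mcp_http.enable_trigger` is true.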

71
docs/configuration.md Normal file
View file

@ -0,0 +1,71 @@
# Configuration
Configuration is a single YAML file (`config.yaml`) read at startup.
`config.example.yaml` at the repository root is the annotated template —
copy and edit.
## Sections
### `accounts`
List of Anthropic accounts the daemon may rotate through.
| Field | Type | Description |
|------------------|----------|-------------|
| `name` | string | Short identifier used in logs and metrics. |
| `home` | path | Per-account home directory (houses the private `~/.claude` credentials). |
| `limits.hourly_msgs` | int | Local soft cap on messages per hour. `0` disables. |
| `limits.weekly_msgs` | int | Local soft cap per 7-day window. `0` disables. |
| `priority` | int | Lower numbers preferred when selecting a primary. |
### `pool`
Session pool layout.
- `dedicated` — long-lived sessions bound to a specific project path.
Each entry has `name` (tmux session) and `project` (absolute path).
- `autonomous` — autoscaling pool for the inbox dispatcher.
`prefix` + serial → session names; `min`/`max` bound the pool size.
- `shared_projects_dir` — canonical Claude Code `projects/` tree,
symlinked from every account home (see
`docs/session-switch-analysis.md`).
### `quota`
Failover thresholds.
- `poll_interval` — how often the quota-monitor samples usage.
- `window_5h_threshold` — fraction (0..1) of the 5h cap above which a
swap is requested.
- `window_week_threshold` — same idea for the weekly cap.
- `reactivate_cooldown` — minimum time before a drained account is
eligible again.
### `checkpoint`
- `dir` — where per-session snapshots are written.
- `interval` — time between snapshots.
- `keep` — retention per session (older files are pruned).
### `mcp_http`
HTTP control plane for the MCP gateway.
- `listen` — bind address (keep it loopback unless fronted by a reverse
proxy).
- `bearer_token_env` — name of the env var from which to read the
bearer token required on every request.
- `enable_trigger` — gate for mutating routes (`/trigger/*`).
## Secrets
The config file itself holds no secrets. The control-plane bearer token is
read from the environment variable named by `mcp_http.bearer_token_env`;
account credentials stay on disk inside each account's `home` directory.
In production, deploy via a systemd unit with a drop-in that sources
`/run/secrets/claude-failover.env` (populated by `secuops` / OVH KMS at boot).
## Reload
Config reload is not supported in the initial version. A SIGHUP handler
is planned — the account-switcher makes a clean live reload feasible
(drain, reconfigure, resume).

View file

@ -0,0 +1,61 @@
# Cross-account session resume — bug analysis
## Symptom
When Claude Code is relaunched under a different Anthropic account
(different `~/.claude/` directory), invoking `claude --resume <id>`
fails with a "session not found" error even though the session
transcript JSONL still exists on disk. The new account has no record of
the session id because Claude Code tracks resumable sessions per-account
in its local state store.
Upstream report: Claude Code issue **#16103** (cross-account resume).
## Consequence for claude-failover
A naïve account swap (stop sessions on account A, start on account B
pointing at a different `~/.claude/projects/` tree) loses every running
session. That would defeat the purpose of failover — we would be forced
to kill in-flight tasks on every quota boundary.
## Solution — shared projects tree via symlink
The `projects/` subtree under `~/.claude/` is where the session
transcripts live. We keep a single canonical copy at
`/home/ubuntu/.claude-projects-shared/` and symlink each account's
`~/.claude/projects` to it:
```
/home/ubuntu/.claude-compte1/projects -> /home/ubuntu/.claude-projects-shared/
/home/ubuntu/.claude-compte2/projects -> /home/ubuntu/.claude-projects-shared/
```
With this layout:
- Account A records session `S1` while running. Its transcript lands in
the shared directory.
- On swap, account B's Claude Code process starts with its own
credentials but sees the same `projects/` tree.
- `claude --resume S1` finds the transcript and replays it.
The per-account state that is **not** shared — credentials, telemetry
cache, statsig flags — stays isolated because only `projects/` is
symlinked, not the whole `~/.claude`.
## Validation checklist (for implementers)
- [ ] `claude --resume` succeeds across accounts when `projects/` is
symlinked
- [ ] No transcript corruption when both accounts write concurrently
(they cannot — only one account is active at a time)
- [ ] Permissions on the shared dir allow the daemon user to read/write
- [ ] Backup strategy in place before first production failover
## Open questions
- Does Claude Code cache the session id in a per-account index that
needs to be pre-populated? If yes, the account-switcher must write a
small stub entry there on swap.
- Does a running session survive the HOME symlink flip, or must it be
restarted? Current assumption: restart is required, hence the
checkpoint goroutine.

3
go.mod Normal file
View file

@ -0,0 +1,3 @@
module forge.secuaas.ovh/secuaas/claude-failover
go 1.24

View file

@ -0,0 +1,93 @@
#!/usr/bin/env bash
#
# setup-shared-projects.sh
#
# Prepare the shared Claude Code projects/ tree used by claude-failover
# to preserve session resume across account swaps.
#
# Creates /home/ubuntu/.claude-projects-shared (if missing), then rewires
# each account's ~/.claude/projects to point at it. Existing per-account
# transcripts are merged into the shared tree (rsync -a, no overwrite of
# newer files).
#
# Usage:
# scripts/setup-shared-projects.sh [--shared DIR] [--dry-run] \
# account_home_1 account_home_2 [account_home_N...]
#
# Example:
# scripts/setup-shared-projects.sh \
# /home/ubuntu/.claude-compte1 \
# /home/ubuntu/.claude-compte2
#
# Safe to rerun: it never deletes account-private files (credentials,
# statsig cache, settings). It only touches the `projects/` subdir.
set -euo pipefail

# Defaults; each can be overridden from the command line.
SHARED="/home/ubuntu/.claude-projects-shared"  # canonical shared projects/ tree
DRY_RUN=0                                       # 1 = print commands instead of running them
ACCOUNTS=()                                     # account home directories, in argument order

# Parse options. Every non-option argument is collected as an account home.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --shared)
      # Under `set -u` a bare "$2" aborts with an opaque "unbound variable"
      # message, so validate the argument before reading it.
      if [[ $# -lt 2 || -z "$2" ]]; then
        echo "error: --shared requires a directory argument" >&2
        exit 2
      fi
      SHARED="$2"
      shift 2
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      # Print the leading comment header: skip the shebang, then emit lines
      # until the first one that is not a comment. A hard-coded line range
      # drifts out of sync with the header and truncates the help text.
      awk 'NR == 1 { next } /^#/ { print; started = 1; next } started || NF { exit }' "$0"
      exit 0
      ;;
    -*)
      # Reject unknown flags instead of treating them as account homes:
      # a mistyped --dry-run must not fall through to a real run.
      echo "error: unknown option: $1" >&2
      exit 2
      ;;
    *)
      ACCOUNTS+=("$1")
      shift
      ;;
  esac
done
# Refuse to continue unless at least one account home was given.
(( ${#ACCOUNTS[@]} >= 1 )) || {
  echo "error: at least one account home must be provided" >&2
  exit 2
}
# run CMD [ARG...]
# Echo the command line, then execute it unless DRY_RUN is 1.
# Dry-run lines are prefixed "DRY: ", executed ones "+ ". When the command is
# executed, its exit status becomes the function's status.
run() {
  if (( DRY_RUN == 1 )); then
    printf 'DRY: %s\n' "$*"
    return 0
  fi
  printf '+ %s\n' "$*"
  "$@"
}
# A relative shared path would be stored verbatim as the symlink target and
# then resolved relative to each account's .claude/ directory, leaving a
# dangling link. Anchor it to the current directory first.
if [[ "$SHARED" != /* ]]; then
  SHARED="$PWD/$SHARED"
fi

run mkdir -p "$SHARED"

# Rewire every account so that its projects/ entry is a symlink to $SHARED.
for home in "${ACCOUNTS[@]}"; do
  if [[ ! -d "$home" ]]; then
    echo "warn: account home not found: $home (skipping)" >&2
    continue
  fi

  # NOTE(review): the link lives at <account_home>/.claude/projects, whereas
  # docs/session-switch-analysis.md shows <account_home>/projects — confirm
  # which layout the daemon expects.
  proj="$home/.claude/projects"
  run mkdir -p "$home/.claude"

  if [[ -L "$proj" ]]; then
    # Already a symlink: keep it if it points at $SHARED, otherwise replace it.
    current="$(readlink "$proj")"
    if [[ "$current" == "$SHARED" ]]; then
      echo "ok: $proj already -> $SHARED"
      continue
    fi
    echo "replacing symlink $proj ($current -> $SHARED)"
    run rm "$proj"
  elif [[ -d "$proj" ]]; then
    echo "merging $proj -> $SHARED"
    # --update skips only files that are newer in $SHARED, so the newest copy
    # of each transcript survives. With --ignore-existing a stale shared copy
    # would be kept and the rm -rf below would delete the newer account copy.
    run rsync -a --update "$proj/" "$SHARED/"
    run rm -rf "$proj"
  fi

  run ln -s "$SHARED" "$proj"
done

echo "done. shared projects tree: $SHARED"