feat(phase2-E): multi-provider routing via secutools delegation

Adds optional delegation of agent-queue tasks to the SecuAAS secutools
AI platform (GPU / Gemini / Claude API) instead of dispatching to a
local Claude Code tmux session. Per-task opt-in via YAML frontmatter
fields preferred_ai, allow_delegation, complexity_hint — absence keeps
the Phase 1 behaviour exactly (zero breaking change).

Go side:
- internal/secutools: HTTP client with exponential-backoff retries
  (SubmitJob/GetJob/WaitForResult), DecideProvider map adapter for CLI
  use, table tests.
- internal/router: struct-typed Decide() with strict precedence
  (needs_claude_code > preferred_ai=claude-code > allow_delegation=false
  > preferred_ai > fail-safe local on unknown).
- internal/delegation: Manager submits jobs, writes .md.delegated
  markers for on-restart recovery, runs a periodic reaper that moves
  completed jobs into done/ with provider/cost footer and failed jobs
  into failed/.
- internal/dispatcher: WithDelegation() opt-in, routeTask hook before
  findFreeSession, skips .md.delegated in assignNextTask.
- internal/api: /api/delegated/status (active jobs + counters),
  /watchdog/status extended with delegation counters.
- cmd/ccl-delegate: small CLI exposing submit/get/result/decide so the
  bash dispatcher can call the same contract without duplicating logic.
- cmd/claude-failover: delegation wired opt-in via SECUTOOLS_API_KEY.

Tests:
- 29+ new unit tests across router, secutools, delegation, dispatcher,
  api packages. go test -race -count=1 clean.
- tests/phase2-E-integration.sh: bash end-to-end against a Python
  stdlib mock HTTP server, exercising the dev-management scripts.

Forward-compatible with the watchdog: Phase 1 B1 already ignores
state=delegated_to_secutools, so delegated tasks aren't flagged stale.
This commit is contained in:
Ubuntu 2026-04-17 02:17:19 +00:00
parent 47ab86eef9
commit 3e20085204
18 changed files with 2819 additions and 22 deletions

View file

@ -0,0 +1,190 @@
package secutools
import (
"context"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"
)
// TestSubmitJob_HappyPath verifies that SubmitJob issues the request the
// secutools contract expects (POST /api/v1/jobs, X-API-Key header, JSON
// body) and that the JSON reply is decoded into the returned value.
func TestSubmitJob_HappyPath(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/api/v1/jobs" {
			t.Errorf("unexpected path %q", r.URL.Path)
		}
		if r.Method != http.MethodPost {
			t.Errorf("unexpected method %q", r.Method)
		}
		if r.Header.Get("X-API-Key") != "key123" {
			t.Errorf("missing/incorrect X-API-Key: %q", r.Header.Get("X-API-Key"))
		}
		var got JobRequest
		if err := json.NewDecoder(r.Body).Decode(&got); err != nil {
			// This handler runs on the server's goroutine, where t.Fatalf
			// (FailNow) must not be called. Record the failure and answer
			// with an error status so the client call below fails on the
			// test goroutine instead.
			t.Errorf("decode: %v", err)
			http.Error(w, "bad request body", http.StatusBadRequest)
			return
		}
		if got.Type != TypeAnalyze || got.PreferredAI != "gpu" {
			t.Errorf("payload mismatch: %+v", got)
		}
		w.Header().Set("Content-Type", "application/json")
		// Write error deliberately ignored: a broken connection shows up as
		// a SubmitJob error in the test body.
		_, _ = w.Write([]byte(`{"job_id":"abc","status":"pending"}`))
	}))
	defer srv.Close()

	c := NewHTTPClient(srv.URL, "key123", srv.Client())
	resp, err := c.SubmitJob(context.Background(), &JobRequest{
		Type:        TypeAnalyze,
		Priority:    PriorityHigh,
		Prompt:      "hi",
		PreferredAI: "gpu",
	})
	if err != nil {
		t.Fatalf("SubmitJob: %v", err)
	}
	if resp.JobID != "abc" || resp.Status != "pending" {
		t.Errorf("unexpected response: %+v", resp)
	}
}
// TestSubmitJob_HTTPError checks that a non-2xx reply (here HTTP 500 with a
// plain-text body) makes SubmitJob return a non-nil error.
func TestSubmitJob_HTTPError(t *testing.T) {
	alwaysFail := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
		_, _ = w.Write([]byte("boom"))
	}
	srv := httptest.NewServer(http.HandlerFunc(alwaysFail))
	defer srv.Close()

	client := NewHTTPClient(srv.URL, "k", srv.Client())
	req := &JobRequest{Type: TypeAnalyze, Prompt: "p"}
	_, err := client.SubmitJob(context.Background(), req)
	if err == nil {
		t.Fatal("expected error on HTTP 500, got nil")
	}
}
// TestWaitForResult_PollsUntilCompleted verifies that WaitForResult keeps
// polling the job status while it is pending, stops once the status becomes
// completed, and then returns the payload served by the result endpoint.
func TestWaitForResult_PollsUntilCompleted(t *testing.T) {
	var calls atomic.Int64
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		switch r.URL.Path {
		case "/api/v1/jobs/job1":
			// The first status request reports pending; every later one
			// reports completed.
			n := calls.Add(1)
			status := "pending"
			if n >= 2 {
				status = "completed"
			}
			_, _ = w.Write([]byte(`{"job_id":"job1","status":"` + status + `","provider":"gpu"}`))
		case "/api/v1/jobs/job1/result":
			_, _ = w.Write([]byte(`{"job_id":"job1","response":"done","provider":"gpu","cost_cad":0.005}`))
		default:
			t.Errorf("unexpected path %q", r.URL.Path)
		}
	}))
	defer srv.Close()

	c := NewHTTPClient(srv.URL, "k", srv.Client())
	// The 30s timeout is only an upper bound. The call presumably returns as
	// soon as the second status poll reports completion.
	// NOTE(review): confirm the client's poll interval keeps this test fast.
	res, err := c.WaitForResult(context.Background(), "job1", 30*time.Second)
	if err != nil {
		t.Fatalf("WaitForResult: %v", err)
	}
	if res.Response != "done" || res.Provider != "gpu" {
		t.Errorf("unexpected result: %+v", res)
	}
	// Fewer than two status requests would mean the pending answer was never
	// re-polled, i.e. the result was fetched without waiting for completion.
	if n := calls.Load(); n < 2 {
		t.Errorf("expected at least 2 status polls, got %d", n)
	}
}
// TestWaitForResult_FailedJob checks that WaitForResult returns an error
// matching ErrJobFailed when the status endpoint reports status "failed".
func TestWaitForResult_FailedJob(t *testing.T) {
	failedStatus := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"job_id":"jobX","status":"failed","error":"oom"}`))
	}
	srv := httptest.NewServer(http.HandlerFunc(failedStatus))
	defer srv.Close()

	client := NewHTTPClient(srv.URL, "k", srv.Client())
	_, err := client.WaitForResult(context.Background(), "jobX", 5*time.Second)
	if errors.Is(err, ErrJobFailed) {
		return
	}
	t.Errorf("expected ErrJobFailed, got %v", err)
}
// TestSubmitJob_RetriesOn5xx checks that SubmitJob retries after HTTP 500
// replies: the server fails the first two requests and succeeds on the
// third, and the test expects exactly three requests in total. The retry
// policy is set to (3, 1ms) so the test does not wait long between attempts.
func TestSubmitJob_RetriesOn5xx(t *testing.T) {
	var calls atomic.Int64
	flaky := func(w http.ResponseWriter, _ *http.Request) {
		// Requests 1 and 2 fail; request 3 onwards succeeds.
		if attempt := calls.Add(1); attempt < 3 {
			w.WriteHeader(http.StatusInternalServerError)
			_, _ = w.Write([]byte("transient"))
			return
		}
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"job_id":"ok","status":"pending"}`))
	}
	srv := httptest.NewServer(http.HandlerFunc(flaky))
	defer srv.Close()

	client := NewHTTPClient(srv.URL, "k", srv.Client())
	client.SetRetryPolicy(3, 1*time.Millisecond)

	req := &JobRequest{Type: TypeAnalyze, Prompt: "p"}
	resp, err := client.SubmitJob(context.Background(), req)
	if err != nil {
		t.Fatalf("expected success after retries, got %v (calls=%d)", err, calls.Load())
	}
	if resp.JobID != "ok" {
		t.Errorf("unexpected response: %+v", resp)
	}
	if total := calls.Load(); total != 3 {
		t.Errorf("expected 3 attempts, got %d", total)
	}
}
// TestSubmitJob_DoesNotRetry4xx checks that a client error (HTTP 401, e.g. a
// wrong API key) makes SubmitJob fail after a single request, even though a
// retry policy is configured.
func TestSubmitJob_DoesNotRetry4xx(t *testing.T) {
	var calls atomic.Int64
	unauthorized := func(w http.ResponseWriter, _ *http.Request) {
		calls.Add(1)
		w.WriteHeader(http.StatusUnauthorized)
		_, _ = w.Write([]byte("bad key"))
	}
	srv := httptest.NewServer(http.HandlerFunc(unauthorized))
	defer srv.Close()

	client := NewHTTPClient(srv.URL, "k", srv.Client())
	client.SetRetryPolicy(3, 1*time.Millisecond)

	req := &JobRequest{Type: TypeAnalyze, Prompt: "p"}
	if _, err := client.SubmitJob(context.Background(), req); err == nil {
		t.Fatal("expected error on 401")
	}
	if total := calls.Load(); total != 1 {
		t.Errorf("4xx must not retry, got %d calls", total)
	}
}
// TestWaitForResult_ContextCancel checks that WaitForResult returns an error
// when its context has already been cancelled before the call, even though
// the server would keep answering "pending" and the timeout is 10s.
func TestWaitForResult_ContextCancel(t *testing.T) {
	alwaysPending := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"job_id":"j","status":"pending"}`))
	}
	srv := httptest.NewServer(http.HandlerFunc(alwaysPending))
	defer srv.Close()

	// Cancel before the call so the context is done from the start.
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	client := NewHTTPClient(srv.URL, "k", srv.Client())
	if _, err := client.WaitForResult(ctx, "j", 10*time.Second); err == nil {
		t.Fatal("expected error from cancelled context")
	}
}