feat(phase2-E): multi-provider routing via secutools delegation
Adds optional delegation of agent-queue tasks to the SecuAAS secutools AI platform (GPU / Gemini / Claude API) instead of dispatching to a local Claude Code tmux session. Per-task opt-in via YAML frontmatter fields preferred_ai, allow_delegation, complexity_hint — absence keeps the Phase 1 behaviour exactly (zero breaking change).

Go side:
- internal/secutools: HTTP client with exponential-backoff retries (SubmitJob/GetJob/WaitForResult), DecideProvider map adapter for CLI use, table tests.
- internal/router: struct-typed Decide() with strict precedence (needs_claude_code > preferred_ai=claude-code > allow_delegation=false > preferred_ai > fail-safe local on unknown).
- internal/delegation: Manager submits jobs, writes .md.delegated markers for on-restart recovery, runs a periodic reaper that moves completed jobs into done/ with provider/cost footer and failed jobs into failed/.
- internal/dispatcher: WithDelegation() opt-in, routeTask hook before findFreeSession, skips .md.delegated in assignNextTask.
- internal/api: /api/delegated/status (active jobs + counters), /watchdog/status extended with delegation counters.
- cmd/ccl-delegate: small CLI exposing submit/get/result/decide so the bash dispatcher can call the same contract without duplicating logic.
- cmd/claude-failover: delegation wired opt-in via SECUTOOLS_API_KEY.

Tests:
- 29+ new unit tests across router, secutools, delegation, dispatcher, api packages. go test -race -count=1 clean.
- tests/phase2-E-integration.sh: bash end-to-end against a Python stdlib mock HTTP server, exercising the dev-management scripts.

Forward-compat with watchdog (Phase 1 B1 already ignores state=delegated_to_secutools) so delegated tasks aren't flagged stale.
This commit is contained in:
parent
47ab86eef9
commit
3e20085204
18 changed files with 2819 additions and 22 deletions
290
internal/secutools/client.go
Normal file
290
internal/secutools/client.go
Normal file
|
|
@ -0,0 +1,290 @@
|
|||
// Package secutools provides a minimal HTTP client for the centralized SecuAAS
|
||||
// AI-batch platform (https://api.secutools.secuaas.ovh).
|
||||
//
|
||||
// Phase 2 — Chantier E: the dispatcher delegates non-Claude-Code-eligible
|
||||
// tasks to secutools (GPU/Gemini/Claude API) instead of dispatching them to
|
||||
// a local ccl-auto tmux session. This package is the Go side of that
|
||||
// delegation: SubmitJob, GetJob, WaitForResult.
|
||||
//
|
||||
// The Client interface is intentionally narrow so tests can plug a fake
|
||||
// implementation without any network dependency.
|
||||
package secutools
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Client is the abstraction the rest of the daemon uses to talk to secutools.
|
||||
// Real callers use HTTPClient; tests substitute a mock.
|
||||
type Client interface {
|
||||
SubmitJob(ctx context.Context, req *JobRequest) (*JobResponse, error)
|
||||
GetJob(ctx context.Context, id string) (*JobStatus, error)
|
||||
WaitForResult(ctx context.Context, id string, timeout time.Duration) (*JobResult, error)
|
||||
}
|
||||
|
||||
// JobType names a kind of job; its values mirror the secutools job-type enum.
type JobType string

// Job types that can be placed in JobRequest.Type.
const (
	TypeAnalyze   = JobType("ai:analyze")
	TypeBatch     = JobType("ai:batch")
	TypeReport    = JobType("ai:report")
	TypeCorrelate = JobType("ai:correlate")
)
|
||||
|
||||
// Priority names a scheduling priority; its values mirror the secutools
// priority enum.
type Priority string

// Priorities that can be placed in JobRequest.Priority.
const (
	PriorityCritical = Priority("critical")
	PriorityHigh     = Priority("high")
	PriorityDefault  = Priority("default")
	PriorityLow      = Priority("low")
)
|
||||
|
||||
// JobRequest is the body of POST /api/v1/jobs.
|
||||
type JobRequest struct {
|
||||
Type JobType `json:"type"`
|
||||
Priority Priority `json:"priority,omitempty"`
|
||||
Prompt string `json:"prompt"`
|
||||
Data map[string]any `json:"data,omitempty"`
|
||||
MaxTokens int `json:"max_tokens,omitempty"`
|
||||
PreferredAI string `json:"preferred_ai,omitempty"`
|
||||
Source string `json:"source,omitempty"`
|
||||
}
|
||||
|
||||
// JobResponse is what POST /api/v1/jobs answers with right after a
// submission.
type JobResponse struct {
	// JobID is the identifier to pass to GetJob and WaitForResult.
	JobID string `json:"job_id"`
	// Status is the job status as reported at submission time.
	Status string `json:"status"`
}
|
||||
|
||||
// JobStatus is the body returned by GET /api/v1/jobs/:id.
type JobStatus struct {
	// JobID identifies the job the status belongs to.
	JobID string `json:"job_id"`
	// Status is one of: pending | running | completed | failed | cancelled.
	Status string `json:"status"`
	// Provider is optional and omitted from the JSON when empty.
	Provider string `json:"provider,omitempty"`
	// Error is optional and omitted from the JSON when empty.
	Error string `json:"error,omitempty"`
}
|
||||
|
||||
// JobResult is the body returned by GET /api/v1/jobs/:id/result.
type JobResult struct {
	// JobID identifies the job the result belongs to.
	JobID string `json:"job_id"`
	// Response is the text produced by the job.
	Response string `json:"response"`
	// Provider is decoded from the "provider" key.
	Provider string `json:"provider"`
	// Model is decoded from the "model" key.
	Model string `json:"model"`
	// CostCAD is decoded from "cost_cad".
	// NOTE(review): presumably the job cost in Canadian dollars — confirm.
	CostCAD float64 `json:"cost_cad"`
	// Tokens is optional and omitted from the JSON when zero.
	Tokens int `json:"tokens,omitempty"`
}
|
||||
|
||||
// HTTPClient is the production Client implementation; it talks to secutools
// over HTTP. Build one with NewHTTPClient.
type HTTPClient struct {
	// Where requests go and how they authenticate.
	baseURL string // prefix for every endpoint path
	apiKey  string // sent in the X-API-Key header

	// Underlying transport.
	hc *http.Client

	// Retry policy applied by doWithRetry.
	maxRetries int           // retries after the first attempt
	baseDelay  time.Duration // wait before the first retry, doubled afterwards
}
|
||||
|
||||
// NewHTTPClient returns an HTTPClient ready to talk to secutools.
|
||||
// If hc is nil, a default http.Client with a 30s timeout is used.
|
||||
//
|
||||
// The client performs up to 3 retries on transport errors and 5xx
|
||||
// responses, with exponential backoff starting at 500ms (500ms, 1s, 2s).
|
||||
// 4xx responses are returned as errors without retrying.
|
||||
func NewHTTPClient(baseURL, apiKey string, hc *http.Client) *HTTPClient {
|
||||
if hc == nil {
|
||||
hc = &http.Client{Timeout: 30 * time.Second}
|
||||
}
|
||||
return &HTTPClient{
|
||||
baseURL: baseURL,
|
||||
apiKey: apiKey,
|
||||
hc: hc,
|
||||
maxRetries: 3,
|
||||
baseDelay: 500 * time.Millisecond,
|
||||
}
|
||||
}
|
||||
|
||||
// SetRetryPolicy overrides the default retry policy. Useful for tests.
|
||||
func (c *HTTPClient) SetRetryPolicy(maxRetries int, baseDelay time.Duration) {
|
||||
c.maxRetries = maxRetries
|
||||
c.baseDelay = baseDelay
|
||||
}
|
||||
|
||||
// doWithRetry sends req and retries on transport errors or 5xx responses
|
||||
// using exponential backoff. 4xx is returned without retry. Respects ctx.
|
||||
func (c *HTTPClient) doWithRetry(ctx context.Context, build func() (*http.Request, error)) (*http.Response, error) {
|
||||
var lastErr error
|
||||
delay := c.baseDelay
|
||||
for attempt := 0; attempt <= c.maxRetries; attempt++ {
|
||||
if attempt > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
case <-time.After(delay):
|
||||
}
|
||||
delay *= 2
|
||||
}
|
||||
req, err := build()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resp, err := c.hc.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
// Retry 5xx; return success or 4xx immediately.
|
||||
if resp.StatusCode >= 500 && resp.StatusCode <= 599 {
|
||||
raw, _ := io.ReadAll(resp.Body)
|
||||
_ = resp.Body.Close()
|
||||
lastErr = fmt.Errorf("HTTP %d: %s", resp.StatusCode, string(raw))
|
||||
continue
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
if lastErr == nil {
|
||||
lastErr = errors.New("secutools: unknown transport failure")
|
||||
}
|
||||
return nil, fmt.Errorf("after %d attempts: %w", c.maxRetries+1, lastErr)
|
||||
}
|
||||
|
||||
// SubmitJob POSTs req to /api/v1/jobs with retry on 5xx.
|
||||
func (c *HTTPClient) SubmitJob(ctx context.Context, req *JobRequest) (*JobResponse, error) {
|
||||
body, err := json.Marshal(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal request: %w", err)
|
||||
}
|
||||
resp, err := c.doWithRetry(ctx, func() (*http.Request, error) {
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost,
|
||||
c.baseURL+"/api/v1/jobs", bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
httpReq.Header.Set("X-API-Key", c.apiKey)
|
||||
return httpReq, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("submit job: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode/100 != 2 {
|
||||
raw, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("submit job: HTTP %d: %s", resp.StatusCode, string(raw))
|
||||
}
|
||||
|
||||
var out JobResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return nil, fmt.Errorf("decode submit response: %w", err)
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
// GetJob GETs /api/v1/jobs/:id with retry on 5xx.
|
||||
func (c *HTTPClient) GetJob(ctx context.Context, id string) (*JobStatus, error) {
|
||||
resp, err := c.doWithRetry(ctx, func() (*http.Request, error) {
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet,
|
||||
c.baseURL+"/api/v1/jobs/"+id, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
httpReq.Header.Set("X-API-Key", c.apiKey)
|
||||
return httpReq, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get job: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode/100 != 2 {
|
||||
raw, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("get job: HTTP %d: %s", resp.StatusCode, string(raw))
|
||||
}
|
||||
|
||||
var out JobStatus
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return nil, fmt.Errorf("decode get response: %w", err)
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
// getResult fetches the final payload of a completed job with retry on 5xx.
|
||||
func (c *HTTPClient) getResult(ctx context.Context, id string) (*JobResult, error) {
|
||||
resp, err := c.doWithRetry(ctx, func() (*http.Request, error) {
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet,
|
||||
c.baseURL+"/api/v1/jobs/"+id+"/result", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
httpReq.Header.Set("X-API-Key", c.apiKey)
|
||||
return httpReq, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get result: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode/100 != 2 {
|
||||
raw, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("get result: HTTP %d: %s", resp.StatusCode, string(raw))
|
||||
}
|
||||
|
||||
var out JobResult
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return nil, fmt.Errorf("decode result: %w", err)
|
||||
}
|
||||
return &out, nil
|
||||
}
|
||||
|
||||
// Sentinel errors returned by WaitForResult. ErrJobFailed is returned
// wrapped with extra detail, so match both with errors.Is.
var (
	// ErrJobFailed means secutools reported the job as failed or cancelled;
	// no result will ever be produced for it.
	ErrJobFailed = errors.New("secutools: job failed")

	// ErrTimeout means the polling deadline elapsed before the job reached
	// a terminal state.
	ErrTimeout = errors.New("secutools: wait timeout")
)
|
||||
|
||||
// WaitForResult polls /api/v1/jobs/:id every 2s until the job reaches a
|
||||
// terminal state (completed/failed/cancelled) or timeout elapses.
|
||||
// On completed, fetches and returns the result.
|
||||
//
|
||||
// Polling cadence is intentionally fixed (not configurable) to keep the
|
||||
// reaper goroutine simple. If callers need a different cadence they can
|
||||
// implement it themselves on top of GetJob/getResult.
|
||||
func (c *HTTPClient) WaitForResult(ctx context.Context, id string, timeout time.Duration) (*JobResult, error) {
|
||||
deadline := time.Now().Add(timeout)
|
||||
ticker := time.NewTicker(2 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
st, err := c.GetJob(ctx, id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
switch st.Status {
|
||||
case "completed":
|
||||
return c.getResult(ctx, id)
|
||||
case "failed", "cancelled":
|
||||
return nil, fmt.Errorf("%w: status=%s err=%s", ErrJobFailed, st.Status, st.Error)
|
||||
}
|
||||
|
||||
if time.Now().After(deadline) {
|
||||
return nil, ErrTimeout
|
||||
}
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue