claude-failover/internal/secutools/client.go

291 lines
8.7 KiB
Go
Raw Normal View History

feat(phase2-E): multi-provider routing via secutools delegation Adds optional delegation of agent-queue tasks to the SecuAAS secutools AI platform (GPU / Gemini / Claude API) instead of dispatching to a local Claude Code tmux session. Per-task opt-in via YAML frontmatter fields preferred_ai, allow_delegation, complexity_hint — absence keeps the Phase 1 behaviour exactly (zero breaking change). Go side: - internal/secutools: HTTP client with exponential-backoff retries (SubmitJob/GetJob/WaitForResult), DecideProvider map adapter for CLI use, table tests. - internal/router: struct-typed Decide() with strict precedence (needs_claude_code > preferred_ai=claude-code > allow_delegation=false > preferred_ai > fail-safe local on unknown). - internal/delegation: Manager submits jobs, writes .md.delegated markers for on-restart recovery, runs a periodic reaper that moves completed jobs into done/ with provider/cost footer and failed jobs into failed/. - internal/dispatcher: WithDelegation() opt-in, routeTask hook before findFreeSession, skips .md.delegated in assignNextTask. - internal/api: /api/delegated/status (active jobs + counters), /watchdog/status extended with delegation counters. - cmd/ccl-delegate: small CLI exposing submit/get/result/decide so the bash dispatcher can call the same contract without duplicating logic. - cmd/claude-failover: delegation wired opt-in via SECUTOOLS_API_KEY. Tests: - 29+ new unit tests across router, secutools, delegation, dispatcher, api packages. go test -race -count=1 clean. - tests/phase2-E-integration.sh: bash end-to-end against a Python stdlib mock HTTP server, exercising the dev-management scripts. Forward-compat with watchdog (Phase 1 B1 already ignores state=delegated_to_secutools) so delegated tasks aren't flagged stale.
2026-04-17 02:17:19 +00:00
// Package secutools provides a minimal HTTP client for the centralized SecuAAS
// AI-batch platform (https://api.secutools.secuaas.ovh).
//
// Phase 2 — Chantier E: the dispatcher delegates non-Claude-Code-eligible
// tasks to secutools (GPU/Gemini/Claude API) instead of dispatching them to
// a local ccl-auto tmux session. This package is the Go side of that
// delegation: SubmitJob, GetJob, WaitForResult.
//
// The Client interface is intentionally narrow so tests can plug a fake
// implementation without any network dependency.
package secutools
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"time"
)
// Client is the narrow seam between the daemon and the secutools platform.
// Production code wires in HTTPClient; tests can supply any fake that
// satisfies these three methods.
type Client interface {
	// SubmitJob hands req to the platform and returns its immediate reply.
	SubmitJob(ctx context.Context, req *JobRequest) (*JobResponse, error)

	// GetJob reports the current status of the job identified by id.
	GetJob(ctx context.Context, id string) (*JobStatus, error)

	// WaitForResult blocks until the job identified by id reaches a terminal
	// state or timeout elapses, and returns the result of a completed job.
	WaitForResult(ctx context.Context, id string, timeout time.Duration) (*JobResult, error)
}
// JobType names a kind of work secutools can run. The values mirror the
// platform's job-type enum and travel as-is in JobRequest.Type.
type JobType string

// Job types, spelled exactly as they appear on the wire.
const (
	TypeAnalyze   JobType = "ai:analyze"
	TypeBatch     JobType = "ai:batch"
	TypeReport    JobType = "ai:report"
	TypeCorrelate JobType = "ai:correlate"
)
// Priority is the scheduling priority attached to a job. The values mirror
// the platform's priority enum and travel as-is in JobRequest.Priority.
type Priority string

// Priorities, spelled exactly as they appear on the wire.
const (
	PriorityCritical Priority = "critical"
	PriorityHigh     Priority = "high"
	PriorityDefault  Priority = "default"
	PriorityLow      Priority = "low"
)
// JobRequest is the JSON body sent with POST /api/v1/jobs.
//
// Type and Prompt are always serialized. Priority, Data, MaxTokens,
// PreferredAI and Source are left out of the payload when they hold their
// zero value.
type JobRequest struct {
	Type        JobType        `json:"type"`
	Priority    Priority       `json:"priority,omitempty"`
	Prompt      string         `json:"prompt"`
	Data        map[string]any `json:"data,omitempty"`
	MaxTokens   int            `json:"max_tokens,omitempty"`
	PreferredAI string         `json:"preferred_ai,omitempty"`
	Source      string         `json:"source,omitempty"`
}
// JobResponse is what POST /api/v1/jobs answers with right away: the ID
// assigned to the new job and the status it was created in.
type JobResponse struct {
	JobID  string `json:"job_id"`
	Status string `json:"status"`
}
// JobStatus is the body returned by GET /api/v1/jobs/:id. Provider and Error
// are optional and omitted from the JSON form when empty.
type JobStatus struct {
	JobID    string `json:"job_id"`
	Status   string `json:"status"` // pending | running | completed | failed | cancelled
	Provider string `json:"provider,omitempty"`
	Error    string `json:"error,omitempty"`
}
// JobResult is the body returned by GET /api/v1/jobs/:id/result: the model
// output together with which provider and model produced it and what it
// cost. Tokens is omitted from the JSON form when zero.
//
// NOTE(review): CostCAD is presumably denominated in Canadian dollars —
// confirm against the platform's API documentation.
type JobResult struct {
	JobID    string  `json:"job_id"`
	Response string  `json:"response"`
	Provider string  `json:"provider"`
	Model    string  `json:"model"`
	CostCAD  float64 `json:"cost_cad"`
	Tokens   int     `json:"tokens,omitempty"`
}
// HTTPClient is the Client implementation used in production; it reaches
// secutools over HTTP. Build one with NewHTTPClient.
type HTTPClient struct {
	baseURL    string        // prefix prepended to every request path
	apiKey     string        // sent in the X-API-Key request header
	hc         *http.Client  // performs the actual HTTP round trips
	maxRetries int           // retries allowed after the first attempt
	baseDelay  time.Duration // wait before the first retry; doubles afterwards
}
// NewHTTPClient returns an HTTPClient ready to talk to secutools.
//
// baseURL is the prefix every request path is appended to. Trailing slashes
// are stripped, so "https://host/" and "https://host" address the same
// endpoints. apiKey is sent in the X-API-Key header of every request.
// If hc is nil, a default http.Client with a 30s timeout is used.
//
// The client performs up to 3 retries on transport errors and 5xx
// responses, with exponential backoff starting at 500ms (500ms, 1s, 2s).
// 4xx responses are returned as errors without retrying.
func NewHTTPClient(baseURL, apiKey string, hc *http.Client) *HTTPClient {
	// Request paths all start with "/api/v1/...". Keeping a trailing slash
	// here would produce "//api/v1/...", which servers may redirect (turning
	// a POST into a GET) or reject outright.
	for len(baseURL) > 0 && baseURL[len(baseURL)-1] == '/' {
		baseURL = baseURL[:len(baseURL)-1]
	}
	if hc == nil {
		hc = &http.Client{Timeout: 30 * time.Second}
	}
	return &HTTPClient{
		baseURL:    baseURL,
		apiKey:     apiKey,
		hc:         hc,
		maxRetries: 3,
		baseDelay:  500 * time.Millisecond,
	}
}
// SetRetryPolicy overrides the default retry policy. Useful for tests.
//
// maxRetries is the number of retries allowed after the first attempt and
// baseDelay the wait before the first retry. Negative values are treated as
// zero: a negative retry count would otherwise stop doWithRetry from sending
// any request at all.
func (c *HTTPClient) SetRetryPolicy(maxRetries int, baseDelay time.Duration) {
	if maxRetries < 0 {
		maxRetries = 0
	}
	if baseDelay < 0 {
		baseDelay = 0
	}
	c.maxRetries = maxRetries
	c.baseDelay = baseDelay
}
// doWithRetry sends the request produced by build and retries on transport
// errors or 5xx responses using exponential backoff. Any other response
// (success or 4xx) is returned to the caller immediately, body unread.
//
// build is invoked once per attempt so that each attempt gets a fresh
// request (and a fresh body reader). An error from build aborts at once.
//
// The first attempt is always made, even if the configured retry count is
// negative. Once ctx is done, doWithRetry stops retrying and returns
// ctx.Err(). When all attempts are used up, the last failure is returned
// wrapped in an "after N attempts" error; for a 5xx this includes at most
// the first 4 KiB of the response body.
func (c *HTTPClient) doWithRetry(ctx context.Context, build func() (*http.Request, error)) (*http.Response, error) {
	// maxErrBody caps how much of a 5xx body is copied into the error text.
	const maxErrBody = 4 << 10

	retries := c.maxRetries
	if retries < 0 {
		// A negative policy would skip the loop entirely and send nothing.
		retries = 0
	}

	var lastErr error
	delay := c.baseDelay
	for attempt := 0; attempt <= retries; attempt++ {
		if attempt > 0 {
			// An explicit timer (rather than time.After) can be released as
			// soon as the context is cancelled.
			timer := time.NewTimer(delay)
			select {
			case <-ctx.Done():
				timer.Stop()
				return nil, ctx.Err()
			case <-timer.C:
			}
			delay *= 2
		}

		req, err := build()
		if err != nil {
			return nil, err
		}
		resp, err := c.hc.Do(req)
		if err != nil {
			// A cancelled or expired context is not a transport failure:
			// report it as such instead of retrying or blaming the attempts.
			if ctxErr := ctx.Err(); ctxErr != nil {
				return nil, ctxErr
			}
			lastErr = err
			continue
		}

		// Success and 4xx go straight back to the caller; only 5xx retries.
		if resp.StatusCode < 500 || resp.StatusCode > 599 {
			return resp, nil
		}
		// Best effort: the body only decorates the error message.
		raw, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrBody))
		_ = resp.Body.Close()
		lastErr = fmt.Errorf("HTTP %d: %s", resp.StatusCode, string(raw))
	}
	return nil, fmt.Errorf("after %d attempts: %w", retries+1, lastErr)
}
// SubmitJob POSTs req as JSON to /api/v1/jobs and decodes the platform's
// immediate reply.
//
// A nil req is rejected before anything is sent. Transport errors and 5xx
// responses are retried by doWithRetry; any other non-2xx status is returned
// as an error carrying the status code and at most the first 4 KiB of the
// response body.
//
// NOTE(review): because the POST is retried, a 5xx emitted after the job was
// already enqueued could create a duplicate job — confirm how the platform
// handles repeated submissions.
func (c *HTTPClient) SubmitJob(ctx context.Context, req *JobRequest) (*JobResponse, error) {
	if req == nil {
		// json.Marshal would happily encode a nil pointer as "null".
		return nil, errors.New("submit job: nil request")
	}
	payload, err := json.Marshal(req)
	if err != nil {
		return nil, fmt.Errorf("marshal request: %w", err)
	}

	resp, err := c.doWithRetry(ctx, func() (*http.Request, error) {
		// A new reader per attempt: a previous attempt may have consumed it.
		httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost,
			c.baseURL+"/api/v1/jobs", bytes.NewReader(payload))
		if err != nil {
			return nil, err
		}
		httpReq.Header.Set("Content-Type", "application/json")
		httpReq.Header.Set("X-API-Key", c.apiKey)
		return httpReq, nil
	})
	if err != nil {
		return nil, fmt.Errorf("submit job: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		// Best effort: the body only decorates the error message, so cap it.
		raw, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<10))
		return nil, fmt.Errorf("submit job: HTTP %d: %s", resp.StatusCode, string(raw))
	}

	var out JobResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return nil, fmt.Errorf("decode submit response: %w", err)
	}
	return &out, nil
}
// GetJob fetches the current status of job id from GET /api/v1/jobs/:id.
//
// An empty id is rejected before any request is made: it would otherwise
// address the /api/v1/jobs/ collection rather than a single job. Transport
// errors and 5xx responses are retried by doWithRetry; any other non-2xx
// status is returned as an error carrying the status code and at most the
// first 4 KiB of the response body.
//
// id is appended to the URL path verbatim, so callers should pass the job ID
// exactly as issued by the platform.
func (c *HTTPClient) GetJob(ctx context.Context, id string) (*JobStatus, error) {
	if id == "" {
		return nil, errors.New("get job: empty job id")
	}

	resp, err := c.doWithRetry(ctx, func() (*http.Request, error) {
		httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet,
			c.baseURL+"/api/v1/jobs/"+id, nil)
		if err != nil {
			return nil, err
		}
		httpReq.Header.Set("X-API-Key", c.apiKey)
		return httpReq, nil
	})
	if err != nil {
		return nil, fmt.Errorf("get job: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		// Best effort: the body only decorates the error message, so cap it.
		raw, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<10))
		return nil, fmt.Errorf("get job: HTTP %d: %s", resp.StatusCode, string(raw))
	}

	var out JobStatus
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return nil, fmt.Errorf("decode get response: %w", err)
	}
	return &out, nil
}
// getResult fetches the final payload of job id from
// GET /api/v1/jobs/:id/result.
//
// An empty id is rejected before any request is made, since it cannot name a
// job. Transport errors and 5xx responses are retried by doWithRetry; any
// other non-2xx status is returned as an error carrying the status code and
// at most the first 4 KiB of the response body.
//
// id is appended to the URL path verbatim, so callers should pass the job ID
// exactly as issued by the platform.
func (c *HTTPClient) getResult(ctx context.Context, id string) (*JobResult, error) {
	if id == "" {
		return nil, errors.New("get result: empty job id")
	}

	resp, err := c.doWithRetry(ctx, func() (*http.Request, error) {
		httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet,
			c.baseURL+"/api/v1/jobs/"+id+"/result", nil)
		if err != nil {
			return nil, err
		}
		httpReq.Header.Set("X-API-Key", c.apiKey)
		return httpReq, nil
	})
	if err != nil {
		return nil, fmt.Errorf("get result: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode/100 != 2 {
		// Best effort: the body only decorates the error message, so cap it.
		raw, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<10))
		return nil, fmt.Errorf("get result: HTTP %d: %s", resp.StatusCode, string(raw))
	}

	var out JobResult
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return nil, fmt.Errorf("decode result: %w", err)
	}
	return &out, nil
}
// Sentinel errors reported by WaitForResult; match them with errors.Is.
var (
	// ErrJobFailed means secutools reported the job as terminally failed,
	// so no result will ever be produced for it.
	ErrJobFailed = errors.New("secutools: job failed")

	// ErrTimeout means the polling deadline elapsed before the job reached
	// a terminal state.
	ErrTimeout = errors.New("secutools: wait timeout")
)
// WaitForResult polls /api/v1/jobs/:id every 2s until the job reaches a
// terminal state (completed/failed/cancelled) or timeout elapses.
//
// Outcomes:
//   - completed: the result is fetched with getResult and returned.
//   - failed or cancelled: an error wrapping ErrJobFailed is returned.
//   - timeout elapsed first: ErrTimeout is returned. The wait between polls
//     is cut short at the deadline, so the call does not sleep a full poll
//     interval past timeout. A timeout <= 0 still performs one poll.
//   - ctx done while waiting: ctx.Err() is returned.
//   - GetJob fails: its error is returned as-is and polling stops.
//
// Polling cadence is intentionally fixed (not configurable) to keep the
// reaper goroutine simple. If callers need a different cadence they can
// implement it themselves on top of GetJob/getResult.
func (c *HTTPClient) WaitForResult(ctx context.Context, id string, timeout time.Duration) (*JobResult, error) {
	const pollInterval = 2 * time.Second

	deadline := time.Now().Add(timeout)
	// expiry fires at the deadline so the select below wakes up on time
	// instead of waiting for the next tick.
	expiry := time.NewTimer(timeout)
	defer expiry.Stop()
	ticker := time.NewTicker(pollInterval)
	defer ticker.Stop()

	for {
		st, err := c.GetJob(ctx, id)
		if err != nil {
			return nil, err
		}
		switch st.Status {
		case "completed":
			return c.getResult(ctx, id)
		case "failed", "cancelled":
			return nil, fmt.Errorf("%w: status=%s err=%s", ErrJobFailed, st.Status, st.Error)
		}

		// The poll itself may have taken us past the deadline.
		if time.Now().After(deadline) {
			return nil, ErrTimeout
		}
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-expiry.C:
			return nil, ErrTimeout
		case <-ticker.C:
		}
	}
}