claude-failover/internal/quota/monitor_test.go
Ubuntu 62e98cb9e7 fix(quota): veto 5xx errors + tighten patterns to stop false-positive swaps
v0.2.2's 2-poll confirmation was insufficient because Anthropic 500/503
errors are printed into Claude Code's conversation transcript and stay
visible in every tmux capture until the user scrolls. A persistent
server error would confirm on the second poll and still trigger a swap.

Root cause: the pattern "rate limit" (bare substring) matched any 500
payload that happened to mention rate limits in its error text. Real
HTTP 429s from Anthropic are typed as "rate_limit_error" in the error
payload — and that's the signature we should actually key on.

- Remove "rate limit" from quotaPatterns (too generic — matches transcripts).
- Add "rate_limit_error" (Anthropic's typed 429 error) and "5-hour limit".
- Add serverErrorPatterns veto: "api_error", "overloaded_error",
  "internal server error", "api error: 5". When any is present in the
  pane, isQuotaExhausted returns false even if a quota pattern matched.
- 4 new subtests covering the veto paths + sanity that real 429s pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 19:26:00 +00:00

217 lines
6.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package quota
import (
"testing"
"forge.secuaas.ovh/olivier/claude-failover/internal/config"
"forge.secuaas.ovh/olivier/claude-failover/internal/state"
)
// mockTmux is an in-memory tmux stand-in used by the quota tests. Session
// existence and captured pane text are served straight from its two maps.
type mockTmux struct {
	// sessions records which session names are reported as existing.
	sessions map[string]bool
	// paneOutput holds the text returned by CapturePaneTail, keyed by session.
	paneOutput map[string]string
}

// newMockTmux returns a mockTmux whose maps are allocated and empty.
func newMockTmux() *mockTmux {
	mock := new(mockTmux)
	mock.sessions = map[string]bool{}
	mock.paneOutput = map[string]string{}
	return mock
}

// HasSession reports whether name is marked present in the sessions map.
func (m *mockTmux) HasSession(name string) bool {
	present := m.sessions[name]
	return present
}

// CreateSession marks name as present; it never returns an error.
func (m *mockTmux) CreateSession(name, _ string) error {
	m.sessions[name] = true
	return nil
}

// KillSession does nothing and always succeeds; the sessions map is left
// untouched.
func (m *mockTmux) KillSession(_ string) error {
	return nil
}

// SendKeys does nothing and always succeeds.
func (m *mockTmux) SendKeys(_, _ string) error {
	return nil
}

// CapturePaneTail returns the canned pane text stored for session and ignores
// the second argument. A session with no stored text yields "".
func (m *mockTmux) CapturePaneTail(session string, _ int) (string, error) {
	text := m.paneOutput[session]
	return text, nil
}
// TestIsQuotaExhausted verifies pattern matching on pane output, including
// the server-error veto that prevents 5xx from being mistaken for quota.
//
// Every table entry runs as its own subtest named after the case, so a
// failure points at the exact input that regressed.
func TestIsQuotaExhausted(t *testing.T) {
	cases := []struct {
		name  string // subtest name
		input string // pane text handed to isQuotaExhausted
		want  bool   // expected verdict
	}{
		// Genuine quota signals.
		{"friendly hit message", "You've hit your limit for Claude Pro.", true},
		{"typed rate_limit_error", `{"error":{"type":"rate_limit_error"}}`, true},
		{"quota exceeded", "quota exceeded for this period", true},
		{"usage limit", "Usage limit reached", true},
		{"too many requests", "Too many requests", true},
		{"5-hour limit", "You've reached the 5-hour limit", true},

		// Ordinary pane content.
		{"normal output", "Some normal output ", false},
		{"empty prompt", " ", false},
		{"status line", "still running 5s · ", false},

		// Server-error veto cases — these MUST NOT trigger a swap.
		{"api_error 500 veto", `API Error: 500 {"type":"error","error":{"type":"api_error","message":"Internal server error"}} rate limit`, false},
		{"overloaded_error veto", `{"error":{"type":"overloaded_error"}} rate limit`, false},
		{"internal server error veto", "Internal Server Error — rate limit mentioned elsewhere", false},

		// Veto cases that also carry a live quota phrase. "quota exceeded" is
		// a positive on its own (see the case above), so these can only stay
		// false if the server-error marker really overrides a quota match.
		{"api_error vetoes quota phrase", `API Error: 500 {"type":"error","error":{"type":"api_error","message":"Internal server error"}} quota exceeded`, false},
		{"overloaded_error vetoes quota phrase", `{"error":{"type":"overloaded_error"}} quota exceeded`, false},
		{"internal server error vetoes quota phrase", "internal server error: quota exceeded", false},

		// Real 429 should still pass even if generic words are around.
		{"real rate_limit_error wins", `{"error":{"type":"rate_limit_error","message":"Rate limited"}}`, true},
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			if got := isQuotaExhausted(c.input); got != c.want {
				t.Errorf("isQuotaExhausted(%q) = %v, want %v", c.input, got, c.want)
			}
		})
	}
}
// TestExtractResetTime parses various reset time formats.
//
// Each input runs as its own subtest (named after the input text), matching
// the table-test style used by TestIsQuotaExhausted, so a failure names the
// offending case and individual cases can be selected with -run.
func TestExtractResetTime(t *testing.T) {
	cases := []struct {
		input string // pane text handed to extractResetTime
		want  string // expected extracted reset time; "" when none is present
	}{
		{"Usage resets 8pm", "8pm"},
		{"Your quota resets at 11:30pm", "11:30pm"},
		{"resets in 45 minutes", "in 45 minutes"},
		{"resets in 2 hours", "in 2 hours"},
		{"no reset info here", ""},
	}
	for _, c := range cases {
		t.Run(c.input, func(t *testing.T) {
			if got := extractResetTime(c.input); got != c.want {
				t.Errorf("extractResetTime(%q) = %q, want %q", c.input, got, c.want)
			}
		})
	}
}
// TestPollTriggersSwitchOnTwoBlockedPoolWithReset checks that a single poll
// is enough to emit a SwitchRequest when both autonomous pool sessions show a
// quota message that includes a reset time.
func TestPollTriggersSwitchOnTwoBlockedPoolWithReset(t *testing.T) {
	const activeAccount = "compte1"

	// Two pool sessions, each with quota text plus reset information.
	panes := map[string]string{
		"ccl-auto-0": "You've hit your limit for Claude Pro. resets in 45 minutes",
		"ccl-auto-1": `{"error":{"type":"rate_limit_error"}} — resets at 8pm`,
	}
	fake := newMockTmux()
	for name, text := range panes {
		fake.sessions[name] = true
		fake.paneOutput[name] = text
	}

	st := state.New("")
	st.SetActiveAccount(activeAccount)

	conf := &config.Config{
		Pool: config.PoolConfig{
			Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Max: 2},
		},
	}

	mon := New(fake, st, conf)
	mon.poll()

	// Non-blocking receive: the request must already be waiting.
	select {
	default:
		t.Fatal("expected SwitchRequest on channel")
	case req := <-mon.switchCh:
		if req.From != activeAccount {
			t.Errorf("expected From=compte1, got %q", req.From)
		}
		if req.ResetTime == "" {
			t.Errorf("expected non-empty ResetTime")
		}
	}
}
// TestPollRequiresConfirmationWhenNoResetTime checks the two-poll
// confirmation path: a quota hit with no parseable reset time must not emit a
// SwitchRequest on the first poll, and must emit one (with an empty
// ResetTime) when the same hit is seen again on the next poll. The delay
// exists so a transient server error that merely resembles a quota message
// does not cause a swap.
func TestPollRequiresConfirmationWhenNoResetTime(t *testing.T) {
	const (
		session       = "my-session"
		activeAccount = "compte1"
	)

	fake := newMockTmux()
	fake.sessions[session] = true
	fake.paneOutput[session] = "quota exceeded" // deliberately lacks a reset time

	st := state.New("")
	st.SetActiveAccount(activeAccount)

	conf := &config.Config{
		Pool: config.PoolConfig{
			Dedicated:  []config.DedicatedSession{{Name: session}},
			Autonomous: config.AutonomousConfig{Max: 0},
		},
	}
	mon := New(fake, st, conf)

	// Poll #1: the hit is only suspected, so nothing may be queued.
	mon.poll()
	select {
	default:
	case req := <-mon.switchCh:
		t.Fatalf("unexpected SwitchRequest on first poll: %+v", req)
	}

	// Poll #2: the hit repeats, which confirms it and emits the swap.
	mon.poll()
	select {
	default:
		t.Fatal("expected SwitchRequest on confirmation poll")
	case req := <-mon.switchCh:
		if req.From != activeAccount {
			t.Errorf("expected From=compte1, got %q", req.From)
		}
		if req.ResetTime != "" {
			t.Errorf("expected empty ResetTime, got %q", req.ResetTime)
		}
	}
}
// TestPollSuspectedHitClearedOnRecovery checks that the suspected state is
// dropped once a poll comes back clean: the sequence hit, clean, hit must not
// emit a SwitchRequest, because the second hit starts a new suspicion instead
// of confirming the first one.
func TestPollSuspectedHitClearedOnRecovery(t *testing.T) {
	const session = "my-session"

	fake := newMockTmux()
	fake.sessions[session] = true
	fake.paneOutput[session] = "quota exceeded"

	st := state.New("")
	st.SetActiveAccount("compte1")

	conf := &config.Config{
		Pool: config.PoolConfig{
			Dedicated:  []config.DedicatedSession{{Name: session}},
			Autonomous: config.AutonomousConfig{Max: 0},
		},
	}
	mon := New(fake, st, conf)

	// First poll sees the hit and marks it as suspected.
	mon.poll()

	// Then a clean capture (clears the suspicion) followed by a fresh hit
	// (suspected again, but not yet confirmed).
	for _, pane := range []string{"all good ", "quota exceeded"} {
		fake.paneOutput[session] = pane
		mon.poll()
	}

	select {
	default:
	case req := <-mon.switchCh:
		t.Fatalf("unexpected SwitchRequest after recovery: %+v", req)
	}
}
// TestPollNoTriggerWhenBelowThreshold checks that one blocked pool session
// out of two is not enough for poll to emit a SwitchRequest.
func TestPollNoTriggerWhenBelowThreshold(t *testing.T) {
	panes := map[string]string{
		"ccl-auto-0": "You've hit your limit", // blocked
		"ccl-auto-1": " ",                     // fine
	}
	fake := newMockTmux()
	for name, text := range panes {
		fake.sessions[name] = true
		fake.paneOutput[name] = text
	}

	st := state.New("")
	conf := &config.Config{
		Pool: config.PoolConfig{
			Autonomous: config.AutonomousConfig{Prefix: "ccl-auto-", Max: 2},
		},
	}

	mon := New(fake, st, conf)
	mon.poll()

	select {
	default:
		// Expected path: only 1 blocked pool session, threshold is 2.
	case req := <-mon.switchCh:
		t.Errorf("unexpected SwitchRequest: %+v", req)
	}
}