fix(quota): add cooldown + 2-poll confirmation to prevent swap ping-pong

Anthropic HTTP 500 errors surface in the TUI with payloads containing
"rate limit" text, which the monitor was matching against quotaPatterns
and treating as a real 429 quota hit. With no cooldown and no
confirmation, a burst of 500s produced sub-minute ping-pong swaps that
tore down user sessions.

Two-layer fix:
- quota.reactivate_cooldown (already in config, 5m) now gates the
  monitor too — not just the dispatcher. A completed swap suppresses
  further detection for the cooldown window.
- A hit with no parseable reset time is only recorded as "suspected" on
  the first poll; a hit on the next consecutive poll is required before
  emitting SwapRequested. Legitimate 429s with "resets in ..." still
  swap instantly on the first detection.

Adds state.RecordSwap / LastSwapInfo for the cooldown, and a
forensic log line on every detection: trigger_session, matched
pattern, 120-char pane snippet.

Tests cover: instant swap with reset, 2-poll confirmation without
reset, and suspected-state reset on recovery.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Ubuntu 2026-04-15 19:18:27 +00:00
parent 75b5110748
commit 7c5f8384fa
5 changed files with 246 additions and 25 deletions

View file

@ -46,6 +46,12 @@ type Monitor struct {
switchCh chan SwitchRequest
interval time.Duration
logger *log.Logger
// suspectedHitAt tracks the first poll that detected a quota pattern
// without a parseable reset time. A second consecutive hit is required
// before emitting SwapRequested; a single-poll hit is ignored as likely
// transient (e.g. Anthropic 500s containing "rate limit" in payload).
// Mutated only from poll(), which runs on a single goroutine — no lock.
suspectedHitAt time.Time
}
// New creates a Monitor with defaults from cfg.
@ -89,9 +95,33 @@ func (m *Monitor) poll() {
return
}
// Cooldown guard: after a recent swap, refuse to trigger another one until
// the cooldown elapses. Prevents ping-pong when the freshly-activated
// account surfaces transient errors whose text happens to match
// quotaPatterns (e.g. Anthropic 500s rendered as "rate limit" in the TUI).
cooldown := m.config.Quota.ReactivateCooldown.Duration
if cooldown == 0 {
cooldown = 10 * time.Minute
}
if lastAt, _, lastTo := m.state.LastSwapInfo(); !lastAt.IsZero() {
if since := time.Since(lastAt); since < cooldown {
m.logger.Printf("[quota] swap cooldown active (since=%v < cooldown=%v, last_to=%q) — skipping detection",
since.Round(time.Second), cooldown, lastTo)
return
}
}
blockedPool := 0
blockedInteractive := 0
var resetTime string
var firstPattern, firstSession, firstSnippet string
recordHit := func(session, tail string) {
if firstPattern == "" {
firstPattern = firstMatchingPattern(tail)
firstSession = session
firstSnippet = snippet(tail)
}
}
prefix := m.config.Pool.Autonomous.Prefix
if prefix == "" {
@ -109,6 +139,7 @@ func (m *Monitor) poll() {
}
if isQuotaExhausted(tail) {
blockedPool++
recordHit(name, tail)
if rt := extractResetTime(tail); rt != "" {
resetTime = rt
}
@ -125,24 +156,51 @@ func (m *Monitor) poll() {
}
if isQuotaExhausted(tail) {
blockedInteractive++
recordHit(ds.Name, tail)
if rt := extractResetTime(tail); rt != "" {
resetTime = rt
}
}
}
if blockedPool >= 2 || blockedInteractive >= 1 {
req := SwitchRequest{
From: m.state.ActiveAccount(),
ResetTime: resetTime,
if blockedPool < 2 && blockedInteractive < 1 {
// No detection — clear any suspected-hit state so future transient
// blips have to re-confirm from scratch.
if !m.suspectedHitAt.IsZero() {
m.logger.Printf("[quota] suspected hit cleared (no detection this poll)")
m.suspectedHitAt = time.Time{}
}
select {
case m.switchCh <- req:
m.logger.Printf("[quota] SwapRequested: from=%s pool=%d interactive=%d reset=%q",
req.From, blockedPool, blockedInteractive, resetTime)
default:
// Swap already pending — do not queue another.
return
}
// When no reset time can be parsed, the "hit" might be a transient
// Anthropic 500 that happens to contain "rate limit" in its error
// payload. Require two consecutive polls detecting a hit before
// swapping, so a single-poll false positive is absorbed.
if resetTime == "" {
if m.suspectedHitAt.IsZero() {
m.suspectedHitAt = time.Now()
m.logger.Printf("[quota] suspected hit (no reset time): session=%q pattern=%q snippet=%q — awaiting confirmation next poll",
firstSession, firstPattern, firstSnippet)
return
}
// Second consecutive hit — proceed.
m.logger.Printf("[quota] hit confirmed across %v — proceeding with swap",
time.Since(m.suspectedHitAt).Round(time.Second))
}
m.suspectedHitAt = time.Time{}
req := SwitchRequest{
From: m.state.ActiveAccount(),
ResetTime: resetTime,
}
select {
case m.switchCh <- req:
m.logger.Printf("[quota] SwapRequested: from=%s pool=%d interactive=%d reset=%q trigger_session=%q pattern=%q snippet=%q",
req.From, blockedPool, blockedInteractive, resetTime,
firstSession, firstPattern, firstSnippet)
default:
// Swap already pending — do not queue another.
}
}
@ -154,13 +212,30 @@ func (m *Monitor) isQuotaPaused() bool {
// isQuotaExhausted reports whether paneContent contains any quota pattern,
// i.e. whether firstMatchingPattern finds a non-empty match in it.
func isQuotaExhausted(paneContent string) bool {
	matched := firstMatchingPattern(paneContent)
	return matched != ""
}
// firstMatchingPattern returns the first entry of quotaPatterns that occurs
// in paneContent, or "" if none match. It backs isQuotaExhausted and lets
// callers log which pattern triggered a detection.
//
// The pane content is lower-cased before the substring check, so matching is
// case-insensitive on the pane side only.
// NOTE(review): this presumes every quotaPatterns entry is already
// lower-case — confirm against the quotaPatterns definition.
func firstMatchingPattern(paneContent string) string {
	lower := strings.ToLower(paneContent)
	for _, p := range quotaPatterns {
		if strings.Contains(lower, p) {
			// Return the pattern itself (not a bool) so the caller can
			// report what matched.
			return p
		}
	}
	return ""
}
// snippet returns a single-line excerpt of pane content for logging, at most
// 120 bytes long.
//
// Newlines are replaced by spaces, carriage returns are dropped, and
// surrounding whitespace is trimmed. When the result is longer than 120
// bytes it is cut at the last rune boundary at or before byte 120, so a
// multi-byte UTF-8 character is never split in half (which would put invalid
// UTF-8 into the log line). For pure-ASCII input this is exactly s[:120].
func snippet(s string) string {
	const maxLen = 120

	s = strings.ReplaceAll(s, "\n", " ")
	s = strings.ReplaceAll(s, "\r", "")
	s = strings.TrimSpace(s)
	if len(s) <= maxLen {
		return s
	}

	// Ranging over a string yields the byte offset at which each rune
	// starts; keep the largest such offset that does not exceed maxLen.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	return s[:cut]
}
// extractResetTime parses a reset time string from pane content.

View file

@ -69,13 +69,14 @@ func TestExtractResetTime(t *testing.T) {
}
}
// TestPollTriggersSwitchOnTwoBlockedPool verifies swap trigger for >=2 blocked pool sessions.
func TestPollTriggersSwitchOnTwoBlockedPool(t *testing.T) {
// TestPollTriggersSwitchOnTwoBlockedPoolWithReset verifies a legitimate 429
// (reset time present) triggers a swap immediately on the first poll.
func TestPollTriggersSwitchOnTwoBlockedPoolWithReset(t *testing.T) {
tc := newMockTmux()
tc.sessions["ccl-auto-0"] = true
tc.sessions["ccl-auto-1"] = true
tc.paneOutput["ccl-auto-0"] = "You've hit your limit for Claude Pro."
tc.paneOutput["ccl-auto-1"] = "rate limit exceeded"
tc.paneOutput["ccl-auto-0"] = "You've hit your limit for Claude Pro. resets in 45 minutes"
tc.paneOutput["ccl-auto-1"] = "rate limit exceeded — resets at 8pm"
s := state.New("")
s.SetActiveAccount("compte1")
@ -93,13 +94,61 @@ func TestPollTriggersSwitchOnTwoBlockedPool(t *testing.T) {
if req.From != "compte1" {
t.Errorf("expected From=compte1, got %q", req.From)
}
if req.ResetTime == "" {
t.Errorf("expected non-empty ResetTime")
}
default:
t.Fatal("expected SwitchRequest on channel")
}
}
// TestPollTriggersSwitchOnOneBlockedInteractive verifies swap trigger for >=1 dedicated session.
func TestPollTriggersSwitchOnOneBlockedInteractive(t *testing.T) {
// TestPollRequiresConfirmationWhenNoResetTime exercises the two-poll
// confirmation path: when a dedicated session shows a quota pattern but no
// parseable reset time, the first poll must not emit a SwitchRequest and the
// second consecutive poll must. This is the guard against transient
// Anthropic 500 errors whose payload happens to contain "rate limit".
func TestPollRequiresConfirmationWhenNoResetTime(t *testing.T) {
	const session = "my-session"

	mock := newMockTmux()
	mock.sessions[session] = true
	mock.paneOutput[session] = "quota exceeded" // no reset time

	st := state.New("")
	st.SetActiveAccount("compte1")

	mon := New(mock, st, &config.Config{
		Pool: config.PoolConfig{
			Dedicated:  []config.DedicatedSession{{Name: session}},
			Autonomous: config.AutonomousConfig{Max: 0},
		},
	})

	// Poll #1: the hit is only suspected, so nothing may be queued.
	mon.poll()
	select {
	case got := <-mon.switchCh:
		t.Fatalf("unexpected SwitchRequest on first poll: %+v", got)
	default:
	}

	// Poll #2: the same hit again confirms it and a swap is requested.
	mon.poll()
	select {
	case got := <-mon.switchCh:
		if got.From != "compte1" {
			t.Errorf("expected From=compte1, got %q", got.From)
		}
		if got.ResetTime != "" {
			t.Errorf("expected empty ResetTime, got %q", got.ResetTime)
		}
	default:
		t.Fatal("expected SwitchRequest on confirmation poll")
	}
}
// TestPollSuspectedHitClearedOnRecovery verifies a transient hit followed by
// a clean poll does NOT trigger a swap on a subsequent hit — the suspected
// state must be reset when detection clears.
func TestPollSuspectedHitClearedOnRecovery(t *testing.T) {
tc := newMockTmux()
tc.sessions["my-session"] = true
tc.paneOutput["my-session"] = "quota exceeded"
@ -114,15 +163,17 @@ func TestPollTriggersSwitchOnOneBlockedInteractive(t *testing.T) {
},
}
m := New(tc, s, cfg)
m.poll()
m.poll() // suspected
tc.paneOutput["my-session"] = "all good "
m.poll() // cleared
tc.paneOutput["my-session"] = "quota exceeded"
m.poll() // re-suspected, NOT confirmed yet
select {
case req := <-m.switchCh:
if req.From != "compte1" {
t.Errorf("expected From=compte1, got %q", req.From)
}
t.Fatalf("unexpected SwitchRequest after recovery: %+v", req)
default:
t.Fatal("expected SwitchRequest on channel")
}
}