diff --git a/VERSION.md b/VERSION.md index c1c8c45..bc5ebe6 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1,4 +1,69 @@ -# Version actuelle : 0.2.0 +# Version actuelle : 0.2.2 + +## [0.2.2] - 2026-04-15 +**Type:** Patch — Confirmation requise pour les faux positifs (root cause) + +### Corrigé +- **Cause racine du ping-pong** : les erreurs HTTP 500 transitoires d'Anthropic + contiennent le texte "rate limit" dans leur payload (`{"type":"api_error",...}` + avec des traces mentionnant "rate limit"). Le monitor les confondait avec de + vrais 429 quota hits. La v0.2.1 cassait la boucle via cooldown, mais un swap + par salve de 500s pouvait encore tuer les sessions dédiées. +- Le nouveau log forensique v0.2.1 a révélé exactement ça (snippet capturé : + `API Error: 500 {"type":"error","error":{"type":"api_error",...}`). + +### Ajouté +- **Confirmation 2-polls pour les hits sans reset time** : si `extractResetTime` + ne trouve rien (= pas un vrai 429), le monitor marque l'état `suspectedHitAt` + et attend le poll suivant. Le swap n'est émis que si la détection persiste. + Un hit isolé (= erreur 500 transitoire) est absorbé sans swap. +- Un vrai 429 (avec `resets in 45 minutes` ou `resets at 8pm`) continue à + déclencher un swap instantané. +- `Monitor.suspectedHitAt` (not locked — only touched from poll goroutine). +- 3 nouveaux tests : `TestPollTriggersSwitchOnTwoBlockedPoolWithReset`, + `TestPollRequiresConfirmationWhenNoResetTime`, + `TestPollSuspectedHitClearedOnRecovery`. + +### Tests effectués +- ✅ `go test ./...` — full suite passe +- ✅ Service redémarré, état consistant + +### Fichiers modifiés +- `internal/quota/monitor.go` +- `internal/quota/monitor_test.go` + +## [0.2.1] - 2026-04-15 +**Type:** Patch — Fix boucle de swaps infinis (ping-pong) + +### Corrigé +- **Boucle infinie de swaps** : le monitor pouvait émettre des `SwapRequested` + toutes les 30s, créant un ping-pong entre comptes quand du texte de pane + (anciens errors Anthropic 500 / TUI banners) matchait `quotaPatterns` avec + `reset=""`. En prod, interval observé descendant jusqu'à 1 min. +- **Cause racine** : aucun cooldown global entre swaps dans la boucle de + détection. La config `quota.reactivate_cooldown` (5m) existait mais n'était + utilisée que par le dispatcher, pas par le monitor. + +### Ajouté +- `state.QuotaState.LastSwapAt/LastSwapFrom/LastSwapTo` + `RecordSwap()` + + `LastSwapInfo()` pour tracker le dernier swap. +- `monitor.poll()` vérifie `quota.reactivate_cooldown` avant de déclencher un + swap. Log explicite quand le cooldown bloque : `[quota] swap cooldown active`. +- Log forensique détaillé lors d'un `SwapRequested` : session déclencheuse, + pattern matché, snippet du pane (120 chars). Ex : + `trigger_session="ccl-1-conformvault" pattern="rate limit" snippet="..."`. +- `switcher.executeSwitch` appelle `state.RecordSwap()` après `SetActiveAccount`. + +### Tests effectués +- ✅ `go build ./cmd/claude-failover` OK +- ✅ `go test ./internal/quota/... ./internal/state/... ./internal/switcher/...` OK +- ✅ Binaire installé dans `/usr/local/bin/claude-failover` +- ✅ Service redémarré — pas de swap intempestif depuis + +### Fichiers modifiés +- `internal/state/state.go` +- `internal/quota/monitor.go` +- `internal/switcher/account_switcher.go` ## [0.2.0] - 2026-04-14 **Type:** Minor — Implémentation des goroutines Phase 2 diff --git a/internal/quota/monitor.go b/internal/quota/monitor.go index 829b968..03760c2 100644 --- a/internal/quota/monitor.go +++ b/internal/quota/monitor.go @@ -46,6 +46,12 @@ type Monitor struct { switchCh chan SwitchRequest interval time.Duration logger *log.Logger + // suspectedHitAt tracks the first poll that detected a quota pattern + // without a parseable reset time. A second consecutive hit is required + // before emitting SwapRequested; a single-poll hit is ignored as likely + // transient (e.g. Anthropic 500s containing "rate limit" in payload). + // Mutated only from poll(), which runs on a single goroutine — no lock. + suspectedHitAt time.Time } // New creates a Monitor with defaults from cfg. @@ -89,9 +95,33 @@ func (m *Monitor) poll() { return } + // Cooldown guard: after a recent swap, refuse to trigger another one until + // the cooldown elapses. Prevents ping-pong when the freshly-activated + // account surfaces transient errors whose text happens to match + // quotaPatterns (e.g. Anthropic 500s rendered as "rate limit" in the TUI). + cooldown := m.config.Quota.ReactivateCooldown.Duration + if cooldown == 0 { + cooldown = 10 * time.Minute + } + if lastAt, _, lastTo := m.state.LastSwapInfo(); !lastAt.IsZero() { + if since := time.Since(lastAt); since < cooldown { + m.logger.Printf("[quota] swap cooldown active (since=%v < cooldown=%v, last_to=%q) — skipping detection", + since.Round(time.Second), cooldown, lastTo) + return + } + } + blockedPool := 0 blockedInteractive := 0 var resetTime string + var firstPattern, firstSession, firstSnippet string + recordHit := func(session, tail string) { + if firstPattern == "" { + firstPattern = firstMatchingPattern(tail) + firstSession = session + firstSnippet = snippet(tail) + } + } prefix := m.config.Pool.Autonomous.Prefix if prefix == "" { @@ -109,6 +139,7 @@ func (m *Monitor) poll() { } if isQuotaExhausted(tail) { blockedPool++ + recordHit(name, tail) if rt := extractResetTime(tail); rt != "" { resetTime = rt } @@ -125,24 +156,51 @@ func (m *Monitor) poll() { } if isQuotaExhausted(tail) { blockedInteractive++ + recordHit(ds.Name, tail) if rt := extractResetTime(tail); rt != "" { resetTime = rt } } } - if blockedPool >= 2 || blockedInteractive >= 1 { - req := SwitchRequest{ - From: m.state.ActiveAccount(), - ResetTime: resetTime, + if blockedPool < 2 && blockedInteractive < 1 { + // No detection — clear any suspected-hit state so future transient + // blips have to re-confirm from scratch. + if !m.suspectedHitAt.IsZero() { + m.logger.Printf("[quota] suspected hit cleared (no detection this poll)") + m.suspectedHitAt = time.Time{} } - select { - case m.switchCh <- req: - m.logger.Printf("[quota] SwapRequested: from=%s pool=%d interactive=%d reset=%q", - req.From, blockedPool, blockedInteractive, resetTime) - default: - // Swap already pending — do not queue another. + return + } + + // When no reset time can be parsed, the "hit" might be a transient + // Anthropic 500 that happens to contain "rate limit" in its error + // payload. Require two consecutive polls detecting a hit before + // swapping, so a single-poll false positive is absorbed. + if resetTime == "" { + if m.suspectedHitAt.IsZero() { + m.suspectedHitAt = time.Now() + m.logger.Printf("[quota] suspected hit (no reset time): session=%q pattern=%q snippet=%q — awaiting confirmation next poll", + firstSession, firstPattern, firstSnippet) + return } + // Second consecutive hit — proceed. + m.logger.Printf("[quota] hit confirmed across %v — proceeding with swap", + time.Since(m.suspectedHitAt).Round(time.Second)) + } + m.suspectedHitAt = time.Time{} + + req := SwitchRequest{ + From: m.state.ActiveAccount(), + ResetTime: resetTime, + } + select { + case m.switchCh <- req: + m.logger.Printf("[quota] SwapRequested: from=%s pool=%d interactive=%d reset=%q trigger_session=%q pattern=%q snippet=%q", + req.From, blockedPool, blockedInteractive, resetTime, + firstSession, firstPattern, firstSnippet) + default: + // Swap already pending — do not queue another. } } @@ -154,13 +212,30 @@ func (m *Monitor) isQuotaPaused() bool { // isQuotaExhausted returns true if the pane content indicates quota exhaustion. func isQuotaExhausted(paneContent string) bool { + return firstMatchingPattern(paneContent) != "" +} + +// firstMatchingPattern returns the first quota pattern found in paneContent, +// or "" if none match. Exposed for diagnostic logging. +func firstMatchingPattern(paneContent string) string { lower := strings.ToLower(paneContent) for _, p := range quotaPatterns { if strings.Contains(lower, p) { - return true + return p } } - return false + return "" +} + +// snippet returns a 120-char single-line excerpt of pane content for logging. +func snippet(s string) string { + s = strings.ReplaceAll(s, "\n", " ") + s = strings.ReplaceAll(s, "\r", "") + s = strings.TrimSpace(s) + if len(s) > 120 { + s = s[:120] + } + return s } // extractResetTime parses a reset time string from pane content. diff --git a/internal/quota/monitor_test.go b/internal/quota/monitor_test.go index 5363fe4..e0f9850 100644 --- a/internal/quota/monitor_test.go +++ b/internal/quota/monitor_test.go @@ -69,13 +69,14 @@ func TestExtractResetTime(t *testing.T) { } } -// TestPollTriggersSwitchOnTwoBlockedPool verifies swap trigger for >=2 blocked pool sessions. -func TestPollTriggersSwitchOnTwoBlockedPool(t *testing.T) { +// TestPollTriggersSwitchOnTwoBlockedPoolWithReset verifies a legitimate 429 +// (reset time present) triggers a swap immediately on the first poll. +func TestPollTriggersSwitchOnTwoBlockedPoolWithReset(t *testing.T) { tc := newMockTmux() tc.sessions["ccl-auto-0"] = true tc.sessions["ccl-auto-1"] = true - tc.paneOutput["ccl-auto-0"] = "You've hit your limit for Claude Pro." - tc.paneOutput["ccl-auto-1"] = "rate limit exceeded" + tc.paneOutput["ccl-auto-0"] = "You've hit your limit for Claude Pro. resets in 45 minutes" + tc.paneOutput["ccl-auto-1"] = "rate limit exceeded — resets at 8pm" s := state.New("") s.SetActiveAccount("compte1") @@ -93,13 +94,61 @@ func TestPollTriggersSwitchOnTwoBlockedPool(t *testing.T) { if req.From != "compte1" { t.Errorf("expected From=compte1, got %q", req.From) } + if req.ResetTime == "" { + t.Errorf("expected non-empty ResetTime") + } default: t.Fatal("expected SwitchRequest on channel") } } -// TestPollTriggersSwitchOnOneBlockedInteractive verifies swap trigger for >=1 dedicated session. -func TestPollTriggersSwitchOnOneBlockedInteractive(t *testing.T) { +// TestPollRequiresConfirmationWhenNoResetTime verifies that a hit without a +// parseable reset time does not trigger a swap on a single poll. A second +// consecutive hit is required. This guards against transient Anthropic 500 +// errors whose payload happens to contain "rate limit". +func TestPollRequiresConfirmationWhenNoResetTime(t *testing.T) { + tc := newMockTmux() + tc.sessions["my-session"] = true + tc.paneOutput["my-session"] = "quota exceeded" // no reset time + + s := state.New("") + s.SetActiveAccount("compte1") + + cfg := &config.Config{ + Pool: config.PoolConfig{ + Dedicated: []config.DedicatedSession{{Name: "my-session"}}, + Autonomous: config.AutonomousConfig{Max: 0}, + }, + } + m := New(tc, s, cfg) + + // First poll — suspected only, no swap yet. + m.poll() + select { + case req := <-m.switchCh: + t.Fatalf("unexpected SwitchRequest on first poll: %+v", req) + default: + } + + // Second poll — confirmed, swap emitted. + m.poll() + select { + case req := <-m.switchCh: + if req.From != "compte1" { + t.Errorf("expected From=compte1, got %q", req.From) + } + if req.ResetTime != "" { + t.Errorf("expected empty ResetTime, got %q", req.ResetTime) + } + default: + t.Fatal("expected SwitchRequest on confirmation poll") + } +} + +// TestPollSuspectedHitClearedOnRecovery verifies a transient hit followed by +// a clean poll does NOT trigger a swap on a subsequent hit — the suspected +// state must be reset when detection clears. +func TestPollSuspectedHitClearedOnRecovery(t *testing.T) { tc := newMockTmux() tc.sessions["my-session"] = true tc.paneOutput["my-session"] = "quota exceeded" @@ -114,15 +163,17 @@ func TestPollTriggersSwitchOnOneBlockedInteractive(t *testing.T) { }, } m := New(tc, s, cfg) - m.poll() + + m.poll() // suspected + tc.paneOutput["my-session"] = "all good ❯ " + m.poll() // cleared + tc.paneOutput["my-session"] = "quota exceeded" + m.poll() // re-suspected, NOT confirmed yet select { case req := <-m.switchCh: - if req.From != "compte1" { - t.Errorf("expected From=compte1, got %q", req.From) - } + t.Fatalf("unexpected SwitchRequest after recovery: %+v", req) default: - t.Fatal("expected SwitchRequest on channel") } } diff --git a/internal/state/state.go b/internal/state/state.go index 4a31f4c..fa0bbcf 100644 --- a/internal/state/state.go +++ b/internal/state/state.go @@ -24,6 +24,9 @@ type QuotaState struct { Paused bool `json:"paused"` ActiveAccount string `json:"active_account"` ResumeAt *time.Time `json:"resume_at,omitempty"` + LastSwapAt *time.Time `json:"last_swap_at,omitempty"` + LastSwapFrom string `json:"last_swap_from,omitempty"` + LastSwapTo string `json:"last_swap_to,omitempty"` } // State is the thread-safe runtime state persisted to a JSON file. @@ -178,6 +181,31 @@ func (s *State) ActiveAccount() string { return s.Quota.ActiveAccount } +// RecordSwap stamps the time, source and target of the most recent account swap. +// Used by the quota monitor to enforce a cooldown and avoid ping-pong loops +// when transient upstream errors (e.g. Anthropic 500s mirrored as "rate limit" +// text in the TUI) are misread as quota hits. +func (s *State) RecordSwap(from, to string) { + s.mu.Lock() + defer s.mu.Unlock() + now := time.Now().UTC() + s.Quota.LastSwapAt = &now + s.Quota.LastSwapFrom = from + s.Quota.LastSwapTo = to + s.touch() +} + +// LastSwapInfo returns the time, source and target of the most recent swap, +// or a zero time and empty strings if no swap has occurred. +func (s *State) LastSwapInfo() (time.Time, string, string) { + s.mu.RLock() + defer s.mu.RUnlock() + if s.Quota.LastSwapAt == nil { + return time.Time{}, "", "" + } + return *s.Quota.LastSwapAt, s.Quota.LastSwapFrom, s.Quota.LastSwapTo +} + // SetFailed marks the named session as failed and records the failure timestamp. // The task is preserved for potential requeue by the caller. func (s *State) SetFailed(name string) { diff --git a/internal/switcher/account_switcher.go b/internal/switcher/account_switcher.go index 7684872..a3c0840 100644 --- a/internal/switcher/account_switcher.go +++ b/internal/switcher/account_switcher.go @@ -109,8 +109,10 @@ func (a *AccountSwitcher) executeSwitch(req quota.SwitchRequest) { a.killAllPoolSessions() a.recreatePoolSessions() - // Update active account. + // Update active account and record the swap timestamp so the quota + // monitor can enforce a cooldown before requesting another one. a.state.SetActiveAccount(target.Name) + a.state.RecordSwap(req.From, target.Name) // 3. RESUMING — sessions are alive, dispatcher will fill them. a.currentState = StateResuming