fix(sessions): surface gateway SSE failures and add polling fallback (#828)

* fix(sessions): surface gateway SSE failures and add polling fallback

- add a JSON probe mode for the gateway SSE endpoint
- detect watcher-unavailable 503s from the browser
- fall back to periodic session refresh with a toast
- add probe payload tests and endpoint coverage

Fixes #635

* fix(sessions): surface gateway SSE failures and add polling fallback (#826)

Absorbed from PR #826 by @cloudyun888 (fixes #635).

When the gateway watcher thread is not running, the browser now shows a
toast notification and falls back to 30-second periodic polling for session
sync. Previously the SSE failure was completely silent with no user feedback.

Changes from original PR:
- Deleted misplaced test_gateway_sse_probe_unit.py (was at repo root, not
  discovered by `pytest tests/`); unit tests moved into tests/test_gateway_sync.py
- _gateway_sse_probe_payload now checks watcher._thread.is_alive() rather
  than just watcher is not None — a watcher instance with a dead poll thread
  now correctly reports unavailable and activates the polling fallback
- probeGatewaySSEStatus catch(e) now starts the polling fallback on network
  error rather than silently swallowing the failure
- Added 5 unit tests covering all watcher-alive/dead/missing/disabled branches

Co-authored-by: cloudyun888 <269269188+86cloudyun-afk@users.noreply.github.com>

* cleanup(gateway): public is_alive() + dedup probe/live watcher-alive check + changelog

Three small cleanups on top of @cloudyun888's PR #826 absorption:

1. Add GatewayWatcher.is_alive() public accessor so routes.py doesn't
   reach into the private _thread attribute.  The existing private-
   attribute check stays as a defensive fallback for any older in-
   memory instance or test double that doesn't implement the full API.

2. Dedupe the watcher_alive computation in _handle_gateway_sse_stream:
   the live-SSE path now calls _gateway_sse_probe_payload(...) and reads
   its watcher_running field instead of re-deriving the same logic
   inline.  Keeps probe and SSE in sync automatically.

3. CHANGELOG trailer was (#826, fixes #635, @cloudyun888) — this PR is
   #828, so updated to (#828, absorbs PR #826 by @cloudyun888, fixes
   #635) matching the repo convention for absorbed PRs (see #805).

Added two regression tests:
- test_gateway_watcher_is_alive_public_method — covers the three
  lifecycle states (before start, while running, after stop).
- test_probe_payload_prefers_public_is_alive — asserts the probe
  uses watcher.is_alive() rather than poking _thread when the
  public method exists.

Full suite: 1735 passed, 0 new failures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: cloudyun888 <269269188+86cloudyun-afk@users.noreply.github.com>
Co-authored-by: nesquena-hermes <nesquena-hermes@users.noreply.github.com>
Co-authored-by: Nathan Esquenazi <nesquena@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
nesquena-hermes
2026-04-21 21:18:55 -07:00
committed by GitHub
parent 3daf2427f7
commit d4a3adb7b1
5 changed files with 264 additions and 8 deletions

View File

@@ -361,6 +361,53 @@ async function renderSessionList(){
// ── Gateway session SSE (real-time sync for agent sessions) ──
// Current gateway SSE connection, or null when none is open.
// NOTE(review): presumably an EventSource — its construction is not visible here.
let _gatewaySSE = null;
// Id returned by setInterval while fallback polling is active; null otherwise.
let _gatewayPollTimer = null;
// True while probeGatewaySSEStatus() has a request pending; blocks overlapping probes.
let _gatewayProbeInFlight = false;
// True once the "sync unavailable" toast has been shown, so it is not repeated
// on every failed probe; reset to false when sync is confirmed working.
let _gatewaySSEWarningShown = false;
// Default fallback polling interval in milliseconds (30 s), used when no
// usable interval is supplied by the caller or the probe response.
const _gatewayFallbackPollMs = 30000;
/**
 * Start (or restart) the periodic session-list refresh used while the gateway
 * SSE stream is unavailable. Any previously running fallback timer is replaced.
 *
 * @param {number} [ms] Requested polling interval in milliseconds. Missing,
 *   zero, non-numeric or non-finite values fall back to _gatewayFallbackPollMs.
 *   The effective interval is clamped to the range [5000, 2147483647].
 */
function startGatewayPollFallback(ms){
  // Timer delays are stored as a signed 32-bit integer; anything larger
  // overflows and the timer fires (almost) immediately and continuously.
  const maxDelayMs = 2147483647;
  const requestedMs = Number(ms);
  const baseMs = (Number.isFinite(requestedMs) && requestedMs !== 0)
    ? requestedMs
    : _gatewayFallbackPollMs;
  // Never poll more often than every 5 s, never exceed the timer limit.
  const intervalMs = Math.min(maxDelayMs, Math.max(5000, baseMs));
  if(_gatewayPollTimer) clearInterval(_gatewayPollTimer);
  _gatewayPollTimer = setInterval(() => { renderSessionList(); }, intervalMs);
}
/**
 * Cancel the fallback polling timer, if one is running, and clear its handle.
 * Does nothing when no timer is active.
 */
function stopGatewayPollFallback(){
  // Nothing to cancel.
  if(!_gatewayPollTimer) return;
  clearInterval(_gatewayPollTimer);
  _gatewayPollTimer = null;
}
/**
 * Ask the gateway stream endpoint (in JSON probe mode, `?probe=1`) whether
 * the server-side watcher is running, and switch between live sync and the
 * periodic polling fallback accordingly.
 *
 * - OK response with a truthy `watcher_running`: stop fallback polling and
 *   re-arm the warning toast.
 * - Any 5xx response, or `watcher_running === false`: start fallback polling
 *   (using `fallback_poll_ms` from the payload when present), refresh the
 *   session list once, and show the warning toast at most once.
 * - Request failure: start fallback polling with the default interval.
 *
 * No-op when a probe is already pending or `window._showCliSessions` is
 * falsy. Errors are handled internally.
 *
 * @returns {Promise<void>}
 */
async function probeGatewaySSEStatus(){
  if(_gatewayProbeInFlight || !window._showCliSessions) return;
  _gatewayProbeInFlight = true;
  try{
    const resp = await fetch('/api/sessions/gateway/stream?probe=1', { credentials:'same-origin' });
    // A non-JSON body (e.g. a proxy error page) is treated as an empty payload.
    const data = await resp.json().catch(() => ({}));
    // The setting may have been switched off while the request was pending;
    // do not start polling or show a toast for a disabled feature.
    if(!window._showCliSessions) return;
    if(resp.ok && data.watcher_running){
      stopGatewayPollFallback();
      _gatewaySSEWarningShown = false;
      return;
    }
    // Any server-side failure (503 from the probe itself, or e.g. 502/504 from
    // a proxy in front of a dead backend) means live sync cannot be relied on.
    if(resp.status >= 500 || data.watcher_running === false){
      startGatewayPollFallback(data.fallback_poll_ms || _gatewayFallbackPollMs);
      renderSessionList();
      if(!_gatewaySSEWarningShown && typeof showToast === 'function'){
        showToast('Gateway sync unavailable — falling back to periodic refresh.', 5000);
        _gatewaySSEWarningShown = true;
      }
    }
  }catch(e){
    // Same guard as above for a request that failed after the setting changed.
    if(!window._showCliSessions) return;
    // Network error during probe — server may be unreachable.
    // Start fallback polling as a safe default; the SSE message handler
    // stops it again once session events are received.
    startGatewayPollFallback(_gatewayFallbackPollMs);
    renderSessionList();
  }finally{
    _gatewayProbeInFlight = false;
  }
}
function startGatewaySSE(){
stopGatewaySSE();
@@ -371,6 +418,8 @@ function startGatewaySSE(){
try{
const data = JSON.parse(ev.data);
if(data.sessions){
stopGatewayPollFallback();
_gatewaySSEWarningShown = false;
renderSessionList(); // re-fetch and re-render
// If the active session received new gateway messages, refresh the conversation view.
// S.busy check prevents stomping on an in-progress WebUI response.
@@ -400,9 +449,11 @@ function startGatewaySSE(){
}catch(e){ /* ignore parse errors */ }
});
_gatewaySSE.onerror = () => {
// EventSource auto-reconnects; no action needed
void probeGatewaySSEStatus();
};
}catch(e){ /* SSE not available */ }
}catch(e){
void probeGatewaySSEStatus();
}
}
function stopGatewaySSE(){
@@ -410,6 +461,9 @@ function stopGatewaySSE(){
_gatewaySSE.close();
_gatewaySSE = null;
}
stopGatewayPollFallback();
_gatewayProbeInFlight = false;
_gatewaySSEWarningShown = false;
}
let _searchDebounceTimer = null;