fix(sessions): surface gateway SSE failures and add polling fallback (#828)

* fix(sessions): surface gateway SSE failures and add polling fallback

- add a JSON probe mode for the gateway SSE endpoint
- detect watcher-unavailable 503s from the browser
- fall back to periodic session refresh with a toast
- add probe payload tests and endpoint coverage

Fixes #635

* fix(sessions): surface gateway SSE failures and add polling fallback (#826)

Absorbed from PR #826 by @cloudyun888 (fixes #635).

When the gateway watcher thread is not running, the browser now shows a
toast notification and falls back to 30-second periodic polling for session
sync. Previously the SSE failure was completely silent with no user feedback.

Changes from original PR:
- Deleted misplaced test_gateway_sse_probe_unit.py (was at repo root, not
  discovered by `pytest tests/`); unit tests moved into tests/test_gateway_sync.py
- _gateway_sse_probe_payload now checks watcher._thread.is_alive() rather
  than just watcher is not None — a watcher instance with a dead poll thread
  now correctly reports unavailable and activates the polling fallback
- probeGatewaySSEStatus catch(e) now starts the polling fallback on network
  error rather than silently swallowing the failure
- Added 5 unit tests covering all watcher-alive/dead/missing/disabled branches

Co-authored-by: cloudyun888 <269269188+86cloudyun-afk@users.noreply.github.com>

* cleanup(gateway): public is_alive() + dedup probe/live watcher-alive check + changelog

Three small cleanups on top of @cloudyun888's PR #826 absorption:

1. Add GatewayWatcher.is_alive() public accessor so routes.py doesn't
   reach into the private _thread attribute.  The existing private-
   attribute check stays as a defensive fallback for any older in-
   memory instance or test double that doesn't implement the full API.

2. Dedupe the watcher_alive computation in _handle_gateway_sse_stream:
   the live-SSE path now calls _gateway_sse_probe_payload(...) and reads
   its watcher_running field instead of re-deriving the same logic
   inline.  Keeps probe and SSE in sync automatically.

3. CHANGELOG trailer was (#826, fixes #635, @cloudyun888) — this PR is
   #828, so updated to (#828, absorbs PR #826 by @cloudyun888, fixes
   #635) matching the repo convention for absorbed PRs (see #805).

Added two regression tests:
- test_gateway_watcher_is_alive_public_method — covers the three
  lifecycle states (before start, while running, after stop).
- test_probe_payload_prefers_public_is_alive — asserts the probe
  uses watcher.is_alive() rather than poking _thread when the
  public method exists.

Full suite: 1735 passed, 0 new failures.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: cloudyun888 <269269188+86cloudyun-afk@users.noreply.github.com>
Co-authored-by: nesquena-hermes <nesquena-hermes@users.noreply.github.com>
Co-authored-by: Nathan Esquenazi <nesquena@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
nesquena-hermes
2026-04-21 21:18:55 -07:00
committed by GitHub
parent 3daf2427f7
commit d4a3adb7b1
5 changed files with 264 additions and 8 deletions

View File

@@ -352,6 +352,23 @@ def test_gateway_sse_stream_endpoint_exists():
post('/api/settings', {'show_cli_sessions': False})
def test_gateway_sse_stream_probe_reports_status():
"""Probe mode returns JSON watcher status instead of holding open an SSE stream."""
post('/api/settings', {'show_cli_sessions': True})
try:
req = urllib.request.Request(BASE + '/api/sessions/gateway/stream?probe=1')
with urllib.request.urlopen(req, timeout=5) as r:
assert r.status == 200, f"Expected 200, got {r.status}"
ctype = r.headers.get('Content-Type', '')
assert 'application/json' in ctype, f"Expected application/json, got {ctype}"
data = json.loads(r.read().decode('utf-8'))
assert data['enabled'] is True
assert 'watcher_running' in data
assert data['fallback_poll_ms'] == 30000
finally:
post('/api/settings', {'show_cli_sessions': False})
def test_gateway_webui_sessions_not_duplicated():
"""If a session_id exists both in WebUI store and state.db, it's not duplicated."""
# Create a WebUI session with a known ID
@@ -418,3 +435,124 @@ def test_cli_sessions_still_work():
except Exception:
pass
post('/api/settings', {'show_cli_sessions': False})
# ── Unit tests for _gateway_sse_probe_payload ────────────────────────────────
# These replace the deleted repo-root test_gateway_sse_probe_unit.py and account
# for the watcher_alive check (thread existence + is_alive()).
import sys
import threading
sys.path.insert(0, str(REPO_ROOT))
from api.routes import _gateway_sse_probe_payload
def test_probe_payload_when_disabled():
"""Probe returns 404 when show_cli_sessions is False."""
body, status = _gateway_sse_probe_payload({'show_cli_sessions': False}, watcher=None)
assert status == 404
assert body['ok'] is False
assert body['enabled'] is False
assert body['watcher_running'] is False
assert body['error'] == 'agent sessions not enabled'
assert body['fallback_poll_ms'] == 30000
def test_probe_payload_when_watcher_missing():
"""Probe returns 503 when enabled but no watcher instance."""
body, status = _gateway_sse_probe_payload({'show_cli_sessions': True}, watcher=None)
assert status == 503
assert body['ok'] is False
assert body['enabled'] is True
assert body['watcher_running'] is False
assert body['error'] == 'watcher not started'
assert body['fallback_poll_ms'] == 30000
def test_probe_payload_when_watcher_instance_no_thread():
"""Probe returns 503 when watcher exists but _thread attribute is missing/None."""
class _FakeWatcher:
_thread = None
body, status = _gateway_sse_probe_payload({'show_cli_sessions': True}, watcher=_FakeWatcher())
assert status == 503
assert body['watcher_running'] is False
def test_probe_payload_when_watcher_thread_alive():
"""Probe returns 200 when enabled and watcher thread is alive."""
class _FakeWatcher:
pass
w = _FakeWatcher()
t = threading.Thread(target=lambda: None)
t.daemon = True
t.start()
w._thread = t
# Thread may finish fast — loop-start a live daemon thread for reliability
import time as _time
done = threading.Event()
live = threading.Thread(target=done.wait, daemon=True)
live.start()
w._thread = live
try:
body, status = _gateway_sse_probe_payload({'show_cli_sessions': True}, watcher=w)
assert status == 200
assert body['ok'] is True
assert body['watcher_running'] is True
assert body['fallback_poll_ms'] == 30000
finally:
done.set()
live.join(timeout=1)
def test_probe_payload_when_watcher_thread_dead():
"""Probe returns 503 when watcher instance exists but thread has exited."""
class _FakeWatcher:
pass
w = _FakeWatcher()
t = threading.Thread(target=lambda: None)
t.start()
t.join() # wait for it to finish
w._thread = t
body, status = _gateway_sse_probe_payload({'show_cli_sessions': True}, watcher=w)
assert status == 503
assert body['watcher_running'] is False
assert body['ok'] is False
def test_gateway_watcher_is_alive_public_method():
"""GatewayWatcher.is_alive() is the public API the probe uses. Cover all
three states: before start(), while running, after stop()."""
from api.gateway_watcher import GatewayWatcher
w = GatewayWatcher()
# Before start(): no thread
assert w.is_alive() is False, "is_alive() must be False before start()"
# After start(): thread running
w.start()
try:
assert w.is_alive() is True, "is_alive() must be True while running"
finally:
w.stop()
# After stop(): thread cleared
assert w.is_alive() is False, "is_alive() must be False after stop()"
def test_probe_payload_prefers_public_is_alive():
"""Regression guard: _gateway_sse_probe_payload must call watcher.is_alive()
rather than poking at _thread directly when the public method exists."""
calls = []
class _WatcherWithPublicApi:
def is_alive(self):
calls.append('is_alive')
return True
# _thread is deliberately absent — must not be accessed.
body, status = _gateway_sse_probe_payload(
{'show_cli_sessions': True},
watcher=_WatcherWithPublicApi(),
)
assert status == 200
assert body['watcher_running'] is True
assert calls == ['is_alive'], (
"probe must prefer the public is_alive() method over poking _thread"
)