fix(streaming): eagerly release session lock in cancel_stream() (#778)

cancel_stream() now pops STREAMS/CANCEL_FLAGS/AGENT_INSTANCES and clears session.active_stream_id immediately after signalling cancel. Fixes sessions permanently stuck at 409 when the agent thread is blocked in a bad tool call. Session cleanup runs outside STREAMS_LOCK to preserve lock ordering.

Fixes #653

Co-authored-by: bergeouss <bergeouss@users.noreply.github.com>
This commit is contained in:
nesquena-hermes
2026-04-20 16:54:40 -07:00
committed by GitHub
parent c34892be44
commit a7e8b1ab83
4 changed files with 227 additions and 5 deletions

View File

@@ -1507,7 +1507,16 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
def cancel_stream(stream_id: str) -> bool:
"""Signal an in-flight stream to cancel. Returns True if the stream existed."""
"""Signal an in-flight stream to cancel. Returns True if the stream existed.
Eagerly releases the session lock (pops STREAMS/CANCEL_FLAGS/AGENT_INSTANCES
and clears session.active_stream_id) so new /api/chat/start requests succeed
immediately after cancel, even if the agent thread is still blocked.
The worker thread's finally block uses .pop(key, None), so the double-pop is
a safe no-op. Session cleanup runs outside STREAMS_LOCK to preserve lock
ordering (streaming thread does LOCK → STREAMS_LOCK; inverting would deadlock).
"""
with STREAMS_LOCK:
if stream_id not in STREAMS:
return False
@@ -1552,4 +1561,34 @@ def cancel_stream(stream_id: str) -> bool:
q.put_nowait(('cancel', {'message': 'Cancelled by user'}))
except Exception:
logger.debug("Failed to put cancel event to queue")
# ── Eager session lock release (fixes #653) ──────────────────────────
# Pop stream state now so the 409 guard in routes.py sees the session
# as idle and allows new /api/chat/start immediately after cancel,
# even if the agent thread is still blocked in a C-level syscall.
# The worker thread's finally block uses .pop(key, None) too, so a
# double-pop here is safe (no-op).
STREAMS.pop(stream_id, None)
CANCEL_FLAGS.pop(stream_id, None)
AGENT_INSTANCES.pop(stream_id, None)
# Capture session_id while holding STREAMS_LOCK (avoids a race where
# the agent thread deallocates the agent object after we release).
# Session cleanup (get_session + save) must happen OUTSIDE the lock —
# get_session() acquires LOCK, and the streaming thread does LOCK first
# then STREAMS_LOCK, so inverting the order here would cause deadlock.
_cancel_session_id = getattr(agent, 'session_id', None) if agent else None
# Session cleanup outside STREAMS_LOCK to preserve lock ordering.
if _cancel_session_id:
try:
_cs = get_session(_cancel_session_id)
_cs.active_stream_id = None
_cs.pending_user_message = None
_cs.pending_attachments = []
_cs.pending_started_at = None
_cs.save()
except Exception:
logger.debug("Failed to clear session state on cancel for %s", _cancel_session_id)
return True