fix: harden session persistence and per-session lock handling during streaming (v0.50.175, #910) (#910)

Co-authored-by: starship-s

Co-authored-by: nesquena-hermes <nesquena-hermes@users.noreply.github.com>
This commit is contained in:
nesquena-hermes
2026-04-23 14:25:43 -07:00
committed by GitHub
parent 5082f426f2
commit 5b923a9502
9 changed files with 1237 additions and 429 deletions

View File

@@ -1683,6 +1683,25 @@ SESSION_AGENT_LOCKS_LOCK = threading.Lock()
def _get_session_agent_lock(session_id: str) -> threading.Lock:
"""Return the per-session Lock used to serialize all Session mutations.
Lock lifecycle invariant:
- A Lock is created lazily on first access and lives in SESSION_AGENT_LOCKS
for the lifetime of the session.
- The entry is pruned in /api/session/delete (under SESSION_AGENT_LOCKS_LOCK)
so deleted sessions don't leak a Lock forever.
- During context compression the agent may rotate session_id. The
streaming thread migrates the lock entry atomically under
SESSION_AGENT_LOCKS_LOCK: it aliases the new session_id to the *same*
Lock object and pops the old-id entry (see streaming.py compression
block). This ensures that subsequent callers using the new ID still
acquire the same Lock, while the old-id entry is removed to prevent a
leak. The streaming thread already holds the Lock during this
migration, so the reference stays alive even after the dict entry is
removed.
- Lock contract: hold for the in-memory mutation + s.save() only; never
across network I/O (LLM calls, HTTP requests).
"""
with SESSION_AGENT_LOCKS_LOCK:
if session_id not in SESSION_AGENT_LOCKS:
SESSION_AGENT_LOCKS[session_id] = threading.Lock()

View File

@@ -1,10 +1,9 @@
"""
Hermes Web UI -- Session model and in-memory session store.
"""
"""Hermes Web UI -- Session model and in-memory session store."""
import collections
import json
import logging
import os
import threading
import time
import uuid
from pathlib import Path
@@ -19,6 +18,46 @@ from api.workspace import get_last_workspace
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Stale temp-file cleanup
# ---------------------------------------------------------------------------
# Both Session.save() and _write_session_index() use the atomic-write pattern:
# write to <path-without-suffix>.tmp.<pid>.<tid> → os.replace() to final path
# (the temp name is built with Path.with_suffix(), so it replaces the final
# file's suffix instead of being appended after it).
# If the process crashes between write and replace the .tmp file is left
# behind. Because the name embeds pid + tid, leftover files can never be
# reused by a different process/thread, so they are safe to remove later.
# _cleanup_stale_tmp_files() is called from the full-rebuild path of
# _write_session_index — i.e. at startup / first index access, and on any
# later rebuild triggered by a missing or unreadable index file — and
# removes any *.tmp.* file whose mtime is older than one hour.
# ---------------------------------------------------------------------------
_STALE_TMP_AGE_SECONDS = 3600 # 1 hour
# Serializes index writers so concurrent Session.save() calls cannot race on
# stale baselines while still allowing LOCK to be released before disk I/O.
_INDEX_WRITE_LOCK = threading.RLock()
def _cleanup_stale_tmp_files() -> None:
    """Best-effort removal of stale ``*.tmp.*`` files from SESSION_DIR.

    Only files whose mtime is older than ``_STALE_TMP_AGE_SECONDS`` are
    removed, so that a recently written temp file (which may still belong to
    an in-flight write) is left alone.

    Errors are logged at debug level and swallowed; this function never
    raises, so it cannot prevent the caller from proceeding.
    """
    cutoff = time.time() - _STALE_TMP_AGE_SECONDS
    try:
        for p in SESSION_DIR.glob('*.tmp.*'):
            try:
                if p.stat().st_mtime < cutoff:
                    p.unlink(missing_ok=True)
                    logger.debug("Cleaned up stale tmp file: %s", p.name)
            except OSError as exc:
                # Best-effort: the entry may have vanished between glob() and
                # stat(), or may not be removable. Record it and move on.
                logger.debug(
                    "Could not clean up stale tmp file %s: %s", p.name, exc
                )
    except Exception as exc:
        # The directory itself could not be scanned (e.g. it does not exist
        # yet or is unreadable). That is fine, but leave a trace.
        logger.debug("Stale tmp file sweep skipped: %s", exc)
def _index_entry_exists(session_id: str, in_memory_ids=None) -> bool:
"""Return True if an index entry still has backing state.
@@ -46,58 +85,101 @@ def _write_session_index(updates=None):
entries should be refreshed), this does a targeted in-place update of
the existing index — O(1) for single-session changes. When *updates*
is None, a full rebuild is performed (used on startup / first call).
"""
# Lazy full-rebuild path — used when index doesn't exist yet.
if updates is None or not SESSION_INDEX_FILE.exists():
entries = []
for p in SESSION_DIR.glob('*.json'):
if p.name.startswith('_'): continue
try:
s = Session.load(p.stem)
if s: entries.append(s.compact())
except Exception:
logger.debug("Failed to load session from %s", p)
with LOCK:
for s in SESSIONS.values():
if not any(e['session_id'] == s.session_id for e in entries):
entries.append(s.compact())
entries.sort(key=lambda s: s['updated_at'], reverse=True)
_tmp = SESSION_INDEX_FILE.with_suffix('.tmp')
_tmp.write_text(json.dumps(entries, ensure_ascii=False, indent=2), encoding='utf-8')
os.replace(_tmp, SESSION_INDEX_FILE)
return
# Fast path: patch existing index with updated sessions.
# This avoids loading every session file on every single save().
# LOCK covers the entire read-patch-write to prevent concurrent save() calls
# from both reading the same baseline and one losing its update.
_fallback = False
try:
with LOCK:
existing = json.loads(SESSION_INDEX_FILE.read_text(encoding='utf-8'))
in_memory_ids = set(SESSIONS.keys())
existing = [
e for e in existing
if _index_entry_exists(e.get('session_id'), in_memory_ids=in_memory_ids)
]
# Build lookup of updated entries
updated_map = {s.session_id: s.compact() for s in updates}
existing_ids = {e.get('session_id') for e in existing}
# Add any updated entries not yet in the index
for sid, entry in updated_map.items():
if sid not in existing_ids:
existing.append(entry)
# Replace matching entries in-place
for i, e in enumerate(existing):
sid = e.get('session_id')
if sid in updated_map:
existing[i] = updated_map[sid]
existing.sort(key=lambda s: s.get('updated_at', 0), reverse=True)
_tmp = SESSION_INDEX_FILE.with_suffix('.tmp')
_tmp.write_text(json.dumps(existing, ensure_ascii=False, indent=2), encoding='utf-8')
os.replace(_tmp, SESSION_INDEX_FILE)
except Exception:
_fallback = True
LOCK protects in-memory state snapshots and payload construction only;
disk I/O (write/flush/fsync/replace) always runs outside LOCK.
"""
_tmp = SESSION_INDEX_FILE.with_suffix(f'.tmp.{os.getpid()}.{threading.current_thread().ident}')
with _INDEX_WRITE_LOCK:
# Lazy full-rebuild path — used when index doesn't exist yet.
if updates is None or not SESSION_INDEX_FILE.exists():
_cleanup_stale_tmp_files() # best-effort sweep on startup / first call
entries = []
for p in SESSION_DIR.glob('*.json'):
if p.name.startswith('_'):
continue
try:
s = Session.load(p.stem)
if s:
entries.append(s.compact())
except Exception:
logger.debug("Failed to load session from %s", p)
with LOCK:
existing_ids = {e.get('session_id') for e in entries}
for s in SESSIONS.values():
if s.session_id not in existing_ids:
entries.append(s.compact())
entries.sort(key=lambda s: s.get('updated_at', 0), reverse=True)
_payload = json.dumps(entries, ensure_ascii=False, indent=2)
try:
with open(_tmp, 'w', encoding='utf-8') as f:
f.write(_payload)
f.flush()
os.fsync(f.fileno())
os.replace(_tmp, SESSION_INDEX_FILE)
except Exception:
# Best-effort cleanup of stale tmp on failure
try:
_tmp.unlink(missing_ok=True)
except Exception:
pass
raise
return
# Fast path: patch existing index with updated sessions.
# This avoids loading every session file on every single save().
_fallback = False
try:
with LOCK:
existing = json.loads(SESSION_INDEX_FILE.read_text(encoding='utf-8'))
in_memory_ids = set(SESSIONS.keys())
# Avoid N filesystem exists() checks under LOCK by collecting
# on-disk IDs once.
on_disk_ids = {
p.stem
for p in SESSION_DIR.glob('*.json')
if not p.name.startswith('_')
}
existing = [
e for e in existing
if (e.get('session_id') in in_memory_ids or e.get('session_id') in on_disk_ids)
]
# Build lookup of updated entries
updated_map = {s.session_id: s.compact() for s in updates}
existing_ids = {e.get('session_id') for e in existing}
# Add any updated entries not yet in the index
for sid, entry in updated_map.items():
if sid not in existing_ids:
existing.append(entry)
# Replace matching entries in-place
for i, e in enumerate(existing):
sid = e.get('session_id')
if sid in updated_map:
existing[i] = updated_map[sid]
existing.sort(key=lambda s: s.get('updated_at', 0), reverse=True)
_payload = json.dumps(existing, ensure_ascii=False, indent=2)
try:
with open(_tmp, 'w', encoding='utf-8') as f:
f.write(_payload)
f.flush()
os.fsync(f.fileno())
os.replace(_tmp, SESSION_INDEX_FILE)
except Exception:
try:
_tmp.unlink(missing_ok=True)
except Exception:
pass
raise
except Exception:
_fallback = True
if _fallback:
# Corrupt or missing index — fall back to full rebuild (called outside LOCK to avoid deadlock)
_write_session_index(updates=None)
@@ -157,10 +239,20 @@ class Session:
def save(self, touch_updated_at: bool = True, skip_index: bool = False) -> None:
if touch_updated_at:
self.updated_at = time.time()
self.path.write_text(
json.dumps(self.__dict__, ensure_ascii=False, indent=2),
encoding='utf-8',
)
payload = json.dumps(self.__dict__, ensure_ascii=False, indent=2)
tmp = self.path.with_suffix(f'.tmp.{os.getpid()}.{threading.current_thread().ident}')
try:
with open(tmp, 'w', encoding='utf-8') as f:
f.write(payload)
f.flush()
os.fsync(f.fileno())
os.replace(tmp, self.path)
except Exception:
try:
tmp.unlink(missing_ok=True)
except Exception:
pass
raise
if not skip_index:
_write_session_index(updates=[self])

View File

@@ -66,6 +66,9 @@ from api.config import (
MAX_FILE_BYTES,
MAX_UPLOAD_BYTES,
CHAT_LOCK,
_get_session_agent_lock,
SESSION_AGENT_LOCKS,
SESSION_AGENT_LOCKS_LOCK,
load_settings,
save_settings,
set_hermes_default_model,
@@ -1049,8 +1052,9 @@ def handle_post(handler, parsed) -> bool:
s = get_session(body["session_id"])
except KeyError:
return bad(handler, "Session not found", 404)
s.title = str(body["title"]).strip()[:80] or "Untitled"
s.save()
with _get_session_agent_lock(body["session_id"]):
s.title = str(body["title"]).strip()[:80] or "Untitled"
s.save()
return j(handler, {"session": s.compact()})
if parsed.path == "/api/personality/set":
@@ -1093,8 +1097,9 @@ def handle_post(handler, parsed) -> bool:
prompt = "\n".join(p for p in parts if p)
else:
prompt = str(value)
s.personality = name if name else None
s.save()
with _get_session_agent_lock(sid):
s.personality = name if name else None
s.save()
return j(handler, {"ok": True, "personality": s.personality, "prompt": prompt})
if parsed.path == "/api/session/update":
@@ -1110,9 +1115,10 @@ def handle_post(handler, parsed) -> bool:
new_ws = str(resolve_trusted_workspace(body.get("workspace", s.workspace)))
except ValueError as e:
return bad(handler, str(e))
s.workspace = new_ws
s.model = body.get("model", s.model)
s.save()
with _get_session_agent_lock(body["session_id"]):
s.workspace = new_ws
s.model = body.get("model", s.model)
s.save()
set_last_workspace(new_ws)
return j(handler, {"session": s.compact() | {"messages": s.messages}})
@@ -1134,6 +1140,10 @@ def handle_post(handler, parsed) -> bool:
p.unlink(missing_ok=True)
except Exception:
logger.debug("Failed to unlink session file %s", p)
# Prune the per-session agent lock so deleted sessions don't leak
# Lock entries in SESSION_AGENT_LOCKS forever.
with SESSION_AGENT_LOCKS_LOCK:
SESSION_AGENT_LOCKS.pop(sid, None)
try:
SESSION_INDEX_FILE.unlink(missing_ok=True)
except Exception:
@@ -1156,10 +1166,11 @@ def handle_post(handler, parsed) -> bool:
s = get_session(body["session_id"])
except KeyError:
return bad(handler, "Session not found", 404)
s.messages = []
s.tool_calls = []
s.title = "Untitled"
s.save()
with _get_session_agent_lock(body["session_id"]):
s.messages = []
s.tool_calls = []
s.title = "Untitled"
s.save()
return j(handler, {"ok": True, "session": s.compact()})
if parsed.path == "/api/session/truncate":
@@ -1174,8 +1185,9 @@ def handle_post(handler, parsed) -> bool:
except KeyError:
return bad(handler, "Session not found", 404)
keep = int(body["keep_count"])
s.messages = s.messages[:keep]
s.save()
with _get_session_agent_lock(body["session_id"]):
s.messages = s.messages[:keep]
s.save()
return j(
handler, {"ok": True, "session": s.compact() | {"messages": s.messages}}
)
@@ -1448,8 +1460,9 @@ def handle_post(handler, parsed) -> bool:
s = get_session(body["session_id"])
except KeyError:
return bad(handler, "Session not found", 404)
s.pinned = bool(body.get("pinned", True))
s.save()
with _get_session_agent_lock(body["session_id"]):
s.pinned = bool(body.get("pinned", True))
s.save()
return j(handler, {"ok": True, "session": s.compact()})
# ── Session archive (POST) ──
@@ -1462,8 +1475,9 @@ def handle_post(handler, parsed) -> bool:
s = get_session(body["session_id"])
except KeyError:
return bad(handler, "Session not found", 404)
s.archived = bool(body.get("archived", True))
s.save()
with _get_session_agent_lock(body["session_id"]):
s.archived = bool(body.get("archived", True))
s.save()
return j(handler, {"ok": True, "session": s.compact()})
# ── Session move to project (POST) ──
@@ -1476,8 +1490,9 @@ def handle_post(handler, parsed) -> bool:
s = get_session(body["session_id"])
except KeyError:
return bad(handler, "Session not found", 404)
s.project_id = body.get("project_id") or None
s.save()
with _get_session_agent_lock(body["session_id"]):
s.project_id = body.get("project_id") or None
s.save()
return j(handler, {"ok": True, "session": s.compact()})
# ── Project CRUD (POST) ──
@@ -2445,13 +2460,14 @@ def _handle_chat_start(handler, body):
# Stale stream id from a previous run; clear and continue.
s.active_stream_id = None
stream_id = uuid.uuid4().hex
s.workspace = workspace
s.model = model
s.active_stream_id = stream_id
s.pending_user_message = msg
s.pending_attachments = attachments
s.pending_started_at = time.time()
s.save()
with _get_session_agent_lock(s.session_id):
s.workspace = workspace
s.model = model
s.active_stream_id = stream_id
s.pending_user_message = msg
s.pending_attachments = attachments
s.pending_started_at = time.time()
s.save()
set_last_workspace(workspace)
q = queue.Queue()
with STREAMS_LOCK:
@@ -2470,15 +2486,14 @@ def _handle_chat_start(handler, body):
def _handle_chat_sync(handler, body):
"""Fallback synchronous chat endpoint (POST /api/chat). Not used by frontend."""
from api.config import _get_session_agent_lock
s = get_session(body["session_id"])
msg = str(body.get("message", "")).strip()
if not msg:
return j(handler, {"error": "empty message"}, status=400)
workspace = Path(body.get("workspace") or s.workspace).expanduser().resolve()
s.workspace = str(workspace)
s.model = body.get("model") or s.model
with _get_session_agent_lock(s.session_id):
s.workspace = str(workspace)
s.model = body.get("model") or s.model
from api.streaming import _ENV_LOCK
with _ENV_LOCK:
@@ -2559,14 +2574,15 @@ def _handle_chat_sync(handler, body):
os.environ.pop("HERMES_SESSION_KEY", None)
else:
os.environ["HERMES_SESSION_KEY"] = old_session_key
s.messages = _restore_reasoning_metadata(
_previous_messages,
result.get("messages") or s.messages,
)
# Only auto-generate title when still default; preserves user renames
if s.title == "Untitled":
s.title = title_from(s.messages, s.title)
s.save()
with _get_session_agent_lock(s.session_id):
s.messages = _restore_reasoning_metadata(
_previous_messages,
result.get("messages") or s.messages,
)
# Only auto-generate title when still default; preserves user renames
if s.title == "Untitled":
s.title = title_from(s.messages, s.title)
s.save()
# Sync to state.db for /insights (opt-in setting)
try:
if load_settings().get("sync_to_insights"):
@@ -3094,33 +3110,42 @@ def _handle_session_compress(handler, body):
if not resolved_api_key:
return bad(handler, "No provider configured -- cannot compress.")
with _cfg._get_session_agent_lock(sid):
original_messages = list(messages)
approx_tokens = _estimate_messages_tokens_rough(original_messages)
# Compute compression *outside* the lock — the LLM round-trip can take
# many seconds and we must not block cancel_stream or other writers.
# Lock contract: hold for the in-memory mutation only, never across
# network I/O.
original_messages = list(messages)
approx_tokens = _estimate_messages_tokens_rough(original_messages)
agent = _run_agent.AIAgent(
model=resolved_model,
provider=resolved_provider,
base_url=resolved_base_url,
api_key=resolved_api_key,
platform="cli",
quiet_mode=True,
enabled_toolsets=_resolve_cli_toolsets(),
session_id=sid,
)
compressed = agent.context_compressor.compress(
original_messages,
current_tokens=approx_tokens,
focus_topic=focus_topic,
)
new_tokens = _estimate_messages_tokens_rough(compressed)
summary = _summarize_manual_compression(
original_messages,
compressed,
approx_tokens,
new_tokens,
focus_topic=focus_topic,
)
agent = _run_agent.AIAgent(
model=resolved_model,
provider=resolved_provider,
base_url=resolved_base_url,
api_key=resolved_api_key,
platform="cli",
quiet_mode=True,
enabled_toolsets=_resolve_cli_toolsets(),
session_id=sid,
)
compressed = agent.context_compressor.compress(
original_messages,
current_tokens=approx_tokens,
focus_topic=focus_topic,
)
new_tokens = _estimate_messages_tokens_rough(compressed)
summary = _summarize_manual_compression(
original_messages,
compressed,
approx_tokens,
new_tokens,
focus_topic=focus_topic,
)
with _cfg._get_session_agent_lock(sid):
# Re-read messages to detect concurrent edits during the LLM call.
# If the history changed, the compression result is stale — abort.
if _sanitize_messages_for_api(s.messages) != original_messages:
return bad(handler, "Session was modified during compression; please retry.", 409)
s.messages = compressed
s.tool_calls = []

View File

@@ -9,7 +9,7 @@ from __future__ import annotations
import logging
from typing import Any
from api.config import LOCK
from api.config import LOCK, _get_session_agent_lock
from api.models import get_session, SESSIONS
logger = logging.getLogger(__name__)
@@ -27,38 +27,43 @@ def retry_last(session_id: str) -> dict[str, Any]:
KeyError: session not found
ValueError: no user message in transcript
"""
# get_session() and Session.save() both acquire the module-level LOCK
# internally (the latter via _write_session_index()), and LOCK is a
# non-reentrant threading.Lock — so they MUST be called outside our
# own `with LOCK:` block to avoid self-deadlocking.
#
# The race we close is the read-modify-write of s.messages: two
# concurrent /api/session/retry calls could otherwise both compute the
# same last_user_idx from the same history and double-truncate. We
# serialize just the in-memory mutation; persistence happens outside
# the lock and is naturally last-write-wins on a consistent state.
#
# Stale-object guard: on a cache miss, two concurrent get_session()
# calls can each load and cache a *different* Session instance for the
# same session_id (the second store_clobbers the first). Re-bind to
# the canonical cached instance inside the lock so the mutation lands
# on the object the next reader will see, not a stale parallel copy.
s = get_session(session_id) # raises KeyError if missing
with LOCK:
s = SESSIONS.get(session_id, s)
history = s.messages or []
last_user_idx = None
for i in range(len(history) - 1, -1, -1):
if history[i].get('role') == 'user':
last_user_idx = i
break
if last_user_idx is None:
raise ValueError('No previous message to retry.')
# Acquire the per-session agent lock as the outermost lock so that the
# read-modify-write of s.messages is serialised with the periodic
# checkpoint thread, cancel_stream, and all other session writers.
# Lock ordering: _agent_lock → LOCK → _write_session_index (LOCK).
with _get_session_agent_lock(session_id):
# get_session() and Session.save() both acquire the module-level LOCK
# internally (the latter via _write_session_index()), and LOCK is a
# non-reentrant threading.Lock — so they MUST be called outside our
# own `with LOCK:` block to avoid self-deadlocking.
#
# The race we close is the read-modify-write of s.messages: two
# concurrent /api/session/retry calls could otherwise both compute the
# same last_user_idx from the same history and double-truncate. We
# serialize just the in-memory mutation; persistence happens inside
# the per-session lock so the checkpoint thread cannot race us.
#
# Stale-object guard: on a cache miss, two concurrent get_session()
# calls can each load and cache a *different* Session instance for the
# same session_id (the second store clobbers the first). Re-bind to
# the canonical cached instance inside the lock so the mutation lands
# on the object the next reader will see, not a stale parallel copy.
s = get_session(session_id) # raises KeyError if missing
with LOCK:
s = SESSIONS.get(session_id, s)
history = s.messages or []
last_user_idx = None
for i in range(len(history) - 1, -1, -1):
if history[i].get('role') == 'user':
last_user_idx = i
break
if last_user_idx is None:
raise ValueError('No previous message to retry.')
last_user_text = _extract_text(history[last_user_idx].get('content', ''))
removed_count = len(history) - last_user_idx
s.messages = history[:last_user_idx]
s.save()
last_user_text = _extract_text(history[last_user_idx].get('content', ''))
removed_count = len(history) - last_user_idx
s.messages = history[:last_user_idx]
s.save()
return {'last_user_text': last_user_text, 'removed_count': removed_count}
@@ -72,23 +77,28 @@ def undo_last(session_id: str) -> dict[str, Any]:
KeyError: session not found
ValueError: no user message in transcript
"""
s = get_session(session_id) # acquires LOCK transiently
with LOCK:
# Stale-object guard — see retry_last for the rationale.
s = SESSIONS.get(session_id, s)
history = s.messages or []
last_user_idx = None
for i in range(len(history) - 1, -1, -1):
if history[i].get('role') == 'user':
last_user_idx = i
break
if last_user_idx is None:
raise ValueError('Nothing to undo.')
# Acquire the per-session agent lock as the outermost lock so that the
# read-modify-write of s.messages is serialised with the periodic
# checkpoint thread, cancel_stream, and all other session writers.
# Lock ordering: _agent_lock → LOCK → _write_session_index (LOCK).
with _get_session_agent_lock(session_id):
s = get_session(session_id) # acquires LOCK transiently
with LOCK:
# Stale-object guard — see retry_last for the rationale.
s = SESSIONS.get(session_id, s)
history = s.messages or []
last_user_idx = None
for i in range(len(history) - 1, -1, -1):
if history[i].get('role') == 'user':
last_user_idx = i
break
if last_user_idx is None:
raise ValueError('Nothing to undo.')
removed_text = _extract_text(history[last_user_idx].get('content', ''))
removed_count = len(history) - last_user_idx
s.messages = history[:last_user_idx]
s.save() # outside LOCK -- save() re-acquires LOCK via _write_session_index()
removed_text = _extract_text(history[last_user_idx].get('content', ''))
removed_count = len(history) - last_user_idx
s.messages = history[:last_user_idx]
s.save() # outside LOCK -- save() re-acquires LOCK via _write_session_index()
preview = (removed_text[:40] + '...') if len(removed_text) > 40 else removed_text
return {
'removed_count': removed_count,

View File

@@ -2,6 +2,7 @@
Hermes Web UI -- SSE streaming engine and agent thread runner.
Includes Sprint 10 cancel support via CANCEL_FLAGS.
"""
import contextlib
import json
import logging
import os
@@ -20,6 +21,7 @@ from api.config import (
STREAMS, STREAMS_LOCK, CANCEL_FLAGS, AGENT_INSTANCES, STREAM_PARTIAL_TEXT,
LOCK, SESSIONS, SESSION_DIR,
_get_session_agent_lock, _set_thread_env, _clear_thread_env,
SESSION_AGENT_LOCKS, SESSION_AGENT_LOCKS_LOCK,
resolve_model_provider,
)
from api.helpers import redact_session_data
@@ -534,18 +536,46 @@ def _run_background_title_update(session_id: str, user_text: str, assistant_text
if next_title:
logger.debug("Using local fallback for session title generation")
source = 'fallback'
if next_title and next_title != current:
s.title = next_title
s.llm_title_generated = True
# Keep chronological ordering stable in the sidebar.
s.save(touch_updated_at=False)
wrote_title = False
effective_title = current
if next_title:
# Hold _agent_lock only for in-memory mutation + save so title write
# is serialized with checkpoint saves, cancel_stream, and other
# session-mutating endpoints. The LLM round-trip above ran outside
# the lock to avoid blocking other writers.
with _get_session_agent_lock(session_id):
# Stale-object guard: rebind to the canonical cached Session
# instance under LOCK before checking whether a user rename
# landed while the LLM title request was in-flight.
with LOCK:
s = SESSIONS.get(session_id, s)
effective_title = str(s.title or '').strip()
invalid_existing_now = _looks_invalid_generated_title(s.title)
still_auto = (
effective_title == placeholder_title
or effective_title in ('Untitled', 'New Chat', '')
or _is_provisional_title(effective_title, s.messages)
or invalid_existing_now
)
if not still_auto:
_put_title_status(put_event, session_id, 'skipped', 'manual_title', effective_title)
return
if next_title != effective_title:
s.title = next_title
s.llm_title_generated = True
# Keep chronological ordering stable in the sidebar.
s.save(touch_updated_at=False)
effective_title = s.title
wrote_title = True
if wrote_title:
if source == 'fallback':
_put_title_status(put_event, session_id, source, 'local_summary', s.title, raw_preview)
_put_title_status(put_event, session_id, source, 'local_summary', effective_title, raw_preview)
else:
_put_title_status(put_event, session_id, source, llm_status, s.title, raw_preview)
put_event('title', {'session_id': session_id, 'title': s.title})
_put_title_status(put_event, session_id, source, llm_status, effective_title, raw_preview)
put_event('title', {'session_id': session_id, 'title': effective_title})
else:
_put_title_status(put_event, session_id, 'skipped', source or 'unchanged', current, raw_preview)
_put_title_status(put_event, session_id, 'skipped', source or 'unchanged', effective_title, raw_preview)
finally:
put_event('stream_end', {'session_id': session_id})
@@ -830,6 +860,8 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
# block can safely check `if _checkpoint_stop is not None` even when an
# exception fires before the checkpoint thread is created (Issue #765).
_checkpoint_stop = None
_ckpt_thread = None
_agent_lock = None
try:
s = get_session(session_id)
s.workspace = str(Path(workspace).expanduser().resolve())
@@ -974,6 +1006,11 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
_reasoning_text += str(text)
put('reasoning', {'text': str(text)})
# Pre-initialise the activity counter here so on_tool (which
# closes over it) never captures an unbound name even if this
# block is reordered later (Issue #765).
_checkpoint_activity = [0]
def on_tool(*cb_args, **cb_kwargs):
event_type = None
name = None
@@ -1224,7 +1261,7 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
# response — better than a silent loss of the entire conversation turn.
# The final s.save() at task completion handles the full session update + index.
# (_checkpoint_stop is pre-initialised at the top of the outer try.)
_checkpoint_activity = [0]
# (_checkpoint_activity is already initialised before on_tool().)
def _periodic_checkpoint():
last_saved_activity = 0
@@ -1232,7 +1269,8 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
try:
cur = _checkpoint_activity[0]
if cur > last_saved_activity:
s.save(skip_index=True)
with _agent_lock:
s.save(skip_index=True)
last_saved_activity = cur
except Exception as e:
logger.debug("Periodic checkpoint save failed: %s", e)
@@ -1251,193 +1289,214 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
task_id=session_id,
persist_user_message=msg_text,
)
s.messages = _restore_reasoning_metadata(
_previous_messages,
result.get('messages') or s.messages,
)
# Strip XML tool-call blocks from assistant message content.
# DeepSeek and some other providers emit <function_calls>...</function_calls>
# in the raw response text; this must be removed before the content is
# saved to the session and displayed in the chat bubble. (#702)
for _m in s.messages:
if isinstance(_m, dict) and _m.get('role') == 'assistant':
_raw_content = _m.get('content')
if isinstance(_raw_content, str):
_cleaned = _strip_xml_tool_calls(_raw_content)
if _cleaned != _raw_content:
_m['content'] = _cleaned
elif isinstance(_raw_content, list):
for _part in _raw_content:
if isinstance(_part, dict) and isinstance(_part.get('text'), str):
_part['text'] = _strip_xml_tool_calls(_part['text'])
if _checkpoint_stop is not None:
_checkpoint_stop.set()
if _ckpt_thread is not None:
_ckpt_thread.join(timeout=15)
with _agent_lock:
s.messages = _restore_reasoning_metadata(
_previous_messages,
result.get('messages') or s.messages,
)
# Strip XML tool-call blocks from assistant message content.
# DeepSeek and some other providers emit <function_calls>...</function_calls>
# in the raw response text; this must be removed before the content is
# saved to the session and displayed in the chat bubble. (#702)
for _m in s.messages:
if isinstance(_m, dict) and _m.get('role') == 'assistant':
_raw_content = _m.get('content')
if isinstance(_raw_content, str):
_cleaned = _strip_xml_tool_calls(_raw_content)
if _cleaned != _raw_content:
_m['content'] = _cleaned
elif isinstance(_raw_content, list):
for _part in _raw_content:
if isinstance(_part, dict) and isinstance(_part.get('text'), str):
_part['text'] = _strip_xml_tool_calls(_part['text'])
# ── Detect silent agent failure (no assistant reply produced) ──
# When the agent catches an auth/network error internally it may return
# an empty final_response without raising — the stream would end with
# a done event containing zero assistant messages, leaving the user with
# no feedback. Emit an apperror so the client shows an inline error.
_assistant_added = any(
m.get('role') == 'assistant' and str(m.get('content') or '').strip()
for m in (result.get('messages') or [])
)
# _token_sent tracks whether on_token() was called (any streamed text)
if not _assistant_added and not _token_sent:
_last_err = getattr(agent, '_last_error', None) or result.get('error') or ''
_err_str = str(_last_err) if _last_err else ''
_err_lower = _err_str.lower()
_is_quota = (
'insufficient credit' in _err_lower
or 'credit balance' in _err_lower
or 'credits exhausted' in _err_lower
or 'quota_exceeded' in _err_lower
or 'quota exceeded' in _err_lower
or 'exceeded your current quota' in _err_lower
# ── Detect silent agent failure (no assistant reply produced) ──
# When the agent catches an auth/network error internally it may return
# an empty final_response without raising — the stream would end with
# a done event containing zero assistant messages, leaving the user with
# no feedback. Emit an apperror so the client shows an inline error.
_assistant_added = any(
m.get('role') == 'assistant' and str(m.get('content') or '').strip()
for m in (result.get('messages') or [])
)
_is_auth = (
not _is_quota and (
'401' in _err_str
or (_last_err and 'AuthenticationError' in type(_last_err).__name__)
or 'authentication' in _err_lower
or 'unauthorized' in _err_lower
or 'invalid api key' in _err_lower
or 'invalid_api_key' in _err_lower
# _token_sent tracks whether on_token() was called (any streamed text)
if not _assistant_added and not _token_sent:
_last_err = getattr(agent, '_last_error', None) or result.get('error') or ''
_err_str = str(_last_err) if _last_err else ''
_err_lower = _err_str.lower()
_is_quota = (
'insufficient credit' in _err_lower
or 'credit balance' in _err_lower
or 'credits exhausted' in _err_lower
or 'quota_exceeded' in _err_lower
or 'quota exceeded' in _err_lower
or 'exceeded your current quota' in _err_lower
)
_is_auth = (
not _is_quota and (
'401' in _err_str
or (_last_err and 'AuthenticationError' in type(_last_err).__name__)
or 'authentication' in _err_lower
or 'unauthorized' in _err_lower
or 'invalid api key' in _err_lower
or 'invalid_api_key' in _err_lower
)
)
if _is_quota:
_err_label = 'Out of credits'
_err_type = 'quota_exhausted'
_err_hint = 'Your provider account is out of credits. Top up your balance or switch providers via `hermes model`.'
elif _is_auth:
_err_label = 'Authentication failed'
_err_type = 'auth_mismatch'
_err_hint = (
'The selected model may not be supported by your configured provider or '
'your API key is invalid. Run `hermes model` in your terminal to '
'update credentials, then restart the WebUI.'
)
else:
_err_label = 'No response received'
_err_type = 'no_response'
_err_hint = 'Verify your API key is valid and the selected model is available for your account.'
put('apperror', {
'message': _err_str or f'{_err_label}.',
'type': _err_type,
'hint': _err_hint,
})
# Clear stream/pending state so the session does not appear
# "agent_running" on reload after a silent failure.
# Persist the error so it survives page reload.
# _error=True ensures _sanitize_messages_for_api excludes it from
# subsequent API calls so the LLM never sees its own error as prior context.
s.active_stream_id = None
s.pending_user_message = None
s.pending_attachments = []
s.pending_started_at = None
s.messages.append({
'role': 'assistant',
'content': f'**{_err_label}:** {_err_str or _err_label}\n\n*{_err_hint}*',
'timestamp': int(time.time()),
'_error': True,
})
try:
s.save()
except Exception:
pass
return # apperror already closes the stream on the client side
# ── Handle context compression side effects ──
# If compression fired inside run_conversation, the agent may have
# rotated its session_id. Detect and fix the mismatch so the WebUI
# continues writing to the correct session file.
#
# Lock migration: when session_id rotates, we alias the new ID to
# the *same* Lock object under SESSION_AGENT_LOCKS so that
# subsequent callers using _get_session_agent_lock(new_sid) get the
# same Lock the streaming thread is already holding. We then pop
# the old-id entry to prevent a leak. This is safe because we
# already hold _agent_lock (the Lock object itself), so the
# reference stays alive even after the dict entry is removed.
# Concurrent readers that already looked up the old ID will still
# see the same Lock object until they release it.
_agent_sid = getattr(agent, 'session_id', None)
_compressed = False
if _agent_sid and _agent_sid != session_id:
old_sid = session_id
new_sid = _agent_sid
# Rename the session file
old_path = SESSION_DIR / f'{old_sid}.json'
new_path = SESSION_DIR / f'{new_sid}.json'
s.session_id = new_sid
with LOCK:
if old_sid in SESSIONS:
SESSIONS[new_sid] = SESSIONS.pop(old_sid)
# Migrate the per-session lock: alias new_sid to the held
# _agent_lock reference directly (not via old_sid lookup),
# then remove the old_sid entry to prevent a leak.
with SESSION_AGENT_LOCKS_LOCK:
SESSION_AGENT_LOCKS[new_sid] = _agent_lock
SESSION_AGENT_LOCKS.pop(old_sid, None)
if old_path.exists() and not new_path.exists():
try:
old_path.rename(new_path)
except OSError:
logger.debug("Failed to rename session file during compression")
_compressed = True
# Also detect compression via the result dict or compressor state
if not _compressed:
_compressor = getattr(agent, 'context_compressor', None)
if _compressor and getattr(_compressor, 'compression_count', 0) > 0:
_compressed = True
# Notify the frontend that compression happened
if _compressed:
put('compressed', {
'message': 'Context auto-compressed to continue the conversation',
})
# Stamp 'timestamp' on any messages that don't have one yet
_now = time.time()
for _m in s.messages:
if isinstance(_m, dict) and not _m.get('timestamp') and not _m.get('_ts'):
_m['timestamp'] = int(_now)
# Only auto-generate title when still default; preserves user renames
if s.title == 'Untitled' or s.title == 'New Chat' or not s.title:
s.title = title_from(s.messages, s.title)
_looks_default = (s.title == 'Untitled' or s.title == 'New Chat' or not s.title)
_looks_provisional = _is_provisional_title(s.title, s.messages)
_invalid_existing_title = _looks_invalid_generated_title(s.title)
_should_bg_title = (
(_looks_default or _looks_provisional or _invalid_existing_title)
and (not getattr(s, 'llm_title_generated', False) or _invalid_existing_title)
)
if _is_quota:
_err_label = 'Out of credits'
_err_type = 'quota_exhausted'
_err_hint = 'Your provider account is out of credits. Top up your balance or switch providers via `hermes model`.'
elif _is_auth:
_err_label = 'Authentication failed'
_err_type = 'auth_mismatch'
_err_hint = (
'The selected model may not be supported by your configured provider or '
'your API key is invalid. Run `hermes model` in your terminal to '
'update credentials, then restart the WebUI.'
)
else:
_err_label = 'No response received'
_err_type = 'no_response'
_err_hint = 'Verify your API key is valid and the selected model is available for your account.'
put('apperror', {
'message': _err_str or f'{_err_label}.',
'type': _err_type,
'hint': _err_hint,
})
# Clear stream/pending state so the session does not appear
# "agent_running" on reload after a silent failure.
_u0 = ''
_a0 = ''
if _should_bg_title:
_u0, _a0 = _first_exchange_snippets(s.messages)
# Read token/cost usage from the agent object (if available)
input_tokens = getattr(agent, 'session_prompt_tokens', 0) or 0
output_tokens = getattr(agent, 'session_completion_tokens', 0) or 0
estimated_cost = getattr(agent, 'session_estimated_cost_usd', None)
s.input_tokens = (s.input_tokens or 0) + input_tokens
s.output_tokens = (s.output_tokens or 0) + output_tokens
if estimated_cost:
s.estimated_cost = (s.estimated_cost or 0) + estimated_cost
# Persist tool-call summaries even when the final message history only
# kept bare tool rows and omitted explicit assistant tool_call IDs.
tool_calls = _extract_tool_calls_from_messages(
s.messages,
live_tool_calls=_live_tool_calls,
)
s.tool_calls = tool_calls
s.active_stream_id = None
s.pending_user_message = None
s.pending_attachments = []
s.pending_started_at = None
# Persist the error so it survives page reload.
# _error=True ensures _sanitize_messages_for_api excludes it from
# subsequent API calls so the LLM never sees its own error as prior context.
s.messages.append({
'role': 'assistant',
'content': f'**{_err_label}:** {_err_str or _err_label}\n\n*{_err_hint}*',
'timestamp': int(time.time()),
'_error': True,
})
try:
s.save()
except Exception:
pass
return # apperror already closes the stream on the client side
# ── Handle context compression side effects ──
# If compression fired inside run_conversation, the agent may have
# rotated its session_id. Detect and fix the mismatch so the WebUI
# continues writing to the correct session file.
_agent_sid = getattr(agent, 'session_id', None)
_compressed = False
if _agent_sid and _agent_sid != session_id:
old_sid = session_id
new_sid = _agent_sid
# Rename the session file
old_path = SESSION_DIR / f'{old_sid}.json'
new_path = SESSION_DIR / f'{new_sid}.json'
s.session_id = new_sid
with LOCK:
if old_sid in SESSIONS:
SESSIONS[new_sid] = SESSIONS.pop(old_sid)
if old_path.exists() and not new_path.exists():
try:
old_path.rename(new_path)
except OSError:
logger.debug("Failed to rename session file during compression")
_compressed = True
# Also detect compression via the result dict or compressor state
if not _compressed:
_compressor = getattr(agent, 'context_compressor', None)
if _compressor and getattr(_compressor, 'compression_count', 0) > 0:
_compressed = True
# Notify the frontend that compression happened
if _compressed:
put('compressed', {
'message': 'Context auto-compressed to continue the conversation',
})
# Stamp 'timestamp' on any messages that don't have one yet
_now = time.time()
for _m in s.messages:
if isinstance(_m, dict) and not _m.get('timestamp') and not _m.get('_ts'):
_m['timestamp'] = int(_now)
# Only auto-generate title when still default; preserves user renames
if s.title == 'Untitled' or s.title == 'New Chat' or not s.title:
s.title = title_from(s.messages, s.title)
_looks_default = (s.title == 'Untitled' or s.title == 'New Chat' or not s.title)
_looks_provisional = _is_provisional_title(s.title, s.messages)
_invalid_existing_title = _looks_invalid_generated_title(s.title)
_should_bg_title = (
(_looks_default or _looks_provisional or _invalid_existing_title)
and (not getattr(s, 'llm_title_generated', False) or _invalid_existing_title)
)
_u0 = ''
_a0 = ''
if _should_bg_title:
_u0, _a0 = _first_exchange_snippets(s.messages)
# Read token/cost usage from the agent object (if available)
input_tokens = getattr(agent, 'session_prompt_tokens', 0) or 0
output_tokens = getattr(agent, 'session_completion_tokens', 0) or 0
estimated_cost = getattr(agent, 'session_estimated_cost_usd', None)
s.input_tokens = (s.input_tokens or 0) + input_tokens
s.output_tokens = (s.output_tokens or 0) + output_tokens
if estimated_cost:
s.estimated_cost = (s.estimated_cost or 0) + estimated_cost
# Persist tool-call summaries even when the final message history only
# kept bare tool rows and omitted explicit assistant tool_call IDs.
tool_calls = _extract_tool_calls_from_messages(
s.messages,
live_tool_calls=_live_tool_calls,
)
s.tool_calls = tool_calls
s.active_stream_id = None
s.pending_user_message = None
s.pending_attachments = []
s.pending_started_at = None
# Tag the matching user message with attachment filenames for display on reload
# Only tag a user message whose content relates to this turn's text
# (msg_text is the full message including the [Attached files: ...] suffix)
if attachments:
for m in reversed(s.messages):
if m.get('role') == 'user':
content = str(m.get('content', ''))
# Match if content is part of the sent message or vice-versa
base_text = msg_text.split('\n\n[Attached files:')[0].strip() if '\n\n[Attached files:' in msg_text else msg_text
if base_text[:60] in content or content[:60] in msg_text:
m['attachments'] = attachments
# Tag the matching user message with attachment filenames for display on reload
# Only tag a user message whose content relates to this turn's text
# (msg_text is the full message including the [Attached files: ...] suffix)
if attachments:
for m in reversed(s.messages):
if m.get('role') == 'user':
content = str(m.get('content', ''))
# Match if content is part of the sent message or vice-versa
base_text = msg_text.split('\n\n[Attached files:')[0].strip() if '\n\n[Attached files:' in msg_text else msg_text
if base_text[:60] in content or content[:60] in msg_text:
m['attachments'] = attachments
break
# Persist reasoning trace in the session so it survives reload.
# Must run BEFORE s.save() — otherwise the mutation lives only in
# memory until the next turn's save, and the last-turn thinking card
# is lost when the user reloads immediately after a response.
if _reasoning_text and s.messages:
for _rm in reversed(s.messages):
if isinstance(_rm, dict) and _rm.get('role') == 'assistant':
_rm['reasoning'] = _reasoning_text
break
# Persist reasoning trace in the session so it survives reload.
# Must run BEFORE s.save() — otherwise the mutation lives only in
# memory until the next turn's save, and the last-turn thinking card
# is lost when the user reloads immediately after a response.
if _reasoning_text and s.messages:
for _rm in reversed(s.messages):
if isinstance(_rm, dict) and _rm.get('role') == 'assistant':
_rm['reasoning'] = _reasoning_text
break
s.save()
s.save()
# Sync to state.db for /insights (opt-in setting)
try:
from api.config import load_settings as _load_settings
@@ -1543,23 +1602,29 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
else:
_exc_label, _exc_type, _exc_hint = 'Error', 'error', ''
if s is not None:
s.active_stream_id = None
s.pending_user_message = None
s.pending_attachments = []
s.pending_started_at = None
if _checkpoint_stop is not None:
_checkpoint_stop.set()
if _ckpt_thread is not None:
_ckpt_thread.join(timeout=15)
# Persist the error so it survives page reload.
# _error=True ensures _sanitize_messages_for_api excludes it from subsequent
# API calls so the LLM never sees its own error as prior context on the next turn.
s.messages.append({
'role': 'assistant',
'content': f'**{_exc_label}:** {err_str}' + (f'\n\n*{_exc_hint}*' if _exc_hint else ''),
'timestamp': int(time.time()),
'_error': True,
})
try:
s.save()
except Exception:
pass
_lock_ctx = _agent_lock if _agent_lock is not None else contextlib.nullcontext()
with _lock_ctx:
s.active_stream_id = None
s.pending_user_message = None
s.pending_attachments = []
s.pending_started_at = None
s.messages.append({
'role': 'assistant',
'content': f'**{_exc_label}:** {err_str}' + (f'\n\n*{_exc_hint}*' if _exc_hint else ''),
'timestamp': int(time.time()),
'_error': True,
})
try:
s.save()
except Exception:
pass
_apperror_payload: dict = {'message': err_str, 'type': _exc_type}
if _exc_hint:
_apperror_payload['hint'] = _exc_hint
@@ -1568,6 +1633,8 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
# Stop periodic checkpoint thread if it was started (Issue #765)
if _checkpoint_stop is not None:
_checkpoint_stop.set()
if _ckpt_thread is not None:
_ckpt_thread.join(timeout=15)
_clear_thread_env() # TD1: always clear thread-local context
with STREAMS_LOCK:
STREAMS.pop(stream_id, None)
@@ -1662,55 +1729,60 @@ def cancel_stream(stream_id: str) -> bool:
_cancel_partial_text = STREAM_PARTIAL_TEXT.get(stream_id, '')
# Session cleanup outside STREAMS_LOCK to preserve lock ordering.
# Acquire the per-session _agent_lock too, mirroring every other session
# writer (streaming success/error paths, periodic checkpoint, POST endpoints)
# so the cancel-path mutation races neither the checkpoint thread nor
# concurrent undo/retry calls.
if _cancel_session_id:
try:
_cs = get_session(_cancel_session_id)
_cs.active_stream_id = None
_cs.pending_user_message = None
_cs.pending_attachments = []
_cs.pending_started_at = None
# Persist any partial assistant text that was streamed before cancel (#893).
# Preserving partial content means the user sees what the agent had
# produced rather than losing it entirely. The marker is _partial=True
# (for session/UI identification) — NOT _error=True — so the partial
# content IS kept in the history sent to the agent on the next user
# message, letting the model continue from where it was cut off.
# See the inner comment on the append call below for the rationale.
partial_text = _cancel_partial_text.strip() if _cancel_partial_text else ''
if partial_text:
import re as _re
# Strip thinking/reasoning markup from partial content before saving.
# First pass: remove complete <think>...</think> and <thinking>...</thinking> blocks.
_stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*?</think(?:ing)?>',
'', partial_text,
flags=_re.DOTALL | _re.IGNORECASE).strip()
# Second pass: strip trailing UNCLOSED think/thinking block (the common
# cancel case — user stops mid-reasoning before the close tag appears).
_stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*',
'', _stripped,
flags=_re.DOTALL | _re.IGNORECASE).strip()
if _stripped:
# Mark _partial=True for session/UI identification only.
# Deliberately NOT _error=True — the partial content is real model
# output and should be visible in conversation history so the model
# can continue from it on the next turn (#893).
_cs.messages.append({
'role': 'assistant',
'content': _stripped,
'_partial': True,
'timestamp': int(time.time()),
})
# Cancel marker — flagged _error=True so it is stripped from conversation
# history on the next turn (prevents model from seeing "Task cancelled."
# as a prior assistant reply).
_cs.messages.append({
'role': 'assistant',
'content': '*Task cancelled.*',
'_error': True,
'timestamp': int(time.time()),
})
_cs.save()
except Exception:
logger.debug("Failed to clear session state on cancel for %s", _cancel_session_id)
with _get_session_agent_lock(_cancel_session_id):
try:
_cs = get_session(_cancel_session_id)
_cs.active_stream_id = None
_cs.pending_user_message = None
_cs.pending_attachments = []
_cs.pending_started_at = None
# Persist any partial assistant text that was streamed before cancel (#893).
# Preserving partial content means the user sees what the agent had
# produced rather than losing it entirely. The marker is _partial=True
# (for session/UI identification only) — NOT _error=True — so the partial
# content IS kept in the history sent to the agent on the next user
# message, letting the model continue from where it was cut off.
# See the inner comment on the append call below for the rationale.
partial_text = _cancel_partial_text.strip() if _cancel_partial_text else ''
if partial_text:
import re as _re
# Strip thinking/reasoning markup from partial content before saving.
# First pass: remove complete <thinking>...</thinking> blocks.
_stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*?</think(?:ing)?>',
'', partial_text,
flags=_re.DOTALL | _re.IGNORECASE).strip()
# Second pass: strip trailing UNCLOSED think/thinking block (the common
# cancel case — user stops mid-reasoning before the close tag appears).
_stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*',
'', _stripped,
flags=_re.DOTALL | _re.IGNORECASE).strip()
if _stripped:
# Mark _partial=True for session/UI identification only.
# Deliberately NOT _error=True — the partial content is real model
# output and should be visible in conversation history so the model
# can continue from it on the next turn (#893).
_cs.messages.append({
'role': 'assistant',
'content': _stripped,
'_partial': True,
'timestamp': int(time.time()),
})
# Cancel marker — flagged _error=True so it is stripped from conversation
# history on the next turn (prevents model from seeing "Task cancelled."
# as a prior assistant reply).
_cs.messages.append({
'role': 'assistant',
'content': '*Task cancelled.*',
'_error': True,
'timestamp': int(time.time()),
})
_cs.save()
except Exception:
logger.debug("Failed to clear session state on cancel for %s", _cancel_session_id)
return True