fix: harden session persistence and per-session lock handling during streaming (v0.50.175, #910) (#910)

Co-authored-by: starship-s Co-authored-by: nesquena-hermes <nesquena-hermes@users.noreply.github.com>
2026-04-23 14:25:43 -07:00
parent 5082f426f2
commit 5b923a9502
9 changed files with 1237 additions and 429 deletions
--- a/api/config.py
+++ b/api/config.py
@@ -1683,6 +1683,25 @@ SESSION_AGENT_LOCKS_LOCK = threading.Lock()


 def _get_session_agent_lock(session_id: str) -> threading.Lock:
+    """Return the per-session Lock used to serialize all Session mutations.
+
+    Lock lifecycle invariant:
+      - A Lock is created lazily on first access and lives in SESSION_AGENT_LOCKS
+        for the lifetime of the session.
+      - The entry is pruned in /api/session/delete (under SESSION_AGENT_LOCKS_LOCK)
+        so deleted sessions don't leak a Lock forever.
+      - During context compression the agent may rotate session_id.  The
+        streaming thread migrates the lock entry atomically under
+        SESSION_AGENT_LOCKS_LOCK: it aliases the new session_id to the *same*
+        Lock object and pops the old-id entry (see streaming.py compression
+        block).  This ensures that subsequent callers using the new ID still
+        acquire the same Lock, while the old-id entry is removed to prevent a
+        leak.  The streaming thread already holds the Lock during this
+        migration, so the reference stays alive even after the dict entry is
+        removed.
+      - Lock contract: hold for the in-memory mutation + s.save() only; never
+        across network I/O (LLM calls, HTTP requests).
+    """
    with SESSION_AGENT_LOCKS_LOCK:
        if session_id not in SESSION_AGENT_LOCKS:
            SESSION_AGENT_LOCKS[session_id] = threading.Lock()
--- a/api/models.py
+++ b/api/models.py
@@ -1,10 +1,9 @@
-"""
-Hermes Web UI -- Session model and in-memory session store.
-"""
+"""Hermes Web UI -- Session model and in-memory session store."""
 import collections
 import json
 import logging
 import os
+import threading
 import time
 import uuid
 from pathlib import Path
@@ -19,6 +18,46 @@ from api.workspace import get_last_workspace

 logger = logging.getLogger(__name__)

+# ---------------------------------------------------------------------------
+# Stale temp-file cleanup
+# ---------------------------------------------------------------------------
+# Both Session.save() and _write_session_index() use the atomic-write pattern:
+#   write to  <path>.tmp.<pid>.<tid>  →  os.replace() to final path
+# If the process crashes between write and replace the .tmp file is left
+# behind.  Because the name embeds pid + tid, leftover files can never be
+# reused by a different process/thread, so they are safe to remove on the
+# next startup.  _cleanup_stale_tmp_files() is called from the full-rebuild
+# path of _write_session_index (i.e. at first index access / startup) and
+# removes any *.tmp.* file whose mtime is older than one hour.
+# ---------------------------------------------------------------------------
+
+_STALE_TMP_AGE_SECONDS = 3600  # 1 hour
+
+# Serializes index writers so concurrent Session.save() calls cannot race on
+# stale baselines while still allowing LOCK to be released before disk I/O.
+_INDEX_WRITE_LOCK = threading.RLock()
+
+
+def _cleanup_stale_tmp_files() -> None:
+    """Best-effort removal of stale ``*.tmp.*`` files from SESSION_DIR.
+
+    Only files whose mtime is older than ``_STALE_TMP_AGE_SECONDS`` are
+    removed so that in-flight writes from a long-running sibling process
+    are not disturbed.  Errors are logged and swallowed — this must never
+    prevent startup.
+    """
+    cutoff = time.time() - _STALE_TMP_AGE_SECONDS
+    try:
+        for p in SESSION_DIR.glob('*.tmp.*'):
+            try:
+                if p.stat().st_mtime < cutoff:
+                    p.unlink(missing_ok=True)
+                    logger.debug("Cleaned up stale tmp file: %s", p.name)
+            except OSError:
+                pass  # best-effort
+    except Exception:
+        pass  # SESSION_DIR may not exist yet; that's fine
+

 def _index_entry_exists(session_id: str, in_memory_ids=None) -> bool:
    """Return True if an index entry still has backing state.
@@ -46,58 +85,101 @@ def _write_session_index(updates=None):
    entries should be refreshed), this does a targeted in-place update of
    the existing index — O(1) for single-session changes.  When *updates*
    is None, a full rebuild is performed (used on startup / first call).
-    """
-    # Lazy full-rebuild path — used when index doesn't exist yet.
-    if updates is None or not SESSION_INDEX_FILE.exists():
-        entries = []
-        for p in SESSION_DIR.glob('*.json'):
-            if p.name.startswith('_'): continue
-            try:
-                s = Session.load(p.stem)
-                if s: entries.append(s.compact())
-            except Exception:
-                logger.debug("Failed to load session from %s", p)
-        with LOCK:
-            for s in SESSIONS.values():
-                if not any(e['session_id'] == s.session_id for e in entries):
-                    entries.append(s.compact())
-            entries.sort(key=lambda s: s['updated_at'], reverse=True)
-            _tmp = SESSION_INDEX_FILE.with_suffix('.tmp')
-            _tmp.write_text(json.dumps(entries, ensure_ascii=False, indent=2), encoding='utf-8')
-            os.replace(_tmp, SESSION_INDEX_FILE)
-        return

-    # Fast path: patch existing index with updated sessions.
-    # This avoids loading every session file on every single save().
-    # LOCK covers the entire read-patch-write to prevent concurrent save() calls
-    # from both reading the same baseline and one losing its update.
-    _fallback = False
-    try:
-        with LOCK:
-            existing = json.loads(SESSION_INDEX_FILE.read_text(encoding='utf-8'))
-            in_memory_ids = set(SESSIONS.keys())
-            existing = [
-                e for e in existing
-                if _index_entry_exists(e.get('session_id'), in_memory_ids=in_memory_ids)
-            ]
-            # Build lookup of updated entries
-            updated_map = {s.session_id: s.compact() for s in updates}
-            existing_ids = {e.get('session_id') for e in existing}
-            # Add any updated entries not yet in the index
-            for sid, entry in updated_map.items():
-                if sid not in existing_ids:
-                    existing.append(entry)
-            # Replace matching entries in-place
-            for i, e in enumerate(existing):
-                sid = e.get('session_id')
-                if sid in updated_map:
-                    existing[i] = updated_map[sid]
-            existing.sort(key=lambda s: s.get('updated_at', 0), reverse=True)
-            _tmp = SESSION_INDEX_FILE.with_suffix('.tmp')
-            _tmp.write_text(json.dumps(existing, ensure_ascii=False, indent=2), encoding='utf-8')
-            os.replace(_tmp, SESSION_INDEX_FILE)
-    except Exception:
-        _fallback = True
+    LOCK protects in-memory state snapshots and payload construction only;
+    disk I/O (write/flush/fsync/replace) always runs outside LOCK.
+    """
+    _tmp = SESSION_INDEX_FILE.with_suffix(f'.tmp.{os.getpid()}.{threading.current_thread().ident}')
+
+    with _INDEX_WRITE_LOCK:
+        # Lazy full-rebuild path — used when index doesn't exist yet.
+        if updates is None or not SESSION_INDEX_FILE.exists():
+            _cleanup_stale_tmp_files()  # best-effort sweep on startup / first call
+            entries = []
+            for p in SESSION_DIR.glob('*.json'):
+                if p.name.startswith('_'):
+                    continue
+                try:
+                    s = Session.load(p.stem)
+                    if s:
+                        entries.append(s.compact())
+                except Exception:
+                    logger.debug("Failed to load session from %s", p)
+
+            with LOCK:
+                existing_ids = {e.get('session_id') for e in entries}
+                for s in SESSIONS.values():
+                    if s.session_id not in existing_ids:
+                        entries.append(s.compact())
+                entries.sort(key=lambda s: s.get('updated_at', 0), reverse=True)
+                _payload = json.dumps(entries, ensure_ascii=False, indent=2)
+
+            try:
+                with open(_tmp, 'w', encoding='utf-8') as f:
+                    f.write(_payload)
+                    f.flush()
+                    os.fsync(f.fileno())
+                os.replace(_tmp, SESSION_INDEX_FILE)
+            except Exception:
+                # Best-effort cleanup of stale tmp on failure
+                try:
+                    _tmp.unlink(missing_ok=True)
+                except Exception:
+                    pass
+                raise
+            return
+
+        # Fast path: patch existing index with updated sessions.
+        # This avoids loading every session file on every single save().
+        _fallback = False
+        try:
+            with LOCK:
+                existing = json.loads(SESSION_INDEX_FILE.read_text(encoding='utf-8'))
+                in_memory_ids = set(SESSIONS.keys())
+
+                # Avoid N filesystem exists() checks under LOCK by collecting
+                # on-disk IDs once.
+                on_disk_ids = {
+                    p.stem
+                    for p in SESSION_DIR.glob('*.json')
+                    if not p.name.startswith('_')
+                }
+
+                existing = [
+                    e for e in existing
+                    if (e.get('session_id') in in_memory_ids or e.get('session_id') in on_disk_ids)
+                ]
+
+                # Build lookup of updated entries
+                updated_map = {s.session_id: s.compact() for s in updates}
+                existing_ids = {e.get('session_id') for e in existing}
+                # Add any updated entries not yet in the index
+                for sid, entry in updated_map.items():
+                    if sid not in existing_ids:
+                        existing.append(entry)
+                # Replace matching entries in-place
+                for i, e in enumerate(existing):
+                    sid = e.get('session_id')
+                    if sid in updated_map:
+                        existing[i] = updated_map[sid]
+                existing.sort(key=lambda s: s.get('updated_at', 0), reverse=True)
+                _payload = json.dumps(existing, ensure_ascii=False, indent=2)
+
+            try:
+                with open(_tmp, 'w', encoding='utf-8') as f:
+                    f.write(_payload)
+                    f.flush()
+                    os.fsync(f.fileno())
+                os.replace(_tmp, SESSION_INDEX_FILE)
+            except Exception:
+                try:
+                    _tmp.unlink(missing_ok=True)
+                except Exception:
+                    pass
+                raise
+        except Exception:
+            _fallback = True
+
    if _fallback:
        # Corrupt or missing index — fall back to full rebuild (called outside LOCK to avoid deadlock)
        _write_session_index(updates=None)
@@ -157,10 +239,20 @@ class Session:
    def save(self, touch_updated_at: bool = True, skip_index: bool = False) -> None:
        if touch_updated_at:
            self.updated_at = time.time()
-        self.path.write_text(
-            json.dumps(self.__dict__, ensure_ascii=False, indent=2),
-            encoding='utf-8',
-        )
+        payload = json.dumps(self.__dict__, ensure_ascii=False, indent=2)
+        tmp = self.path.with_suffix(f'.tmp.{os.getpid()}.{threading.current_thread().ident}')
+        try:
+            with open(tmp, 'w', encoding='utf-8') as f:
+                f.write(payload)
+                f.flush()
+                os.fsync(f.fileno())
+            os.replace(tmp, self.path)
+        except Exception:
+            try:
+                tmp.unlink(missing_ok=True)
+            except Exception:
+                pass
+            raise
        if not skip_index:
            _write_session_index(updates=[self])

--- a/api/routes.py
+++ b/api/routes.py
@@ -66,6 +66,9 @@ from api.config import (
    MAX_FILE_BYTES,
    MAX_UPLOAD_BYTES,
    CHAT_LOCK,
+    _get_session_agent_lock,
+    SESSION_AGENT_LOCKS,
+    SESSION_AGENT_LOCKS_LOCK,
    load_settings,
    save_settings,
    set_hermes_default_model,
@@ -1049,8 +1052,9 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.title = str(body["title"]).strip()[:80] or "Untitled"
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.title = str(body["title"]).strip()[:80] or "Untitled"
+            s.save()
        return j(handler, {"session": s.compact()})

    if parsed.path == "/api/personality/set":
@@ -1093,8 +1097,9 @@ def handle_post(handler, parsed) -> bool:
                prompt = "\n".join(p for p in parts if p)
            else:
                prompt = str(value)
-        s.personality = name if name else None
-        s.save()
+        with _get_session_agent_lock(sid):
+            s.personality = name if name else None
+            s.save()
        return j(handler, {"ok": True, "personality": s.personality, "prompt": prompt})

    if parsed.path == "/api/session/update":
@@ -1110,9 +1115,10 @@ def handle_post(handler, parsed) -> bool:
            new_ws = str(resolve_trusted_workspace(body.get("workspace", s.workspace)))
        except ValueError as e:
            return bad(handler, str(e))
-        s.workspace = new_ws
-        s.model = body.get("model", s.model)
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.workspace = new_ws
+            s.model = body.get("model", s.model)
+            s.save()
        set_last_workspace(new_ws)
        return j(handler, {"session": s.compact() | {"messages": s.messages}})

@@ -1134,6 +1140,10 @@ def handle_post(handler, parsed) -> bool:
            p.unlink(missing_ok=True)
        except Exception:
            logger.debug("Failed to unlink session file %s", p)
+        # Prune the per-session agent lock so deleted sessions don't leak
+        # Lock entries in SESSION_AGENT_LOCKS forever.
+        with SESSION_AGENT_LOCKS_LOCK:
+            SESSION_AGENT_LOCKS.pop(sid, None)
        try:
            SESSION_INDEX_FILE.unlink(missing_ok=True)
        except Exception:
@@ -1156,10 +1166,11 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.messages = []
-        s.tool_calls = []
-        s.title = "Untitled"
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.messages = []
+            s.tool_calls = []
+            s.title = "Untitled"
+            s.save()
        return j(handler, {"ok": True, "session": s.compact()})

    if parsed.path == "/api/session/truncate":
@@ -1174,8 +1185,9 @@ def handle_post(handler, parsed) -> bool:
        except KeyError:
            return bad(handler, "Session not found", 404)
        keep = int(body["keep_count"])
-        s.messages = s.messages[:keep]
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.messages = s.messages[:keep]
+            s.save()
        return j(
            handler, {"ok": True, "session": s.compact() | {"messages": s.messages}}
        )
@@ -1448,8 +1460,9 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.pinned = bool(body.get("pinned", True))
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.pinned = bool(body.get("pinned", True))
+            s.save()
        return j(handler, {"ok": True, "session": s.compact()})

    # ── Session archive (POST) ──
@@ -1462,8 +1475,9 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.archived = bool(body.get("archived", True))
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.archived = bool(body.get("archived", True))
+            s.save()
        return j(handler, {"ok": True, "session": s.compact()})

    # ── Session move to project (POST) ──
@@ -1476,8 +1490,9 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.project_id = body.get("project_id") or None
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.project_id = body.get("project_id") or None
+            s.save()
        return j(handler, {"ok": True, "session": s.compact()})

    # ── Project CRUD (POST) ──
@@ -2445,13 +2460,14 @@ def _handle_chat_start(handler, body):
        # Stale stream id from a previous run; clear and continue.
        s.active_stream_id = None
    stream_id = uuid.uuid4().hex
-    s.workspace = workspace
-    s.model = model
-    s.active_stream_id = stream_id
-    s.pending_user_message = msg
-    s.pending_attachments = attachments
-    s.pending_started_at = time.time()
-    s.save()
+    with _get_session_agent_lock(s.session_id):
+        s.workspace = workspace
+        s.model = model
+        s.active_stream_id = stream_id
+        s.pending_user_message = msg
+        s.pending_attachments = attachments
+        s.pending_started_at = time.time()
+        s.save()
    set_last_workspace(workspace)
    q = queue.Queue()
    with STREAMS_LOCK:
@@ -2470,15 +2486,14 @@ def _handle_chat_start(handler, body):

 def _handle_chat_sync(handler, body):
    """Fallback synchronous chat endpoint (POST /api/chat). Not used by frontend."""
-    from api.config import _get_session_agent_lock
-
    s = get_session(body["session_id"])
    msg = str(body.get("message", "")).strip()
    if not msg:
        return j(handler, {"error": "empty message"}, status=400)
    workspace = Path(body.get("workspace") or s.workspace).expanduser().resolve()
-    s.workspace = str(workspace)
-    s.model = body.get("model") or s.model
+    with _get_session_agent_lock(s.session_id):
+        s.workspace = str(workspace)
+        s.model = body.get("model") or s.model
    from api.streaming import _ENV_LOCK

    with _ENV_LOCK:
@@ -2559,14 +2574,15 @@ def _handle_chat_sync(handler, body):
                os.environ.pop("HERMES_SESSION_KEY", None)
            else:
                os.environ["HERMES_SESSION_KEY"] = old_session_key
-    s.messages = _restore_reasoning_metadata(
-        _previous_messages,
-        result.get("messages") or s.messages,
-    )
-    # Only auto-generate title when still default; preserves user renames
-    if s.title == "Untitled":
-        s.title = title_from(s.messages, s.title)
-    s.save()
+    with _get_session_agent_lock(s.session_id):
+        s.messages = _restore_reasoning_metadata(
+            _previous_messages,
+            result.get("messages") or s.messages,
+        )
+        # Only auto-generate title when still default; preserves user renames
+        if s.title == "Untitled":
+            s.title = title_from(s.messages, s.title)
+        s.save()
    # Sync to state.db for /insights (opt-in setting)
    try:
        if load_settings().get("sync_to_insights"):
@@ -3094,33 +3110,42 @@ def _handle_session_compress(handler, body):
        if not resolved_api_key:
            return bad(handler, "No provider configured -- cannot compress.")

-        with _cfg._get_session_agent_lock(sid):
-            original_messages = list(messages)
-            approx_tokens = _estimate_messages_tokens_rough(original_messages)
+        # Compute compression *outside* the lock — the LLM round-trip can take
+        # many seconds and we must not block cancel_stream or other writers.
+        # Lock contract: hold for the in-memory mutation only, never across
+        # network I/O.
+        original_messages = list(messages)
+        approx_tokens = _estimate_messages_tokens_rough(original_messages)

-            agent = _run_agent.AIAgent(
-                model=resolved_model,
-                provider=resolved_provider,
-                base_url=resolved_base_url,
-                api_key=resolved_api_key,
-                platform="cli",
-                quiet_mode=True,
-                enabled_toolsets=_resolve_cli_toolsets(),
-                session_id=sid,
-            )
-            compressed = agent.context_compressor.compress(
-                original_messages,
-                current_tokens=approx_tokens,
-                focus_topic=focus_topic,
-            )
-            new_tokens = _estimate_messages_tokens_rough(compressed)
-            summary = _summarize_manual_compression(
-                original_messages,
-                compressed,
-                approx_tokens,
-                new_tokens,
-                focus_topic=focus_topic,
-            )
+        agent = _run_agent.AIAgent(
+            model=resolved_model,
+            provider=resolved_provider,
+            base_url=resolved_base_url,
+            api_key=resolved_api_key,
+            platform="cli",
+            quiet_mode=True,
+            enabled_toolsets=_resolve_cli_toolsets(),
+            session_id=sid,
+        )
+        compressed = agent.context_compressor.compress(
+            original_messages,
+            current_tokens=approx_tokens,
+            focus_topic=focus_topic,
+        )
+        new_tokens = _estimate_messages_tokens_rough(compressed)
+        summary = _summarize_manual_compression(
+            original_messages,
+            compressed,
+            approx_tokens,
+            new_tokens,
+            focus_topic=focus_topic,
+        )
+
+        with _cfg._get_session_agent_lock(sid):
+            # Re-read messages to detect concurrent edits during the LLM call.
+            # If the history changed, the compression result is stale — abort.
+            if _sanitize_messages_for_api(s.messages) != original_messages:
+                return bad(handler, "Session was modified during compression; please retry.", 409)

            s.messages = compressed
            s.tool_calls = []
--- a/api/session_ops.py
+++ b/api/session_ops.py
@@ -9,7 +9,7 @@ from __future__ import annotations
 import logging
 from typing import Any

-from api.config import LOCK
+from api.config import LOCK, _get_session_agent_lock
 from api.models import get_session, SESSIONS

 logger = logging.getLogger(__name__)
@@ -27,38 +27,43 @@ def retry_last(session_id: str) -> dict[str, Any]:
        KeyError: session not found
        ValueError: no user message in transcript
    """
-    # get_session() and Session.save() both acquire the module-level LOCK
-    # internally (the latter via _write_session_index()), and LOCK is a
-    # non-reentrant threading.Lock — so they MUST be called outside our
-    # own `with LOCK:` block to avoid self-deadlocking.
-    #
-    # The race we close is the read-modify-write of s.messages: two
-    # concurrent /api/session/retry calls could otherwise both compute the
-    # same last_user_idx from the same history and double-truncate. We
-    # serialize just the in-memory mutation; persistence happens outside
-    # the lock and is naturally last-write-wins on a consistent state.
-    #
-    # Stale-object guard: on a cache miss, two concurrent get_session()
-    # calls can each load and cache a *different* Session instance for the
-    # same session_id (the second store_clobbers the first). Re-bind to
-    # the canonical cached instance inside the lock so the mutation lands
-    # on the object the next reader will see, not a stale parallel copy.
-    s = get_session(session_id)  # raises KeyError if missing
-    with LOCK:
-        s = SESSIONS.get(session_id, s)
-        history = s.messages or []
-        last_user_idx = None
-        for i in range(len(history) - 1, -1, -1):
-            if history[i].get('role') == 'user':
-                last_user_idx = i
-                break
-        if last_user_idx is None:
-            raise ValueError('No previous message to retry.')
+    # Acquire the per-session agent lock as the outermost lock so that the
+    # read-modify-write of s.messages is serialised with the periodic
+    # checkpoint thread, cancel_stream, and all other session writers.
+    # Lock ordering: _agent_lock → LOCK → _write_session_index (LOCK).
+    with _get_session_agent_lock(session_id):
+        # get_session() and Session.save() both acquire the module-level LOCK
+        # internally (the latter via _write_session_index()), and LOCK is a
+        # non-reentrant threading.Lock — so they MUST be called outside our
+        # own `with LOCK:` block to avoid self-deadlocking.
+        #
+        # The race we close is the read-modify-write of s.messages: two
+        # concurrent /api/session/retry calls could otherwise both compute the
+        # same last_user_idx from the same history and double-truncate. We
+        # serialize just the in-memory mutation; persistence happens inside
+        # the per-session lock so the checkpoint thread cannot race us.
+        #
+        # Stale-object guard: on a cache miss, two concurrent get_session()
+        # calls can each load and cache a *different* Session instance for the
+        # same session_id (the second store clobbers the first). Re-bind to
+        # the canonical cached instance inside the lock so the mutation lands
+        # on the object the next reader will see, not a stale parallel copy.
+        s = get_session(session_id)  # raises KeyError if missing
+        with LOCK:
+            s = SESSIONS.get(session_id, s)
+            history = s.messages or []
+            last_user_idx = None
+            for i in range(len(history) - 1, -1, -1):
+                if history[i].get('role') == 'user':
+                    last_user_idx = i
+                    break
+            if last_user_idx is None:
+                raise ValueError('No previous message to retry.')

-        last_user_text = _extract_text(history[last_user_idx].get('content', ''))
-        removed_count = len(history) - last_user_idx
-        s.messages = history[:last_user_idx]
-    s.save()
+            last_user_text = _extract_text(history[last_user_idx].get('content', ''))
+            removed_count = len(history) - last_user_idx
+            s.messages = history[:last_user_idx]
+        s.save()
    return {'last_user_text': last_user_text, 'removed_count': removed_count}


@@ -72,23 +77,28 @@ def undo_last(session_id: str) -> dict[str, Any]:
        KeyError: session not found
        ValueError: no user message in transcript
    """
-    s = get_session(session_id)  # acquires LOCK transiently
-    with LOCK:
-        # Stale-object guard — see retry_last for the rationale.
-        s = SESSIONS.get(session_id, s)
-        history = s.messages or []
-        last_user_idx = None
-        for i in range(len(history) - 1, -1, -1):
-            if history[i].get('role') == 'user':
-                last_user_idx = i
-                break
-        if last_user_idx is None:
-            raise ValueError('Nothing to undo.')
+    # Acquire the per-session agent lock as the outermost lock so that the
+    # read-modify-write of s.messages is serialised with the periodic
+    # checkpoint thread, cancel_stream, and all other session writers.
+    # Lock ordering: _agent_lock → LOCK → _write_session_index (LOCK).
+    with _get_session_agent_lock(session_id):
+        s = get_session(session_id)  # acquires LOCK transiently
+        with LOCK:
+            # Stale-object guard — see retry_last for the rationale.
+            s = SESSIONS.get(session_id, s)
+            history = s.messages or []
+            last_user_idx = None
+            for i in range(len(history) - 1, -1, -1):
+                if history[i].get('role') == 'user':
+                    last_user_idx = i
+                    break
+            if last_user_idx is None:
+                raise ValueError('Nothing to undo.')

-        removed_text = _extract_text(history[last_user_idx].get('content', ''))
-        removed_count = len(history) - last_user_idx
-        s.messages = history[:last_user_idx]
-    s.save()  # outside LOCK -- save() re-acquires LOCK via _write_session_index()
+            removed_text = _extract_text(history[last_user_idx].get('content', ''))
+            removed_count = len(history) - last_user_idx
+            s.messages = history[:last_user_idx]
+        s.save()  # outside LOCK -- save() re-acquires LOCK via _write_session_index()
    preview = (removed_text[:40] + '...') if len(removed_text) > 40 else removed_text
    return {
        'removed_count': removed_count,
--- a/api/streaming.py
+++ b/api/streaming.py
@@ -2,6 +2,7 @@
 Hermes Web UI -- SSE streaming engine and agent thread runner.
 Includes Sprint 10 cancel support via CANCEL_FLAGS.
 """
+import contextlib
 import json
 import logging
 import os
@@ -20,6 +21,7 @@ from api.config import (
    STREAMS, STREAMS_LOCK, CANCEL_FLAGS, AGENT_INSTANCES, STREAM_PARTIAL_TEXT,
    LOCK, SESSIONS, SESSION_DIR,
    _get_session_agent_lock, _set_thread_env, _clear_thread_env,
+    SESSION_AGENT_LOCKS, SESSION_AGENT_LOCKS_LOCK,
    resolve_model_provider,
 )
 from api.helpers import redact_session_data
@@ -534,18 +536,46 @@ def _run_background_title_update(session_id: str, user_text: str, assistant_text
            if next_title:
                logger.debug("Using local fallback for session title generation")
                source = 'fallback'
-        if next_title and next_title != current:
-            s.title = next_title
-            s.llm_title_generated = True
-            # Keep chronological ordering stable in the sidebar.
-            s.save(touch_updated_at=False)
+        wrote_title = False
+        effective_title = current
+        if next_title:
+            # Hold _agent_lock only for in-memory mutation + save so title write
+            # is serialized with checkpoint saves, cancel_stream, and other
+            # session-mutating endpoints. The LLM round-trip above ran outside
+            # the lock to avoid blocking other writers.
+            with _get_session_agent_lock(session_id):
+                # Stale-object guard: rebind to the canonical cached Session
+                # instance under LOCK before checking whether a user rename
+                # landed while the LLM title request was in-flight.
+                with LOCK:
+                    s = SESSIONS.get(session_id, s)
+                    effective_title = str(s.title or '').strip()
+                    invalid_existing_now = _looks_invalid_generated_title(s.title)
+                    still_auto = (
+                        effective_title == placeholder_title
+                        or effective_title in ('Untitled', 'New Chat', '')
+                        or _is_provisional_title(effective_title, s.messages)
+                        or invalid_existing_now
+                    )
+                if not still_auto:
+                    _put_title_status(put_event, session_id, 'skipped', 'manual_title', effective_title)
+                    return
+                if next_title != effective_title:
+                    s.title = next_title
+                    s.llm_title_generated = True
+                    # Keep chronological ordering stable in the sidebar.
+                    s.save(touch_updated_at=False)
+                    effective_title = s.title
+                    wrote_title = True
+
+        if wrote_title:
            if source == 'fallback':
-                _put_title_status(put_event, session_id, source, 'local_summary', s.title, raw_preview)
+                _put_title_status(put_event, session_id, source, 'local_summary', effective_title, raw_preview)
            else:
-                _put_title_status(put_event, session_id, source, llm_status, s.title, raw_preview)
-            put_event('title', {'session_id': session_id, 'title': s.title})
+                _put_title_status(put_event, session_id, source, llm_status, effective_title, raw_preview)
+            put_event('title', {'session_id': session_id, 'title': effective_title})
        else:
-            _put_title_status(put_event, session_id, 'skipped', source or 'unchanged', current, raw_preview)
+            _put_title_status(put_event, session_id, 'skipped', source or 'unchanged', effective_title, raw_preview)
    finally:
        put_event('stream_end', {'session_id': session_id})

@@ -830,6 +860,8 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
    # block can safely check `if _checkpoint_stop is not None` even when an
    # exception fires before the checkpoint thread is created (Issue #765).
    _checkpoint_stop = None
+    _ckpt_thread = None
+    _agent_lock = None
    try:
        s = get_session(session_id)
        s.workspace = str(Path(workspace).expanduser().resolve())
@@ -974,6 +1006,11 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
                _reasoning_text += str(text)
                put('reasoning', {'text': str(text)})

+            # Pre-initialise the activity counter here so on_tool (which
+            # closes over it) never captures an unbound name even if this
+            # block is reordered later (Issue #765).
+            _checkpoint_activity = [0]
+
            def on_tool(*cb_args, **cb_kwargs):
                event_type = None
                name = None
@@ -1224,7 +1261,7 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
            # response — better than a silent loss of the entire conversation turn.
            # The final s.save() at task completion handles the full session update + index.
            # (_checkpoint_stop is pre-initialised at the top of the outer try.)
-            _checkpoint_activity = [0]
+            # (_checkpoint_activity is already initialised before on_tool().)

            def _periodic_checkpoint():
                last_saved_activity = 0
@@ -1232,7 +1269,8 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
                    try:
                        cur = _checkpoint_activity[0]
                        if cur > last_saved_activity:
-                            s.save(skip_index=True)
+                            with _agent_lock:
+                                s.save(skip_index=True)
                            last_saved_activity = cur
                    except Exception as e:
                        logger.debug("Periodic checkpoint save failed: %s", e)
@@ -1251,193 +1289,214 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
                task_id=session_id,
                persist_user_message=msg_text,
            )
-            s.messages = _restore_reasoning_metadata(
-                _previous_messages,
-                result.get('messages') or s.messages,
-            )
-            # Strip XML tool-call blocks from assistant message content.
-            # DeepSeek and some other providers emit <function_calls>...</function_calls>
-            # in the raw response text; this must be removed before the content is
-            # saved to the session and displayed in the chat bubble. (#702)
-            for _m in s.messages:
-                if isinstance(_m, dict) and _m.get('role') == 'assistant':
-                    _raw_content = _m.get('content')
-                    if isinstance(_raw_content, str):
-                        _cleaned = _strip_xml_tool_calls(_raw_content)
-                        if _cleaned != _raw_content:
-                            _m['content'] = _cleaned
-                    elif isinstance(_raw_content, list):
-                        for _part in _raw_content:
-                            if isinstance(_part, dict) and isinstance(_part.get('text'), str):
-                                _part['text'] = _strip_xml_tool_calls(_part['text'])
+            if _checkpoint_stop is not None:
+                _checkpoint_stop.set()
+            if _ckpt_thread is not None:
+                _ckpt_thread.join(timeout=15)
+            with _agent_lock:
+                s.messages = _restore_reasoning_metadata(
+                    _previous_messages,
+                    result.get('messages') or s.messages,
+                )
+                # Strip XML tool-call blocks from assistant message content.
+                # DeepSeek and some other providers emit <function_calls>...</function_calls>
+                # in the raw response text; this must be removed before the content is
+                # saved to the session and displayed in the chat bubble. (#702)
+                for _m in s.messages:
+                    if isinstance(_m, dict) and _m.get('role') == 'assistant':
+                        _raw_content = _m.get('content')
+                        if isinstance(_raw_content, str):
+                            _cleaned = _strip_xml_tool_calls(_raw_content)
+                            if _cleaned != _raw_content:
+                                _m['content'] = _cleaned
+                        elif isinstance(_raw_content, list):
+                            for _part in _raw_content:
+                                if isinstance(_part, dict) and isinstance(_part.get('text'), str):
+                                    _part['text'] = _strip_xml_tool_calls(_part['text'])

-            # ── Detect silent agent failure (no assistant reply produced) ──
-            # When the agent catches an auth/network error internally it may return
-            # an empty final_response without raising — the stream would end with
-            # a done event containing zero assistant messages, leaving the user with
-            # no feedback. Emit an apperror so the client shows an inline error.
-            _assistant_added = any(
-                m.get('role') == 'assistant' and str(m.get('content') or '').strip()
-                for m in (result.get('messages') or [])
-            )
-            # _token_sent tracks whether on_token() was called (any streamed text)
-            if not _assistant_added and not _token_sent:
-                _last_err = getattr(agent, '_last_error', None) or result.get('error') or ''
-                _err_str = str(_last_err) if _last_err else ''
-                _err_lower = _err_str.lower()
-                _is_quota = (
-                    'insufficient credit' in _err_lower
-                    or 'credit balance' in _err_lower
-                    or 'credits exhausted' in _err_lower
-                    or 'quota_exceeded' in _err_lower
-                    or 'quota exceeded' in _err_lower
-                    or 'exceeded your current quota' in _err_lower
+                # ── Detect silent agent failure (no assistant reply produced) ──
+                # When the agent catches an auth/network error internally it may return
+                # an empty final_response without raising — the stream would end with
+                # a done event containing zero assistant messages, leaving the user with
+                # no feedback. Emit an apperror so the client shows an inline error.
+                _assistant_added = any(
+                    m.get('role') == 'assistant' and str(m.get('content') or '').strip()
+                    for m in (result.get('messages') or [])
                )
-                _is_auth = (
-                    not _is_quota and (
-                        '401' in _err_str
-                        or (_last_err and 'AuthenticationError' in type(_last_err).__name__)
-                        or 'authentication' in _err_lower
-                        or 'unauthorized' in _err_lower
-                        or 'invalid api key' in _err_lower
-                        or 'invalid_api_key' in _err_lower
+                # _token_sent tracks whether on_token() was called (any streamed text)
+                if not _assistant_added and not _token_sent:
+                    _last_err = getattr(agent, '_last_error', None) or result.get('error') or ''
+                    _err_str = str(_last_err) if _last_err else ''
+                    _err_lower = _err_str.lower()
+                    _is_quota = (
+                        'insufficient credit' in _err_lower
+                        or 'credit balance' in _err_lower
+                        or 'credits exhausted' in _err_lower
+                        or 'quota_exceeded' in _err_lower
+                        or 'quota exceeded' in _err_lower
+                        or 'exceeded your current quota' in _err_lower
                    )
+                    _is_auth = (
+                        not _is_quota and (
+                            '401' in _err_str
+                            or (_last_err and 'AuthenticationError' in type(_last_err).__name__)
+                            or 'authentication' in _err_lower
+                            or 'unauthorized' in _err_lower
+                            or 'invalid api key' in _err_lower
+                            or 'invalid_api_key' in _err_lower
+                        )
+                    )
+                    if _is_quota:
+                        _err_label = 'Out of credits'
+                        _err_type = 'quota_exhausted'
+                        _err_hint = 'Your provider account is out of credits. Top up your balance or switch providers via `hermes model`.'
+                    elif _is_auth:
+                        _err_label = 'Authentication failed'
+                        _err_type = 'auth_mismatch'
+                        _err_hint = (
+                            'The selected model may not be supported by your configured provider or '
+                            'your API key is invalid. Run `hermes model` in your terminal to '
+                            'update credentials, then restart the WebUI.'
+                        )
+                    else:
+                        _err_label = 'No response received'
+                        _err_type = 'no_response'
+                        _err_hint = 'Verify your API key is valid and the selected model is available for your account.'
+                    put('apperror', {
+                        'message': _err_str or f'{_err_label}.',
+                        'type': _err_type,
+                        'hint': _err_hint,
+                    })
+                    # Clear stream/pending state so the session does not appear
+                    # "agent_running" on reload after a silent failure.
+                    # Persist the error so it survives page reload.
+                    # _error=True ensures _sanitize_messages_for_api excludes it from
+                    # subsequent API calls so the LLM never sees its own error as prior context.
+                    s.active_stream_id = None
+                    s.pending_user_message = None
+                    s.pending_attachments = []
+                    s.pending_started_at = None
+                    s.messages.append({
+                        'role': 'assistant',
+                        'content': f'**{_err_label}:** {_err_str or _err_label}\n\n*{_err_hint}*',
+                        'timestamp': int(time.time()),
+                        '_error': True,
+                    })
+                    try:
+                        s.save()
+                    except Exception:
+                        pass
+                    return  # apperror already closes the stream on the client side
+
+                # ── Handle context compression side effects ──
+                # If compression fired inside run_conversation, the agent may have
+                # rotated its session_id. Detect and fix the mismatch so the WebUI
+                # continues writing to the correct session file.
+                #
+                # Lock migration: when session_id rotates, we alias the new ID to
+                # the *same* Lock object under SESSION_AGENT_LOCKS so that
+                # subsequent callers using _get_session_agent_lock(new_sid) get the
+                # same Lock the streaming thread is already holding.  We then pop
+                # the old-id entry to prevent a leak.  This is safe because we
+                # already hold _agent_lock (the Lock object itself), so the
+                # reference stays alive even after the dict entry is removed.
+                # Concurrent readers that already looked up the old ID will still
+                # see the same Lock object until they release it.
+                _agent_sid = getattr(agent, 'session_id', None)
+                _compressed = False
+                if _agent_sid and _agent_sid != session_id:
+                    old_sid = session_id
+                    new_sid = _agent_sid
+                    # Rename the session file
+                    old_path = SESSION_DIR / f'{old_sid}.json'
+                    new_path = SESSION_DIR / f'{new_sid}.json'
+                    s.session_id = new_sid
+                    with LOCK:
+                        if old_sid in SESSIONS:
+                            SESSIONS[new_sid] = SESSIONS.pop(old_sid)
+                    # Migrate the per-session lock: alias new_sid to the held
+                    # _agent_lock reference directly (not via old_sid lookup),
+                    # then remove the old_sid entry to prevent a leak.
+                    with SESSION_AGENT_LOCKS_LOCK:
+                        SESSION_AGENT_LOCKS[new_sid] = _agent_lock
+                        SESSION_AGENT_LOCKS.pop(old_sid, None)
+                    if old_path.exists() and not new_path.exists():
+                        try:
+                            old_path.rename(new_path)
+                        except OSError:
+                            logger.debug("Failed to rename session file during compression")
+                    _compressed = True
+                # Also detect compression via the result dict or compressor state
+                if not _compressed:
+                    _compressor = getattr(agent, 'context_compressor', None)
+                    if _compressor and getattr(_compressor, 'compression_count', 0) > 0:
+                        _compressed = True
+                # Notify the frontend that compression happened
+                if _compressed:
+                    put('compressed', {
+                        'message': 'Context auto-compressed to continue the conversation',
+                    })
+
+                # Stamp 'timestamp' on any messages that don't have one yet
+                _now = time.time()
+                for _m in s.messages:
+                    if isinstance(_m, dict) and not _m.get('timestamp') and not _m.get('_ts'):
+                        _m['timestamp'] = int(_now)
+                # Only auto-generate title when still default; preserves user renames
+                if s.title == 'Untitled' or s.title == 'New Chat' or not s.title:
+                    s.title = title_from(s.messages, s.title)
+                _looks_default = (s.title == 'Untitled' or s.title == 'New Chat' or not s.title)
+                _looks_provisional = _is_provisional_title(s.title, s.messages)
+                _invalid_existing_title = _looks_invalid_generated_title(s.title)
+                _should_bg_title = (
+                    (_looks_default or _looks_provisional or _invalid_existing_title)
+                    and (not getattr(s, 'llm_title_generated', False) or _invalid_existing_title)
                )
-                if _is_quota:
-                    _err_label = 'Out of credits'
-                    _err_type = 'quota_exhausted'
-                    _err_hint = 'Your provider account is out of credits. Top up your balance or switch providers via `hermes model`.'
-                elif _is_auth:
-                    _err_label = 'Authentication failed'
-                    _err_type = 'auth_mismatch'
-                    _err_hint = (
-                        'The selected model may not be supported by your configured provider or '
-                        'your API key is invalid. Run `hermes model` in your terminal to '
-                        'update credentials, then restart the WebUI.'
-                    )
-                else:
-                    _err_label = 'No response received'
-                    _err_type = 'no_response'
-                    _err_hint = 'Verify your API key is valid and the selected model is available for your account.'
-                put('apperror', {
-                    'message': _err_str or f'{_err_label}.',
-                    'type': _err_type,
-                    'hint': _err_hint,
-                })
-                # Clear stream/pending state so the session does not appear
-                # "agent_running" on reload after a silent failure.
+                _u0 = ''
+                _a0 = ''
+                if _should_bg_title:
+                    _u0, _a0 = _first_exchange_snippets(s.messages)
+                # Read token/cost usage from the agent object (if available)
+                input_tokens = getattr(agent, 'session_prompt_tokens', 0) or 0
+                output_tokens = getattr(agent, 'session_completion_tokens', 0) or 0
+                estimated_cost = getattr(agent, 'session_estimated_cost_usd', None)
+                s.input_tokens = (s.input_tokens or 0) + input_tokens
+                s.output_tokens = (s.output_tokens or 0) + output_tokens
+                if estimated_cost:
+                    s.estimated_cost = (s.estimated_cost or 0) + estimated_cost
+                # Persist tool-call summaries even when the final message history only
+                # kept bare tool rows and omitted explicit assistant tool_call IDs.
+                tool_calls = _extract_tool_calls_from_messages(
+                    s.messages,
+                    live_tool_calls=_live_tool_calls,
+                )
+                s.tool_calls = tool_calls
                s.active_stream_id = None
                s.pending_user_message = None
                s.pending_attachments = []
                s.pending_started_at = None
-                # Persist the error so it survives page reload.
-                # _error=True ensures _sanitize_messages_for_api excludes it from
-                # subsequent API calls so the LLM never sees its own error as prior context.
-                s.messages.append({
-                    'role': 'assistant',
-                    'content': f'**{_err_label}:** {_err_str or _err_label}\n\n*{_err_hint}*',
-                    'timestamp': int(time.time()),
-                    '_error': True,
-                })
-                try:
-                    s.save()
-                except Exception:
-                    pass
-                return  # apperror already closes the stream on the client side
-
-            # ── Handle context compression side effects ──
-            # If compression fired inside run_conversation, the agent may have
-            # rotated its session_id. Detect and fix the mismatch so the WebUI
-            # continues writing to the correct session file.
-            _agent_sid = getattr(agent, 'session_id', None)
-            _compressed = False
-            if _agent_sid and _agent_sid != session_id:
-                old_sid = session_id
-                new_sid = _agent_sid
-                # Rename the session file
-                old_path = SESSION_DIR / f'{old_sid}.json'
-                new_path = SESSION_DIR / f'{new_sid}.json'
-                s.session_id = new_sid
-                with LOCK:
-                    if old_sid in SESSIONS:
-                        SESSIONS[new_sid] = SESSIONS.pop(old_sid)
-                if old_path.exists() and not new_path.exists():
-                    try:
-                        old_path.rename(new_path)
-                    except OSError:
-                        logger.debug("Failed to rename session file during compression")
-                _compressed = True
-            # Also detect compression via the result dict or compressor state
-            if not _compressed:
-                _compressor = getattr(agent, 'context_compressor', None)
-                if _compressor and getattr(_compressor, 'compression_count', 0) > 0:
-                    _compressed = True
-            # Notify the frontend that compression happened
-            if _compressed:
-                put('compressed', {
-                    'message': 'Context auto-compressed to continue the conversation',
-                })
-
-            # Stamp 'timestamp' on any messages that don't have one yet
-            _now = time.time()
-            for _m in s.messages:
-                if isinstance(_m, dict) and not _m.get('timestamp') and not _m.get('_ts'):
-                    _m['timestamp'] = int(_now)
-            # Only auto-generate title when still default; preserves user renames
-            if s.title == 'Untitled' or s.title == 'New Chat' or not s.title:
-                s.title = title_from(s.messages, s.title)
-            _looks_default = (s.title == 'Untitled' or s.title == 'New Chat' or not s.title)
-            _looks_provisional = _is_provisional_title(s.title, s.messages)
-            _invalid_existing_title = _looks_invalid_generated_title(s.title)
-            _should_bg_title = (
-                (_looks_default or _looks_provisional or _invalid_existing_title)
-                and (not getattr(s, 'llm_title_generated', False) or _invalid_existing_title)
-            )
-            _u0 = ''
-            _a0 = ''
-            if _should_bg_title:
-                _u0, _a0 = _first_exchange_snippets(s.messages)
-            # Read token/cost usage from the agent object (if available)
-            input_tokens = getattr(agent, 'session_prompt_tokens', 0) or 0
-            output_tokens = getattr(agent, 'session_completion_tokens', 0) or 0
-            estimated_cost = getattr(agent, 'session_estimated_cost_usd', None)
-            s.input_tokens = (s.input_tokens or 0) + input_tokens
-            s.output_tokens = (s.output_tokens or 0) + output_tokens
-            if estimated_cost:
-                s.estimated_cost = (s.estimated_cost or 0) + estimated_cost
-            # Persist tool-call summaries even when the final message history only
-            # kept bare tool rows and omitted explicit assistant tool_call IDs.
-            tool_calls = _extract_tool_calls_from_messages(
-                s.messages,
-                live_tool_calls=_live_tool_calls,
-            )
-            s.tool_calls = tool_calls
-            s.active_stream_id = None
-            s.pending_user_message = None
-            s.pending_attachments = []
-            s.pending_started_at = None
-            # Tag the matching user message with attachment filenames for display on reload
-            # Only tag a user message whose content relates to this turn's text
-            # (msg_text is the full message including the [Attached files: ...] suffix)
-            if attachments:
-                for m in reversed(s.messages):
-                    if m.get('role') == 'user':
-                        content = str(m.get('content', ''))
-                        # Match if content is part of the sent message or vice-versa
-                        base_text = msg_text.split('\n\n[Attached files:')[0].strip() if '\n\n[Attached files:' in msg_text else msg_text
-                        if base_text[:60] in content or content[:60] in msg_text:
-                            m['attachments'] = attachments
+                # Tag the matching user message with attachment filenames for display on reload
+                # Only tag a user message whose content relates to this turn's text
+                # (msg_text is the full message including the [Attached files: ...] suffix)
+                if attachments:
+                    for m in reversed(s.messages):
+                        if m.get('role') == 'user':
+                            content = str(m.get('content', ''))
+                            # Match if content is part of the sent message or vice-versa
+                            base_text = msg_text.split('\n\n[Attached files:')[0].strip() if '\n\n[Attached files:' in msg_text else msg_text
+                            if base_text[:60] in content or content[:60] in msg_text:
+                                m['attachments'] = attachments
+                                break
+                # Persist reasoning trace in the session so it survives reload.
+                # Must run BEFORE s.save() — otherwise the mutation lives only in
+                # memory until the next turn's save, and the last-turn thinking card
+                # is lost when the user reloads immediately after a response.
+                if _reasoning_text and s.messages:
+                    for _rm in reversed(s.messages):
+                        if isinstance(_rm, dict) and _rm.get('role') == 'assistant':
+                            _rm['reasoning'] = _reasoning_text
                            break
-            # Persist reasoning trace in the session so it survives reload.
-            # Must run BEFORE s.save() — otherwise the mutation lives only in
-            # memory until the next turn's save, and the last-turn thinking card
-            # is lost when the user reloads immediately after a response.
-            if _reasoning_text and s.messages:
-                for _rm in reversed(s.messages):
-                    if isinstance(_rm, dict) and _rm.get('role') == 'assistant':
-                        _rm['reasoning'] = _reasoning_text
-                        break
-            s.save()
+                s.save()
            # Sync to state.db for /insights (opt-in setting)
            try:
                from api.config import load_settings as _load_settings
@@ -1543,23 +1602,29 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
        else:
            _exc_label, _exc_type, _exc_hint = 'Error', 'error', ''
        if s is not None:
-            s.active_stream_id = None
-            s.pending_user_message = None
-            s.pending_attachments = []
-            s.pending_started_at = None
+            if _checkpoint_stop is not None:
+                _checkpoint_stop.set()
+            if _ckpt_thread is not None:
+                _ckpt_thread.join(timeout=15)
            # Persist the error so it survives page reload.
            # _error=True ensures _sanitize_messages_for_api excludes it from subsequent
            # API calls so the LLM never sees its own error as prior context on the next turn.
-            s.messages.append({
-                'role': 'assistant',
-                'content': f'**{_exc_label}:** {err_str}' + (f'\n\n*{_exc_hint}*' if _exc_hint else ''),
-                'timestamp': int(time.time()),
-                '_error': True,
-            })
-            try:
-                s.save()
-            except Exception:
-                pass
+            _lock_ctx = _agent_lock if _agent_lock is not None else contextlib.nullcontext()
+            with _lock_ctx:
+                s.active_stream_id = None
+                s.pending_user_message = None
+                s.pending_attachments = []
+                s.pending_started_at = None
+                s.messages.append({
+                    'role': 'assistant',
+                    'content': f'**{_exc_label}:** {err_str}' + (f'\n\n*{_exc_hint}*' if _exc_hint else ''),
+                    'timestamp': int(time.time()),
+                    '_error': True,
+                })
+                try:
+                    s.save()
+                except Exception:
+                    pass
        _apperror_payload: dict = {'message': err_str, 'type': _exc_type}
        if _exc_hint:
            _apperror_payload['hint'] = _exc_hint
@@ -1568,6 +1633,8 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
        # Stop periodic checkpoint thread if it was started (Issue #765)
        if _checkpoint_stop is not None:
            _checkpoint_stop.set()
+        if _ckpt_thread is not None:
+            _ckpt_thread.join(timeout=15)
        _clear_thread_env()  # TD1: always clear thread-local context
        with STREAMS_LOCK:
            STREAMS.pop(stream_id, None)
@@ -1662,55 +1729,60 @@ def cancel_stream(stream_id: str) -> bool:
        _cancel_partial_text = STREAM_PARTIAL_TEXT.get(stream_id, '')

    # Session cleanup outside STREAMS_LOCK to preserve lock ordering.
+    # Acquire the per-session _agent_lock too, mirroring every other session
+    # writer (streaming success/error paths, periodic checkpoint, POST endpoints)
+    # so the cancel-path mutation races neither the checkpoint thread nor
+    # concurrent undo/retry calls.
    if _cancel_session_id:
-        try:
-            _cs = get_session(_cancel_session_id)
-            _cs.active_stream_id = None
-            _cs.pending_user_message = None
-            _cs.pending_attachments = []
-            _cs.pending_started_at = None
-            # Persist any partial assistant text that was streamed before cancel (#893).
-            # Preserving partial content means the user sees what the agent had
-            # produced rather than losing it entirely.  The marker is _partial=True
-            # (for session/UI identification) — NOT _error=True — so the partial
-            # content IS kept in the history sent to the agent on the next user
-            # message, letting the model continue from where it was cut off.
-            # See the inner comment on the append call below for the rationale.
-            partial_text = _cancel_partial_text.strip() if _cancel_partial_text else ''
-            if partial_text:
-                import re as _re
-                # Strip thinking/reasoning markup from partial content before saving.
-                # First pass: remove complete <think>...</think> and <thinking>...</thinking> blocks.
-                _stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*?</think(?:ing)?>',
-                                    '', partial_text,
-                                    flags=_re.DOTALL | _re.IGNORECASE).strip()
-                # Second pass: strip trailing UNCLOSED think/thinking block (the common
-                # cancel case — user stops mid-reasoning before the close tag appears).
-                _stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*',
-                                    '', _stripped,
-                                    flags=_re.DOTALL | _re.IGNORECASE).strip()
-                if _stripped:
-                    # Mark _partial=True for session/UI identification only.
-                    # Deliberately NOT _error=True — the partial content is real model
-                    # output and should be visible in conversation history so the model
-                    # can continue from it on the next turn (#893).
-                    _cs.messages.append({
-                        'role': 'assistant',
-                        'content': _stripped,
-                        '_partial': True,
-                        'timestamp': int(time.time()),
-                    })
-            # Cancel marker — flagged _error=True so it is stripped from conversation
-            # history on the next turn (prevents model from seeing "Task cancelled."
-            # as a prior assistant reply).
-            _cs.messages.append({
-                'role': 'assistant',
-                'content': '*Task cancelled.*',
-                '_error': True,
-                'timestamp': int(time.time()),
-            })
-            _cs.save()
-        except Exception:
-            logger.debug("Failed to clear session state on cancel for %s", _cancel_session_id)
+        with _get_session_agent_lock(_cancel_session_id):
+            try:
+                _cs = get_session(_cancel_session_id)
+                _cs.active_stream_id = None
+                _cs.pending_user_message = None
+                _cs.pending_attachments = []
+                _cs.pending_started_at = None
+                # Persist any partial assistant text that was streamed before cancel (#893).
+                # Preserving partial content means the user sees what the agent had
+                # produced rather than losing it entirely.  The marker is _partial=True
+                # (for session/UI identification only) — NOT _error=True — so the partial
+                # content IS kept in the history sent to the agent on the next user
+                # message, letting the model continue from where it was cut off.
+                # See the inner comment on the append call below for the rationale.
+                partial_text = _cancel_partial_text.strip() if _cancel_partial_text else ''
+                if partial_text:
+                    import re as _re
+                    # Strip thinking/reasoning markup from partial content before saving.
+                    # First pass: remove complete <thinking>...</thinking> blocks.
+                    _stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*?</think(?:ing)?>',
+                                        '', partial_text,
+                                        flags=_re.DOTALL | _re.IGNORECASE).strip()
+                    # Second pass: strip trailing UNCLOSED think/thinking block (the common
+                    # cancel case — user stops mid-reasoning before the close tag appears).
+                    _stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*',
+                                        '', _stripped,
+                                        flags=_re.DOTALL | _re.IGNORECASE).strip()
+                    if _stripped:
+                        # Mark _partial=True for session/UI identification only.
+                        # Deliberately NOT _error=True — the partial content is real model
+                        # output and should be visible in conversation history so the model
+                        # can continue from it on the next turn (#893).
+                        _cs.messages.append({
+                            'role': 'assistant',
+                            'content': _stripped,
+                            '_partial': True,
+                            'timestamp': int(time.time()),
+                        })
+                # Cancel marker — flagged _error=True so it is stripped from conversation
+                # history on the next turn (prevents model from seeing "Task cancelled."
+                # as a prior assistant reply).
+                _cs.messages.append({
+                    'role': 'assistant',
+                    'content': '*Task cancelled.*',
+                    '_error': True,
+                    'timestamp': int(time.time()),
+                })
+                _cs.save()
+            except Exception:
+                logger.debug("Failed to clear session state on cancel for %s", _cancel_session_id)

    return True