fix: harden session persistence and per-session lock handling during streaming (v0.50.175, #910)

Co-authored-by: starship-s Co-authored-by: nesquena-hermes <nesquena-hermes@users.noreply.github.com>
2026-04-23 14:25:43 -07:00
parent 5082f426f2
commit 5b923a9502
9 changed files with 1237 additions and 429 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,11 @@
  workspace subtree) and never enumerate blocked system roots. (`api/routes.py`,
  `api/workspace.py`, `static/panels.js`, `static/style.css`) (partial for #616)

+## [v0.50.175] — 2026-04-23
+
+### Fixed
+- **Session persistence hardened against concurrent write races** — all session-mutation paths (streaming success/error/cancel, periodic checkpoint, chat start, retry/undo, and the HTTP endpoints for title/personality/workspace/clear/truncate/pin/archive/project) now hold a per-session `_agent_lock` during in-memory mutation and `Session.save()`. The checkpoint thread is stopped and joined before the final save, preventing stale object clobbers. `Session.save()` uses fsync + atomic rename with a pid+thread_id tmp suffix. `_write_session_index()` gets a dedicated `_INDEX_WRITE_LOCK` so index-file writes (write/fsync/rename) run outside the global `LOCK`, reducing head-of-line blocking. Context compression now runs the LLM call outside the lock with a stale-edit check (409) on write-back. (`api/streaming.py`, `api/models.py`, `api/routes.py`, `api/session_ops.py`, `api/config.py`) Closes #765. Co-authored by @starship-s.
+
 ## [v0.50.174] — 2026-04-23

 ### Fixed
--- a/api/config.py
+++ b/api/config.py
@@ -1683,6 +1683,25 @@ SESSION_AGENT_LOCKS_LOCK = threading.Lock()


 def _get_session_agent_lock(session_id: str) -> threading.Lock:
+    """Return the per-session Lock used to serialize all Session mutations.
+
+    Lock lifecycle invariant:
+      - A Lock is created lazily on first access and lives in SESSION_AGENT_LOCKS
+        for the lifetime of the session.
+      - The entry is pruned in /api/session/delete (under SESSION_AGENT_LOCKS_LOCK)
+        so deleted sessions don't leak a Lock forever.
+      - During context compression the agent may rotate session_id.  The
+        streaming thread migrates the lock entry atomically under
+        SESSION_AGENT_LOCKS_LOCK: it aliases the new session_id to the *same*
+        Lock object and pops the old-id entry (see streaming.py compression
+        block).  This ensures that subsequent callers using the new ID still
+        acquire the same Lock, while the old-id entry is removed to prevent a
+        leak.  The streaming thread already holds the Lock during this
+        migration, so the reference stays alive even after the dict entry is
+        removed.
+      - Lock contract: hold for the in-memory mutation + s.save() only; never
+        across network I/O (LLM calls, HTTP requests).
+    """
    with SESSION_AGENT_LOCKS_LOCK:
        if session_id not in SESSION_AGENT_LOCKS:
            SESSION_AGENT_LOCKS[session_id] = threading.Lock()
--- a/api/models.py
+++ b/api/models.py
@@ -1,10 +1,9 @@
-"""
-Hermes Web UI -- Session model and in-memory session store.
-"""
+"""Hermes Web UI -- Session model and in-memory session store."""
 import collections
 import json
 import logging
 import os
+import threading
 import time
 import uuid
 from pathlib import Path
@@ -19,6 +18,46 @@ from api.workspace import get_last_workspace

 logger = logging.getLogger(__name__)

+# ---------------------------------------------------------------------------
+# Stale temp-file cleanup
+# ---------------------------------------------------------------------------
+# Both Session.save() and _write_session_index() use the atomic-write pattern:
+#   write to  <stem>.tmp.<pid>.<tid>  →  os.replace() to final path
+# (with_suffix() swaps the file's final extension for the tmp suffix).  If the
+# process crashes between write and replace the .tmp file is left behind.
+# Because the name embeds pid + tid, two live writers never share a tmp file,
+# and leftovers are safe to remove once stale.  _cleanup_stale_tmp_files()
+# runs on every full rebuild in _write_session_index (startup, missing index,
+# or fast-path failure) and removes any *.tmp.* file older than one hour.
+# ---------------------------------------------------------------------------
+
+_STALE_TMP_AGE_SECONDS = 3600  # 1 hour
+
+# Serializes index writers so concurrent Session.save() calls cannot race on
+# stale baselines while still allowing LOCK to be released before disk I/O.
+_INDEX_WRITE_LOCK = threading.RLock()
+
+
+def _cleanup_stale_tmp_files() -> None:
+    """Best-effort removal of stale ``*.tmp.*`` files from SESSION_DIR.
+
+    Only files whose mtime is older than ``_STALE_TMP_AGE_SECONDS`` are
+    removed so that in-flight writes from a long-running sibling process
+    are not disturbed.  Errors are logged and swallowed — this must never
+    prevent startup.
+    """
+    cutoff = time.time() - _STALE_TMP_AGE_SECONDS
+    try:
+        for p in SESSION_DIR.glob('*.tmp.*'):
+            try:
+                if p.stat().st_mtime < cutoff:
+                    p.unlink(missing_ok=True)
+                    logger.debug("Cleaned up stale tmp file: %s", p.name)
+            except OSError:
+                pass  # best-effort
+    except Exception:
+        pass  # SESSION_DIR may not exist yet; that's fine
+

 def _index_entry_exists(session_id: str, in_memory_ids=None) -> bool:
    """Return True if an index entry still has backing state.
@@ -46,58 +85,101 @@ def _write_session_index(updates=None):
    entries should be refreshed), this does a targeted in-place update of
    the existing index — O(1) for single-session changes.  When *updates*
    is None, a full rebuild is performed (used on startup / first call).
-    """
-    # Lazy full-rebuild path — used when index doesn't exist yet.
-    if updates is None or not SESSION_INDEX_FILE.exists():
-        entries = []
-        for p in SESSION_DIR.glob('*.json'):
-            if p.name.startswith('_'): continue
-            try:
-                s = Session.load(p.stem)
-                if s: entries.append(s.compact())
-            except Exception:
-                logger.debug("Failed to load session from %s", p)
-        with LOCK:
-            for s in SESSIONS.values():
-                if not any(e['session_id'] == s.session_id for e in entries):
-                    entries.append(s.compact())
-            entries.sort(key=lambda s: s['updated_at'], reverse=True)
-            _tmp = SESSION_INDEX_FILE.with_suffix('.tmp')
-            _tmp.write_text(json.dumps(entries, ensure_ascii=False, indent=2), encoding='utf-8')
-            os.replace(_tmp, SESSION_INDEX_FILE)
-        return

-    # Fast path: patch existing index with updated sessions.
-    # This avoids loading every session file on every single save().
-    # LOCK covers the entire read-patch-write to prevent concurrent save() calls
-    # from both reading the same baseline and one losing its update.
-    _fallback = False
-    try:
-        with LOCK:
-            existing = json.loads(SESSION_INDEX_FILE.read_text(encoding='utf-8'))
-            in_memory_ids = set(SESSIONS.keys())
-            existing = [
-                e for e in existing
-                if _index_entry_exists(e.get('session_id'), in_memory_ids=in_memory_ids)
-            ]
-            # Build lookup of updated entries
-            updated_map = {s.session_id: s.compact() for s in updates}
-            existing_ids = {e.get('session_id') for e in existing}
-            # Add any updated entries not yet in the index
-            for sid, entry in updated_map.items():
-                if sid not in existing_ids:
-                    existing.append(entry)
-            # Replace matching entries in-place
-            for i, e in enumerate(existing):
-                sid = e.get('session_id')
-                if sid in updated_map:
-                    existing[i] = updated_map[sid]
-            existing.sort(key=lambda s: s.get('updated_at', 0), reverse=True)
-            _tmp = SESSION_INDEX_FILE.with_suffix('.tmp')
-            _tmp.write_text(json.dumps(existing, ensure_ascii=False, indent=2), encoding='utf-8')
-            os.replace(_tmp, SESSION_INDEX_FILE)
-    except Exception:
-        _fallback = True
+    LOCK protects in-memory state snapshots, the fast-path index read, and
+    payload construction; disk writes (write/flush/fsync/replace) run outside LOCK.
+    """
+    _tmp = SESSION_INDEX_FILE.with_suffix(f'.tmp.{os.getpid()}.{threading.current_thread().ident}')
+
+    with _INDEX_WRITE_LOCK:
+        # Lazy full-rebuild path — used when index doesn't exist yet.
+        if updates is None or not SESSION_INDEX_FILE.exists():
+            _cleanup_stale_tmp_files()  # best-effort sweep on startup / first call
+            entries = []
+            for p in SESSION_DIR.glob('*.json'):
+                if p.name.startswith('_'):
+                    continue
+                try:
+                    s = Session.load(p.stem)
+                    if s:
+                        entries.append(s.compact())
+                except Exception:
+                    logger.debug("Failed to load session from %s", p)
+
+            with LOCK:
+                existing_ids = {e.get('session_id') for e in entries}
+                for s in SESSIONS.values():
+                    if s.session_id not in existing_ids:
+                        entries.append(s.compact())
+                entries.sort(key=lambda s: s.get('updated_at', 0), reverse=True)
+                _payload = json.dumps(entries, ensure_ascii=False, indent=2)
+
+            try:
+                with open(_tmp, 'w', encoding='utf-8') as f:
+                    f.write(_payload)
+                    f.flush()
+                    os.fsync(f.fileno())
+                os.replace(_tmp, SESSION_INDEX_FILE)
+            except Exception:
+                # Best-effort cleanup of stale tmp on failure
+                try:
+                    _tmp.unlink(missing_ok=True)
+                except Exception:
+                    pass
+                raise
+            return
+
+        # Fast path: patch existing index with updated sessions.
+        # This avoids loading every session file on every single save().
+        _fallback = False
+        try:
+            with LOCK:
+                existing = json.loads(SESSION_INDEX_FILE.read_text(encoding='utf-8'))
+                in_memory_ids = set(SESSIONS.keys())
+
+                # Avoid N filesystem exists() checks under LOCK by collecting
+                # on-disk IDs once.
+                on_disk_ids = {
+                    p.stem
+                    for p in SESSION_DIR.glob('*.json')
+                    if not p.name.startswith('_')
+                }
+
+                existing = [
+                    e for e in existing
+                    if (e.get('session_id') in in_memory_ids or e.get('session_id') in on_disk_ids)
+                ]
+
+                # Build lookup of updated entries
+                updated_map = {s.session_id: s.compact() for s in updates}
+                existing_ids = {e.get('session_id') for e in existing}
+                # Add any updated entries not yet in the index
+                for sid, entry in updated_map.items():
+                    if sid not in existing_ids:
+                        existing.append(entry)
+                # Replace matching entries in-place
+                for i, e in enumerate(existing):
+                    sid = e.get('session_id')
+                    if sid in updated_map:
+                        existing[i] = updated_map[sid]
+                existing.sort(key=lambda s: s.get('updated_at', 0), reverse=True)
+                _payload = json.dumps(existing, ensure_ascii=False, indent=2)
+
+            try:
+                with open(_tmp, 'w', encoding='utf-8') as f:
+                    f.write(_payload)
+                    f.flush()
+                    os.fsync(f.fileno())
+                os.replace(_tmp, SESSION_INDEX_FILE)
+            except Exception:
+                try:
+                    _tmp.unlink(missing_ok=True)
+                except Exception:
+                    pass
+                raise
+        except Exception:
+            _fallback = True
+
    if _fallback:
        # Corrupt or missing index — fall back to full rebuild (called outside LOCK to avoid deadlock)
        _write_session_index(updates=None)
@@ -157,10 +239,20 @@ class Session:
    def save(self, touch_updated_at: bool = True, skip_index: bool = False) -> None:
        if touch_updated_at:
            self.updated_at = time.time()
-        self.path.write_text(
-            json.dumps(self.__dict__, ensure_ascii=False, indent=2),
-            encoding='utf-8',
-        )
+        payload = json.dumps(self.__dict__, ensure_ascii=False, indent=2)
+        tmp = self.path.with_suffix(f'.tmp.{os.getpid()}.{threading.current_thread().ident}')
+        try:
+            with open(tmp, 'w', encoding='utf-8') as f:
+                f.write(payload)
+                f.flush()
+                os.fsync(f.fileno())
+            os.replace(tmp, self.path)
+        except Exception:
+            try:
+                tmp.unlink(missing_ok=True)
+            except Exception:
+                pass
+            raise
        if not skip_index:
            _write_session_index(updates=[self])

--- a/api/routes.py
+++ b/api/routes.py
@@ -66,6 +66,9 @@ from api.config import (
    MAX_FILE_BYTES,
    MAX_UPLOAD_BYTES,
    CHAT_LOCK,
+    _get_session_agent_lock,
+    SESSION_AGENT_LOCKS,
+    SESSION_AGENT_LOCKS_LOCK,
    load_settings,
    save_settings,
    set_hermes_default_model,
@@ -1049,8 +1052,9 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.title = str(body["title"]).strip()[:80] or "Untitled"
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.title = str(body["title"]).strip()[:80] or "Untitled"
+            s.save()
        return j(handler, {"session": s.compact()})

    if parsed.path == "/api/personality/set":
@@ -1093,8 +1097,9 @@ def handle_post(handler, parsed) -> bool:
                prompt = "\n".join(p for p in parts if p)
            else:
                prompt = str(value)
-        s.personality = name if name else None
-        s.save()
+        with _get_session_agent_lock(sid):
+            s.personality = name if name else None
+            s.save()
        return j(handler, {"ok": True, "personality": s.personality, "prompt": prompt})

    if parsed.path == "/api/session/update":
@@ -1110,9 +1115,10 @@ def handle_post(handler, parsed) -> bool:
            new_ws = str(resolve_trusted_workspace(body.get("workspace", s.workspace)))
        except ValueError as e:
            return bad(handler, str(e))
-        s.workspace = new_ws
-        s.model = body.get("model", s.model)
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.workspace = new_ws
+            s.model = body.get("model", s.model)
+            s.save()
        set_last_workspace(new_ws)
        return j(handler, {"session": s.compact() | {"messages": s.messages}})

@@ -1134,6 +1140,10 @@ def handle_post(handler, parsed) -> bool:
            p.unlink(missing_ok=True)
        except Exception:
            logger.debug("Failed to unlink session file %s", p)
+        # Prune the per-session agent lock so deleted sessions don't leak
+        # Lock entries in SESSION_AGENT_LOCKS forever.
+        with SESSION_AGENT_LOCKS_LOCK:
+            SESSION_AGENT_LOCKS.pop(sid, None)
        try:
            SESSION_INDEX_FILE.unlink(missing_ok=True)
        except Exception:
@@ -1156,10 +1166,11 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.messages = []
-        s.tool_calls = []
-        s.title = "Untitled"
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.messages = []
+            s.tool_calls = []
+            s.title = "Untitled"
+            s.save()
        return j(handler, {"ok": True, "session": s.compact()})

    if parsed.path == "/api/session/truncate":
@@ -1174,8 +1185,9 @@ def handle_post(handler, parsed) -> bool:
        except KeyError:
            return bad(handler, "Session not found", 404)
        keep = int(body["keep_count"])
-        s.messages = s.messages[:keep]
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.messages = s.messages[:keep]
+            s.save()
        return j(
            handler, {"ok": True, "session": s.compact() | {"messages": s.messages}}
        )
@@ -1448,8 +1460,9 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.pinned = bool(body.get("pinned", True))
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.pinned = bool(body.get("pinned", True))
+            s.save()
        return j(handler, {"ok": True, "session": s.compact()})

    # ── Session archive (POST) ──
@@ -1462,8 +1475,9 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.archived = bool(body.get("archived", True))
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.archived = bool(body.get("archived", True))
+            s.save()
        return j(handler, {"ok": True, "session": s.compact()})

    # ── Session move to project (POST) ──
@@ -1476,8 +1490,9 @@ def handle_post(handler, parsed) -> bool:
            s = get_session(body["session_id"])
        except KeyError:
            return bad(handler, "Session not found", 404)
-        s.project_id = body.get("project_id") or None
-        s.save()
+        with _get_session_agent_lock(body["session_id"]):
+            s.project_id = body.get("project_id") or None
+            s.save()
        return j(handler, {"ok": True, "session": s.compact()})

    # ── Project CRUD (POST) ──
@@ -2445,13 +2460,14 @@ def _handle_chat_start(handler, body):
        # Stale stream id from a previous run; clear and continue.
        s.active_stream_id = None
    stream_id = uuid.uuid4().hex
-    s.workspace = workspace
-    s.model = model
-    s.active_stream_id = stream_id
-    s.pending_user_message = msg
-    s.pending_attachments = attachments
-    s.pending_started_at = time.time()
-    s.save()
+    with _get_session_agent_lock(s.session_id):
+        s.workspace = workspace
+        s.model = model
+        s.active_stream_id = stream_id
+        s.pending_user_message = msg
+        s.pending_attachments = attachments
+        s.pending_started_at = time.time()
+        s.save()
    set_last_workspace(workspace)
    q = queue.Queue()
    with STREAMS_LOCK:
@@ -2470,15 +2486,14 @@ def _handle_chat_start(handler, body):

 def _handle_chat_sync(handler, body):
    """Fallback synchronous chat endpoint (POST /api/chat). Not used by frontend."""
-    from api.config import _get_session_agent_lock
-
    s = get_session(body["session_id"])
    msg = str(body.get("message", "")).strip()
    if not msg:
        return j(handler, {"error": "empty message"}, status=400)
    workspace = Path(body.get("workspace") or s.workspace).expanduser().resolve()
-    s.workspace = str(workspace)
-    s.model = body.get("model") or s.model
+    with _get_session_agent_lock(s.session_id):
+        s.workspace = str(workspace)
+        s.model = body.get("model") or s.model
    from api.streaming import _ENV_LOCK

    with _ENV_LOCK:
@@ -2559,14 +2574,15 @@ def _handle_chat_sync(handler, body):
                os.environ.pop("HERMES_SESSION_KEY", None)
            else:
                os.environ["HERMES_SESSION_KEY"] = old_session_key
-    s.messages = _restore_reasoning_metadata(
-        _previous_messages,
-        result.get("messages") or s.messages,
-    )
-    # Only auto-generate title when still default; preserves user renames
-    if s.title == "Untitled":
-        s.title = title_from(s.messages, s.title)
-    s.save()
+    with _get_session_agent_lock(s.session_id):
+        s.messages = _restore_reasoning_metadata(
+            _previous_messages,
+            result.get("messages") or s.messages,
+        )
+        # Only auto-generate title when still default; preserves user renames
+        if s.title == "Untitled":
+            s.title = title_from(s.messages, s.title)
+        s.save()
    # Sync to state.db for /insights (opt-in setting)
    try:
        if load_settings().get("sync_to_insights"):
@@ -3094,33 +3110,42 @@ def _handle_session_compress(handler, body):
        if not resolved_api_key:
            return bad(handler, "No provider configured -- cannot compress.")

-        with _cfg._get_session_agent_lock(sid):
-            original_messages = list(messages)
-            approx_tokens = _estimate_messages_tokens_rough(original_messages)
+        # Compute compression *outside* the lock — the LLM round-trip can take
+        # many seconds and we must not block cancel_stream or other writers.
+        # Lock contract: hold for the in-memory mutation only, never across
+        # network I/O.
+        original_messages = list(messages)
+        approx_tokens = _estimate_messages_tokens_rough(original_messages)

-            agent = _run_agent.AIAgent(
-                model=resolved_model,
-                provider=resolved_provider,
-                base_url=resolved_base_url,
-                api_key=resolved_api_key,
-                platform="cli",
-                quiet_mode=True,
-                enabled_toolsets=_resolve_cli_toolsets(),
-                session_id=sid,
-            )
-            compressed = agent.context_compressor.compress(
-                original_messages,
-                current_tokens=approx_tokens,
-                focus_topic=focus_topic,
-            )
-            new_tokens = _estimate_messages_tokens_rough(compressed)
-            summary = _summarize_manual_compression(
-                original_messages,
-                compressed,
-                approx_tokens,
-                new_tokens,
-                focus_topic=focus_topic,
-            )
+        agent = _run_agent.AIAgent(
+            model=resolved_model,
+            provider=resolved_provider,
+            base_url=resolved_base_url,
+            api_key=resolved_api_key,
+            platform="cli",
+            quiet_mode=True,
+            enabled_toolsets=_resolve_cli_toolsets(),
+            session_id=sid,
+        )
+        compressed = agent.context_compressor.compress(
+            original_messages,
+            current_tokens=approx_tokens,
+            focus_topic=focus_topic,
+        )
+        new_tokens = _estimate_messages_tokens_rough(compressed)
+        summary = _summarize_manual_compression(
+            original_messages,
+            compressed,
+            approx_tokens,
+            new_tokens,
+            focus_topic=focus_topic,
+        )
+
+        with _cfg._get_session_agent_lock(sid):
+            # Re-read messages to detect concurrent edits during the LLM call.
+            # If the history changed, the compression result is stale — abort.
+            if _sanitize_messages_for_api(s.messages) != original_messages:
+                return bad(handler, "Session was modified during compression; please retry.", 409)

            s.messages = compressed
            s.tool_calls = []
--- a/api/session_ops.py
+++ b/api/session_ops.py
@@ -9,7 +9,7 @@ from __future__ import annotations
 import logging
 from typing import Any

-from api.config import LOCK
+from api.config import LOCK, _get_session_agent_lock
 from api.models import get_session, SESSIONS

 logger = logging.getLogger(__name__)
@@ -27,38 +27,43 @@ def retry_last(session_id: str) -> dict[str, Any]:
        KeyError: session not found
        ValueError: no user message in transcript
    """
-    # get_session() and Session.save() both acquire the module-level LOCK
-    # internally (the latter via _write_session_index()), and LOCK is a
-    # non-reentrant threading.Lock — so they MUST be called outside our
-    # own `with LOCK:` block to avoid self-deadlocking.
-    #
-    # The race we close is the read-modify-write of s.messages: two
-    # concurrent /api/session/retry calls could otherwise both compute the
-    # same last_user_idx from the same history and double-truncate. We
-    # serialize just the in-memory mutation; persistence happens outside
-    # the lock and is naturally last-write-wins on a consistent state.
-    #
-    # Stale-object guard: on a cache miss, two concurrent get_session()
-    # calls can each load and cache a *different* Session instance for the
-    # same session_id (the second store_clobbers the first). Re-bind to
-    # the canonical cached instance inside the lock so the mutation lands
-    # on the object the next reader will see, not a stale parallel copy.
-    s = get_session(session_id)  # raises KeyError if missing
-    with LOCK:
-        s = SESSIONS.get(session_id, s)
-        history = s.messages or []
-        last_user_idx = None
-        for i in range(len(history) - 1, -1, -1):
-            if history[i].get('role') == 'user':
-                last_user_idx = i
-                break
-        if last_user_idx is None:
-            raise ValueError('No previous message to retry.')
+    # Acquire the per-session agent lock as the outermost lock so that the
+    # read-modify-write of s.messages is serialised with the periodic
+    # checkpoint thread, cancel_stream, and all other session writers.
+    # Lock ordering: _agent_lock → LOCK (released before save) → _INDEX_WRITE_LOCK → LOCK.
+    with _get_session_agent_lock(session_id):
+        # get_session() and Session.save() both acquire the module-level LOCK
+        # internally (the latter via _write_session_index()), and LOCK is a
+        # non-reentrant threading.Lock — so they MUST be called outside our
+        # own `with LOCK:` block to avoid self-deadlocking.
+        #
+        # The race we close is the read-modify-write of s.messages: two
+        # concurrent /api/session/retry calls could otherwise both compute the
+        # same last_user_idx from the same history and double-truncate. We
+        # serialize just the in-memory mutation; persistence happens inside
+        # the per-session lock so the checkpoint thread cannot race us.
+        #
+        # Stale-object guard: on a cache miss, two concurrent get_session()
+        # calls can each load and cache a *different* Session instance for the
+        # same session_id (the second store clobbers the first). Re-bind to
+        # the canonical cached instance inside the lock so the mutation lands
+        # on the object the next reader will see, not a stale parallel copy.
+        s = get_session(session_id)  # raises KeyError if missing
+        with LOCK:
+            s = SESSIONS.get(session_id, s)
+            history = s.messages or []
+            last_user_idx = None
+            for i in range(len(history) - 1, -1, -1):
+                if history[i].get('role') == 'user':
+                    last_user_idx = i
+                    break
+            if last_user_idx is None:
+                raise ValueError('No previous message to retry.')

-        last_user_text = _extract_text(history[last_user_idx].get('content', ''))
-        removed_count = len(history) - last_user_idx
-        s.messages = history[:last_user_idx]
-    s.save()
+            last_user_text = _extract_text(history[last_user_idx].get('content', ''))
+            removed_count = len(history) - last_user_idx
+            s.messages = history[:last_user_idx]
+        s.save()
    return {'last_user_text': last_user_text, 'removed_count': removed_count}


@@ -72,23 +77,28 @@ def undo_last(session_id: str) -> dict[str, Any]:
        KeyError: session not found
        ValueError: no user message in transcript
    """
-    s = get_session(session_id)  # acquires LOCK transiently
-    with LOCK:
-        # Stale-object guard — see retry_last for the rationale.
-        s = SESSIONS.get(session_id, s)
-        history = s.messages or []
-        last_user_idx = None
-        for i in range(len(history) - 1, -1, -1):
-            if history[i].get('role') == 'user':
-                last_user_idx = i
-                break
-        if last_user_idx is None:
-            raise ValueError('Nothing to undo.')
+    # Acquire the per-session agent lock as the outermost lock so that the
+    # read-modify-write of s.messages is serialised with the periodic
+    # checkpoint thread, cancel_stream, and all other session writers.
+    # Lock ordering: _agent_lock → LOCK (released before save) → _INDEX_WRITE_LOCK → LOCK.
+    with _get_session_agent_lock(session_id):
+        s = get_session(session_id)  # acquires LOCK transiently
+        with LOCK:
+            # Stale-object guard — see retry_last for the rationale.
+            s = SESSIONS.get(session_id, s)
+            history = s.messages or []
+            last_user_idx = None
+            for i in range(len(history) - 1, -1, -1):
+                if history[i].get('role') == 'user':
+                    last_user_idx = i
+                    break
+            if last_user_idx is None:
+                raise ValueError('Nothing to undo.')

-        removed_text = _extract_text(history[last_user_idx].get('content', ''))
-        removed_count = len(history) - last_user_idx
-        s.messages = history[:last_user_idx]
-    s.save()  # outside LOCK -- save() re-acquires LOCK via _write_session_index()
+            removed_text = _extract_text(history[last_user_idx].get('content', ''))
+            removed_count = len(history) - last_user_idx
+            s.messages = history[:last_user_idx]
+        s.save()  # outside LOCK -- save() re-acquires LOCK via _write_session_index()
    preview = (removed_text[:40] + '...') if len(removed_text) > 40 else removed_text
    return {
        'removed_count': removed_count,
--- a/api/streaming.py
+++ b/api/streaming.py
@@ -2,6 +2,7 @@
 Hermes Web UI -- SSE streaming engine and agent thread runner.
 Includes Sprint 10 cancel support via CANCEL_FLAGS.
 """
+import contextlib
 import json
 import logging
 import os
@@ -20,6 +21,7 @@ from api.config import (
    STREAMS, STREAMS_LOCK, CANCEL_FLAGS, AGENT_INSTANCES, STREAM_PARTIAL_TEXT,
    LOCK, SESSIONS, SESSION_DIR,
    _get_session_agent_lock, _set_thread_env, _clear_thread_env,
+    SESSION_AGENT_LOCKS, SESSION_AGENT_LOCKS_LOCK,
    resolve_model_provider,
 )
 from api.helpers import redact_session_data
@@ -534,18 +536,46 @@ def _run_background_title_update(session_id: str, user_text: str, assistant_text
            if next_title:
                logger.debug("Using local fallback for session title generation")
                source = 'fallback'
-        if next_title and next_title != current:
-            s.title = next_title
-            s.llm_title_generated = True
-            # Keep chronological ordering stable in the sidebar.
-            s.save(touch_updated_at=False)
+        wrote_title = False
+        effective_title = current
+        if next_title:
+            # Hold _agent_lock only for in-memory mutation + save so title write
+            # is serialized with checkpoint saves, cancel_stream, and other
+            # session-mutating endpoints. The LLM round-trip above ran outside
+            # the lock to avoid blocking other writers.
+            with _get_session_agent_lock(session_id):
+                # Stale-object guard: rebind to the canonical cached Session
+                # instance under LOCK before checking whether a user rename
+                # landed while the LLM title request was in-flight.
+                with LOCK:
+                    s = SESSIONS.get(session_id, s)
+                    effective_title = str(s.title or '').strip()
+                    invalid_existing_now = _looks_invalid_generated_title(s.title)
+                    still_auto = (
+                        effective_title == placeholder_title
+                        or effective_title in ('Untitled', 'New Chat', '')
+                        or _is_provisional_title(effective_title, s.messages)
+                        or invalid_existing_now
+                    )
+                if not still_auto:
+                    _put_title_status(put_event, session_id, 'skipped', 'manual_title', effective_title)
+                    return
+                if next_title != effective_title:
+                    s.title = next_title
+                    s.llm_title_generated = True
+                    # Keep chronological ordering stable in the sidebar.
+                    s.save(touch_updated_at=False)
+                    effective_title = s.title
+                    wrote_title = True
+
+        if wrote_title:
            if source == 'fallback':
-                _put_title_status(put_event, session_id, source, 'local_summary', s.title, raw_preview)
+                _put_title_status(put_event, session_id, source, 'local_summary', effective_title, raw_preview)
            else:
-                _put_title_status(put_event, session_id, source, llm_status, s.title, raw_preview)
-            put_event('title', {'session_id': session_id, 'title': s.title})
+                _put_title_status(put_event, session_id, source, llm_status, effective_title, raw_preview)
+            put_event('title', {'session_id': session_id, 'title': effective_title})
        else:
-            _put_title_status(put_event, session_id, 'skipped', source or 'unchanged', current, raw_preview)
+            _put_title_status(put_event, session_id, 'skipped', source or 'unchanged', effective_title, raw_preview)
    finally:
        put_event('stream_end', {'session_id': session_id})

@@ -830,6 +860,8 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
    # block can safely check `if _checkpoint_stop is not None` even when an
    # exception fires before the checkpoint thread is created (Issue #765).
    _checkpoint_stop = None
+    _ckpt_thread = None
+    _agent_lock = None
    try:
        s = get_session(session_id)
        s.workspace = str(Path(workspace).expanduser().resolve())
@@ -974,6 +1006,11 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
                _reasoning_text += str(text)
                put('reasoning', {'text': str(text)})

+            # Pre-initialise the activity counter here so on_tool (which
+            # closes over it) never captures an unbound name even if this
+            # block is reordered later (Issue #765).
+            _checkpoint_activity = [0]
+
            def on_tool(*cb_args, **cb_kwargs):
                event_type = None
                name = None
@@ -1224,7 +1261,7 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
            # response — better than a silent loss of the entire conversation turn.
            # The final s.save() at task completion handles the full session update + index.
            # (_checkpoint_stop is pre-initialised at the top of the outer try.)
-            _checkpoint_activity = [0]
+            # (_checkpoint_activity is already initialised before on_tool().)

            def _periodic_checkpoint():
                last_saved_activity = 0
@@ -1232,7 +1269,8 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
                    try:
                        cur = _checkpoint_activity[0]
                        if cur > last_saved_activity:
-                            s.save(skip_index=True)
+                            with _agent_lock:
+                                s.save(skip_index=True)
                            last_saved_activity = cur
                    except Exception as e:
                        logger.debug("Periodic checkpoint save failed: %s", e)
@@ -1251,193 +1289,214 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
                task_id=session_id,
                persist_user_message=msg_text,
            )
-            s.messages = _restore_reasoning_metadata(
-                _previous_messages,
-                result.get('messages') or s.messages,
-            )
-            # Strip XML tool-call blocks from assistant message content.
-            # DeepSeek and some other providers emit <function_calls>...</function_calls>
-            # in the raw response text; this must be removed before the content is
-            # saved to the session and displayed in the chat bubble. (#702)
-            for _m in s.messages:
-                if isinstance(_m, dict) and _m.get('role') == 'assistant':
-                    _raw_content = _m.get('content')
-                    if isinstance(_raw_content, str):
-                        _cleaned = _strip_xml_tool_calls(_raw_content)
-                        if _cleaned != _raw_content:
-                            _m['content'] = _cleaned
-                    elif isinstance(_raw_content, list):
-                        for _part in _raw_content:
-                            if isinstance(_part, dict) and isinstance(_part.get('text'), str):
-                                _part['text'] = _strip_xml_tool_calls(_part['text'])
+            if _checkpoint_stop is not None:
+                _checkpoint_stop.set()
+            if _ckpt_thread is not None:
+                _ckpt_thread.join(timeout=15)
+            with _agent_lock:
+                s.messages = _restore_reasoning_metadata(
+                    _previous_messages,
+                    result.get('messages') or s.messages,
+                )
+                # Strip XML tool-call blocks from assistant message content.
+                # DeepSeek and some other providers emit <function_calls>...</function_calls>
+                # in the raw response text; this must be removed before the content is
+                # saved to the session and displayed in the chat bubble. (#702)
+                for _m in s.messages:
+                    if isinstance(_m, dict) and _m.get('role') == 'assistant':
+                        _raw_content = _m.get('content')
+                        if isinstance(_raw_content, str):
+                            _cleaned = _strip_xml_tool_calls(_raw_content)
+                            if _cleaned != _raw_content:
+                                _m['content'] = _cleaned
+                        elif isinstance(_raw_content, list):
+                            for _part in _raw_content:
+                                if isinstance(_part, dict) and isinstance(_part.get('text'), str):
+                                    _part['text'] = _strip_xml_tool_calls(_part['text'])

-            # ── Detect silent agent failure (no assistant reply produced) ──
-            # When the agent catches an auth/network error internally it may return
-            # an empty final_response without raising — the stream would end with
-            # a done event containing zero assistant messages, leaving the user with
-            # no feedback. Emit an apperror so the client shows an inline error.
-            _assistant_added = any(
-                m.get('role') == 'assistant' and str(m.get('content') or '').strip()
-                for m in (result.get('messages') or [])
-            )
-            # _token_sent tracks whether on_token() was called (any streamed text)
-            if not _assistant_added and not _token_sent:
-                _last_err = getattr(agent, '_last_error', None) or result.get('error') or ''
-                _err_str = str(_last_err) if _last_err else ''
-                _err_lower = _err_str.lower()
-                _is_quota = (
-                    'insufficient credit' in _err_lower
-                    or 'credit balance' in _err_lower
-                    or 'credits exhausted' in _err_lower
-                    or 'quota_exceeded' in _err_lower
-                    or 'quota exceeded' in _err_lower
-                    or 'exceeded your current quota' in _err_lower
+                # ── Detect silent agent failure (no assistant reply produced) ──
+                # When the agent catches an auth/network error internally it may return
+                # an empty final_response without raising — the stream would end with
+                # a done event containing zero assistant messages, leaving the user with
+                # no feedback. Emit an apperror so the client shows an inline error.
+                _assistant_added = any(
+                    m.get('role') == 'assistant' and str(m.get('content') or '').strip()
+                    for m in (result.get('messages') or [])
                )
-                _is_auth = (
-                    not _is_quota and (
-                        '401' in _err_str
-                        or (_last_err and 'AuthenticationError' in type(_last_err).__name__)
-                        or 'authentication' in _err_lower
-                        or 'unauthorized' in _err_lower
-                        or 'invalid api key' in _err_lower
-                        or 'invalid_api_key' in _err_lower
+                # _token_sent tracks whether on_token() was called (any streamed text)
+                if not _assistant_added and not _token_sent:
+                    _last_err = getattr(agent, '_last_error', None) or result.get('error') or ''
+                    _err_str = str(_last_err) if _last_err else ''
+                    _err_lower = _err_str.lower()
+                    _is_quota = (
+                        'insufficient credit' in _err_lower
+                        or 'credit balance' in _err_lower
+                        or 'credits exhausted' in _err_lower
+                        or 'quota_exceeded' in _err_lower
+                        or 'quota exceeded' in _err_lower
+                        or 'exceeded your current quota' in _err_lower
                    )
+                    _is_auth = (
+                        not _is_quota and (
+                            '401' in _err_str
+                            or (_last_err and 'AuthenticationError' in type(_last_err).__name__)
+                            or 'authentication' in _err_lower
+                            or 'unauthorized' in _err_lower
+                            or 'invalid api key' in _err_lower
+                            or 'invalid_api_key' in _err_lower
+                        )
+                    )
+                    if _is_quota:
+                        _err_label = 'Out of credits'
+                        _err_type = 'quota_exhausted'
+                        _err_hint = 'Your provider account is out of credits. Top up your balance or switch providers via `hermes model`.'
+                    elif _is_auth:
+                        _err_label = 'Authentication failed'
+                        _err_type = 'auth_mismatch'
+                        _err_hint = (
+                            'The selected model may not be supported by your configured provider or '
+                            'your API key is invalid. Run `hermes model` in your terminal to '
+                            'update credentials, then restart the WebUI.'
+                        )
+                    else:
+                        _err_label = 'No response received'
+                        _err_type = 'no_response'
+                        _err_hint = 'Verify your API key is valid and the selected model is available for your account.'
+                    put('apperror', {
+                        'message': _err_str or f'{_err_label}.',
+                        'type': _err_type,
+                        'hint': _err_hint,
+                    })
+                    # Clear stream/pending state so the session does not appear
+                    # "agent_running" on reload after a silent failure.
+                    # Persist the error so it survives page reload.
+                    # _error=True ensures _sanitize_messages_for_api excludes it from
+                    # subsequent API calls so the LLM never sees its own error as prior context.
+                    s.active_stream_id = None
+                    s.pending_user_message = None
+                    s.pending_attachments = []
+                    s.pending_started_at = None
+                    s.messages.append({
+                        'role': 'assistant',
+                        'content': f'**{_err_label}:** {_err_str or _err_label}\n\n*{_err_hint}*',
+                        'timestamp': int(time.time()),
+                        '_error': True,
+                    })
+                    try:
+                        s.save()
+                    except Exception:
+                        pass
+                    return  # apperror already closes the stream on the client side
+
+                # ── Handle context compression side effects ──
+                # If compression fired inside run_conversation, the agent may have
+                # rotated its session_id. Detect and fix the mismatch so the WebUI
+                # continues writing to the correct session file.
+                #
+                # Lock migration: when session_id rotates, we alias the new ID to
+                # the *same* Lock object under SESSION_AGENT_LOCKS so that
+                # subsequent callers using _get_session_agent_lock(new_sid) get the
+                # same Lock the streaming thread is already holding.  We then pop
+                # the old-id entry to prevent a leak.  This is safe because we
+                # already hold _agent_lock (the Lock object itself), so the
+                # reference stays alive even after the dict entry is removed.
+                # Threads that already fetched the Lock via the old ID keep that same
+                # object; a fresh old-ID lookup would presumably create a new Lock.
+                _agent_sid = getattr(agent, 'session_id', None)
+                _compressed = False
+                if _agent_sid and _agent_sid != session_id:
+                    old_sid = session_id
+                    new_sid = _agent_sid
+                    # Rename the session file
+                    old_path = SESSION_DIR / f'{old_sid}.json'
+                    new_path = SESSION_DIR / f'{new_sid}.json'
+                    s.session_id = new_sid
+                    with LOCK:
+                        if old_sid in SESSIONS:
+                            SESSIONS[new_sid] = SESSIONS.pop(old_sid)
+                    # Migrate the per-session lock: alias new_sid to the held
+                    # _agent_lock reference directly (not via old_sid lookup),
+                    # then remove the old_sid entry to prevent a leak.
+                    with SESSION_AGENT_LOCKS_LOCK:
+                        SESSION_AGENT_LOCKS[new_sid] = _agent_lock
+                        SESSION_AGENT_LOCKS.pop(old_sid, None)
+                    if old_path.exists() and not new_path.exists():
+                        try:
+                            old_path.rename(new_path)
+                        except OSError:
+                            logger.debug("Failed to rename session file during compression")
+                    _compressed = True
+                # Also detect compression via the result dict or compressor state
+                if not _compressed:
+                    _compressor = getattr(agent, 'context_compressor', None)
+                    if _compressor and getattr(_compressor, 'compression_count', 0) > 0:
+                        _compressed = True
+                # Notify the frontend that compression happened
+                if _compressed:
+                    put('compressed', {
+                        'message': 'Context auto-compressed to continue the conversation',
+                    })
+
+                # Stamp 'timestamp' on any messages that don't have one yet
+                _now = time.time()
+                for _m in s.messages:
+                    if isinstance(_m, dict) and not _m.get('timestamp') and not _m.get('_ts'):
+                        _m['timestamp'] = int(_now)
+                # Only auto-generate title when still default; preserves user renames
+                if s.title == 'Untitled' or s.title == 'New Chat' or not s.title:
+                    s.title = title_from(s.messages, s.title)
+                _looks_default = (s.title == 'Untitled' or s.title == 'New Chat' or not s.title)
+                _looks_provisional = _is_provisional_title(s.title, s.messages)
+                _invalid_existing_title = _looks_invalid_generated_title(s.title)
+                _should_bg_title = (
+                    (_looks_default or _looks_provisional or _invalid_existing_title)
+                    and (not getattr(s, 'llm_title_generated', False) or _invalid_existing_title)
                )
-                if _is_quota:
-                    _err_label = 'Out of credits'
-                    _err_type = 'quota_exhausted'
-                    _err_hint = 'Your provider account is out of credits. Top up your balance or switch providers via `hermes model`.'
-                elif _is_auth:
-                    _err_label = 'Authentication failed'
-                    _err_type = 'auth_mismatch'
-                    _err_hint = (
-                        'The selected model may not be supported by your configured provider or '
-                        'your API key is invalid. Run `hermes model` in your terminal to '
-                        'update credentials, then restart the WebUI.'
-                    )
-                else:
-                    _err_label = 'No response received'
-                    _err_type = 'no_response'
-                    _err_hint = 'Verify your API key is valid and the selected model is available for your account.'
-                put('apperror', {
-                    'message': _err_str or f'{_err_label}.',
-                    'type': _err_type,
-                    'hint': _err_hint,
-                })
-                # Clear stream/pending state so the session does not appear
-                # "agent_running" on reload after a silent failure.
+                _u0 = ''
+                _a0 = ''
+                if _should_bg_title:
+                    _u0, _a0 = _first_exchange_snippets(s.messages)
+                # Read token/cost usage from the agent object (if available)
+                input_tokens = getattr(agent, 'session_prompt_tokens', 0) or 0
+                output_tokens = getattr(agent, 'session_completion_tokens', 0) or 0
+                estimated_cost = getattr(agent, 'session_estimated_cost_usd', None)
+                s.input_tokens = (s.input_tokens or 0) + input_tokens
+                s.output_tokens = (s.output_tokens or 0) + output_tokens
+                if estimated_cost:
+                    s.estimated_cost = (s.estimated_cost or 0) + estimated_cost
+                # Persist tool-call summaries even when the final message history only
+                # kept bare tool rows and omitted explicit assistant tool_call IDs.
+                tool_calls = _extract_tool_calls_from_messages(
+                    s.messages,
+                    live_tool_calls=_live_tool_calls,
+                )
+                s.tool_calls = tool_calls
                s.active_stream_id = None
                s.pending_user_message = None
                s.pending_attachments = []
                s.pending_started_at = None
-                # Persist the error so it survives page reload.
-                # _error=True ensures _sanitize_messages_for_api excludes it from
-                # subsequent API calls so the LLM never sees its own error as prior context.
-                s.messages.append({
-                    'role': 'assistant',
-                    'content': f'**{_err_label}:** {_err_str or _err_label}\n\n*{_err_hint}*',
-                    'timestamp': int(time.time()),
-                    '_error': True,
-                })
-                try:
-                    s.save()
-                except Exception:
-                    pass
-                return  # apperror already closes the stream on the client side
-
-            # ── Handle context compression side effects ──
-            # If compression fired inside run_conversation, the agent may have
-            # rotated its session_id. Detect and fix the mismatch so the WebUI
-            # continues writing to the correct session file.
-            _agent_sid = getattr(agent, 'session_id', None)
-            _compressed = False
-            if _agent_sid and _agent_sid != session_id:
-                old_sid = session_id
-                new_sid = _agent_sid
-                # Rename the session file
-                old_path = SESSION_DIR / f'{old_sid}.json'
-                new_path = SESSION_DIR / f'{new_sid}.json'
-                s.session_id = new_sid
-                with LOCK:
-                    if old_sid in SESSIONS:
-                        SESSIONS[new_sid] = SESSIONS.pop(old_sid)
-                if old_path.exists() and not new_path.exists():
-                    try:
-                        old_path.rename(new_path)
-                    except OSError:
-                        logger.debug("Failed to rename session file during compression")
-                _compressed = True
-            # Also detect compression via the result dict or compressor state
-            if not _compressed:
-                _compressor = getattr(agent, 'context_compressor', None)
-                if _compressor and getattr(_compressor, 'compression_count', 0) > 0:
-                    _compressed = True
-            # Notify the frontend that compression happened
-            if _compressed:
-                put('compressed', {
-                    'message': 'Context auto-compressed to continue the conversation',
-                })
-
-            # Stamp 'timestamp' on any messages that don't have one yet
-            _now = time.time()
-            for _m in s.messages:
-                if isinstance(_m, dict) and not _m.get('timestamp') and not _m.get('_ts'):
-                    _m['timestamp'] = int(_now)
-            # Only auto-generate title when still default; preserves user renames
-            if s.title == 'Untitled' or s.title == 'New Chat' or not s.title:
-                s.title = title_from(s.messages, s.title)
-            _looks_default = (s.title == 'Untitled' or s.title == 'New Chat' or not s.title)
-            _looks_provisional = _is_provisional_title(s.title, s.messages)
-            _invalid_existing_title = _looks_invalid_generated_title(s.title)
-            _should_bg_title = (
-                (_looks_default or _looks_provisional or _invalid_existing_title)
-                and (not getattr(s, 'llm_title_generated', False) or _invalid_existing_title)
-            )
-            _u0 = ''
-            _a0 = ''
-            if _should_bg_title:
-                _u0, _a0 = _first_exchange_snippets(s.messages)
-            # Read token/cost usage from the agent object (if available)
-            input_tokens = getattr(agent, 'session_prompt_tokens', 0) or 0
-            output_tokens = getattr(agent, 'session_completion_tokens', 0) or 0
-            estimated_cost = getattr(agent, 'session_estimated_cost_usd', None)
-            s.input_tokens = (s.input_tokens or 0) + input_tokens
-            s.output_tokens = (s.output_tokens or 0) + output_tokens
-            if estimated_cost:
-                s.estimated_cost = (s.estimated_cost or 0) + estimated_cost
-            # Persist tool-call summaries even when the final message history only
-            # kept bare tool rows and omitted explicit assistant tool_call IDs.
-            tool_calls = _extract_tool_calls_from_messages(
-                s.messages,
-                live_tool_calls=_live_tool_calls,
-            )
-            s.tool_calls = tool_calls
-            s.active_stream_id = None
-            s.pending_user_message = None
-            s.pending_attachments = []
-            s.pending_started_at = None
-            # Tag the matching user message with attachment filenames for display on reload
-            # Only tag a user message whose content relates to this turn's text
-            # (msg_text is the full message including the [Attached files: ...] suffix)
-            if attachments:
-                for m in reversed(s.messages):
-                    if m.get('role') == 'user':
-                        content = str(m.get('content', ''))
-                        # Match if content is part of the sent message or vice-versa
-                        base_text = msg_text.split('\n\n[Attached files:')[0].strip() if '\n\n[Attached files:' in msg_text else msg_text
-                        if base_text[:60] in content or content[:60] in msg_text:
-                            m['attachments'] = attachments
+                # Tag the matching user message with attachment filenames for display on reload
+                # Only tag a user message whose content relates to this turn's text
+                # (msg_text is the full message including the [Attached files: ...] suffix)
+                if attachments:
+                    for m in reversed(s.messages):
+                        if m.get('role') == 'user':
+                            content = str(m.get('content', ''))
+                            # Match if content is part of the sent message or vice-versa
+                            base_text = msg_text.split('\n\n[Attached files:')[0].strip() if '\n\n[Attached files:' in msg_text else msg_text
+                            if base_text[:60] in content or content[:60] in msg_text:
+                                m['attachments'] = attachments
+                                break
+                # Persist reasoning trace in the session so it survives reload.
+                # Must run BEFORE s.save() — otherwise the mutation lives only in
+                # memory until the next turn's save, and the last-turn thinking card
+                # is lost when the user reloads immediately after a response.
+                if _reasoning_text and s.messages:
+                    for _rm in reversed(s.messages):
+                        if isinstance(_rm, dict) and _rm.get('role') == 'assistant':
+                            _rm['reasoning'] = _reasoning_text
                            break
-            # Persist reasoning trace in the session so it survives reload.
-            # Must run BEFORE s.save() — otherwise the mutation lives only in
-            # memory until the next turn's save, and the last-turn thinking card
-            # is lost when the user reloads immediately after a response.
-            if _reasoning_text and s.messages:
-                for _rm in reversed(s.messages):
-                    if isinstance(_rm, dict) and _rm.get('role') == 'assistant':
-                        _rm['reasoning'] = _reasoning_text
-                        break
-            s.save()
+                s.save()
            # Sync to state.db for /insights (opt-in setting)
            try:
                from api.config import load_settings as _load_settings
@@ -1543,23 +1602,29 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
        else:
            _exc_label, _exc_type, _exc_hint = 'Error', 'error', ''
        if s is not None:
-            s.active_stream_id = None
-            s.pending_user_message = None
-            s.pending_attachments = []
-            s.pending_started_at = None
+            if _checkpoint_stop is not None:
+                _checkpoint_stop.set()
+            if _ckpt_thread is not None:
+                _ckpt_thread.join(timeout=15)
            # Persist the error so it survives page reload.
            # _error=True ensures _sanitize_messages_for_api excludes it from subsequent
            # API calls so the LLM never sees its own error as prior context on the next turn.
-            s.messages.append({
-                'role': 'assistant',
-                'content': f'**{_exc_label}:** {err_str}' + (f'\n\n*{_exc_hint}*' if _exc_hint else ''),
-                'timestamp': int(time.time()),
-                '_error': True,
-            })
-            try:
-                s.save()
-            except Exception:
-                pass
+            _lock_ctx = _agent_lock if _agent_lock is not None else contextlib.nullcontext()
+            with _lock_ctx:
+                s.active_stream_id = None
+                s.pending_user_message = None
+                s.pending_attachments = []
+                s.pending_started_at = None
+                s.messages.append({
+                    'role': 'assistant',
+                    'content': f'**{_exc_label}:** {err_str}' + (f'\n\n*{_exc_hint}*' if _exc_hint else ''),
+                    'timestamp': int(time.time()),
+                    '_error': True,
+                })
+                try:
+                    s.save()
+                except Exception:
+                    pass
        _apperror_payload: dict = {'message': err_str, 'type': _exc_type}
        if _exc_hint:
            _apperror_payload['hint'] = _exc_hint
@@ -1568,6 +1633,8 @@ def _run_agent_streaming(session_id, msg_text, model, workspace, stream_id, atta
        # Stop periodic checkpoint thread if it was started (Issue #765)
        if _checkpoint_stop is not None:
            _checkpoint_stop.set()
+        if _ckpt_thread is not None:
+            _ckpt_thread.join(timeout=15)
        _clear_thread_env()  # TD1: always clear thread-local context
        with STREAMS_LOCK:
            STREAMS.pop(stream_id, None)
@@ -1662,55 +1729,60 @@ def cancel_stream(stream_id: str) -> bool:
        _cancel_partial_text = STREAM_PARTIAL_TEXT.get(stream_id, '')

    # Session cleanup outside STREAMS_LOCK to preserve lock ordering.
+    # Acquire the per-session _agent_lock too, mirroring every other session
+    # writer (streaming success/error paths, periodic checkpoint, POST endpoints)
+    # so the cancel-path mutation races neither the checkpoint thread nor
+    # concurrent undo/retry calls.
    if _cancel_session_id:
-        try:
-            _cs = get_session(_cancel_session_id)
-            _cs.active_stream_id = None
-            _cs.pending_user_message = None
-            _cs.pending_attachments = []
-            _cs.pending_started_at = None
-            # Persist any partial assistant text that was streamed before cancel (#893).
-            # Preserving partial content means the user sees what the agent had
-            # produced rather than losing it entirely.  The marker is _partial=True
-            # (for session/UI identification) — NOT _error=True — so the partial
-            # content IS kept in the history sent to the agent on the next user
-            # message, letting the model continue from where it was cut off.
-            # See the inner comment on the append call below for the rationale.
-            partial_text = _cancel_partial_text.strip() if _cancel_partial_text else ''
-            if partial_text:
-                import re as _re
-                # Strip thinking/reasoning markup from partial content before saving.
-                # First pass: remove complete <think>...</think> and <thinking>...</thinking> blocks.
-                _stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*?</think(?:ing)?>',
-                                    '', partial_text,
-                                    flags=_re.DOTALL | _re.IGNORECASE).strip()
-                # Second pass: strip trailing UNCLOSED think/thinking block (the common
-                # cancel case — user stops mid-reasoning before the close tag appears).
-                _stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*',
-                                    '', _stripped,
-                                    flags=_re.DOTALL | _re.IGNORECASE).strip()
-                if _stripped:
-                    # Mark _partial=True for session/UI identification only.
-                    # Deliberately NOT _error=True — the partial content is real model
-                    # output and should be visible in conversation history so the model
-                    # can continue from it on the next turn (#893).
-                    _cs.messages.append({
-                        'role': 'assistant',
-                        'content': _stripped,
-                        '_partial': True,
-                        'timestamp': int(time.time()),
-                    })
-            # Cancel marker — flagged _error=True so it is stripped from conversation
-            # history on the next turn (prevents model from seeing "Task cancelled."
-            # as a prior assistant reply).
-            _cs.messages.append({
-                'role': 'assistant',
-                'content': '*Task cancelled.*',
-                '_error': True,
-                'timestamp': int(time.time()),
-            })
-            _cs.save()
-        except Exception:
-            logger.debug("Failed to clear session state on cancel for %s", _cancel_session_id)
+        with _get_session_agent_lock(_cancel_session_id):
+            try:
+                _cs = get_session(_cancel_session_id)
+                _cs.active_stream_id = None
+                _cs.pending_user_message = None
+                _cs.pending_attachments = []
+                _cs.pending_started_at = None
+                # Persist any partial assistant text that was streamed before cancel (#893).
+                # Preserving partial content means the user sees what the agent had
+                # produced rather than losing it entirely.  The marker is _partial=True
+                # (for session/UI identification only) — NOT _error=True — so the partial
+                # content IS kept in the history sent to the agent on the next user
+                # message, letting the model continue from where it was cut off.
+                # See the inner comment on the append call below for the rationale.
+                partial_text = _cancel_partial_text.strip() if _cancel_partial_text else ''
+                if partial_text:
+                    import re as _re
+                    # Strip thinking/reasoning markup from partial content before saving.
+                    # First pass: remove complete <thinking>...</thinking> blocks.
+                    _stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*?</think(?:ing)?>',
+                                        '', partial_text,
+                                        flags=_re.DOTALL | _re.IGNORECASE).strip()
+                    # Second pass: strip trailing UNCLOSED think/thinking block (the common
+                    # cancel case — user stops mid-reasoning before the close tag appears).
+                    _stripped = _re.sub(r'<think(?:ing)?\b[^>]*>.*',
+                                        '', _stripped,
+                                        flags=_re.DOTALL | _re.IGNORECASE).strip()
+                    if _stripped:
+                        # Mark _partial=True for session/UI identification only.
+                        # Deliberately NOT _error=True — the partial content is real model
+                        # output and should be visible in conversation history so the model
+                        # can continue from it on the next turn (#893).
+                        _cs.messages.append({
+                            'role': 'assistant',
+                            'content': _stripped,
+                            '_partial': True,
+                            'timestamp': int(time.time()),
+                        })
+                # Cancel marker — flagged _error=True so it is stripped from conversation
+                # history on the next turn (prevents model from seeing "Task cancelled."
+                # as a prior assistant reply).
+                _cs.messages.append({
+                    'role': 'assistant',
+                    'content': '*Task cancelled.*',
+                    '_error': True,
+                    'timestamp': int(time.time()),
+                })
+                _cs.save()
+            except Exception:
+                logger.debug("Failed to clear session state on cancel for %s", _cancel_session_id)

    return True
--- a/tests/test_issue765_streaming_persistence.py
+++ b/tests/test_issue765_streaming_persistence.py
@@ -255,13 +255,101 @@ class TestPeriodicCheckpoint:
        assert data["updated_at"] > ts_before, "Checkpoint should update updated_at"


-class TestCheckpointVariableLifecycle:
-    """Regression guard: the outer `finally` must not UnboundLocalError when an
-    exception fires before the checkpoint thread is created.  _checkpoint_stop
-    is initialised to None at the very top of the outer try block so the
-    finally's `if _checkpoint_stop is not None` branch is always safe.
+class TestIssue765FollowupHardening:
+    """Regression tests for the follow-up hardening pass on Issue #765.
+
+    Includes the guard that the outer `finally` must not UnboundLocalError when
+    an exception fires before the checkpoint thread is created.
    """

+    def test_same_session_concurrent_saves_use_distinct_temp_files(self, monkeypatch):
+        """Two concurrent saves of the same session must not collide on one tmp path.
+
+        The key regression guard here is that each save call should reach os.replace()
+        with a distinct source tmp path. With the old shared `<sid>.tmp` scheme, both
+        threads would target the same path and the second replace would deterministically
+        fail once the first consume/remove happened.
+        """
+        s = _make_session("same_sid")
+        s.save(skip_index=True)  # seed the file on disk
+
+        original_replace = models.os.replace
+        barrier = threading.Barrier(2)
+        replace_sources = []
+        errors = []
+
+        def _replace_with_barrier(src, dst):
+            replace_sources.append(str(src))
+            barrier.wait(timeout=5)
+            return original_replace(src, dst)
+
+        monkeypatch.setattr(models.os, "replace", _replace_with_barrier)
+
+        def _save_worker():
+            try:
+                s.save(skip_index=True)
+            except Exception as e:
+                errors.append(e)
+
+        t1 = threading.Thread(target=_save_worker)
+        t2 = threading.Thread(target=_save_worker)
+        t1.start()
+        t2.start()
+        t1.join(timeout=5)
+        t2.join(timeout=5)
+
+        assert not errors, f"Concurrent same-session saves should not fail: {errors}"
+        assert len(replace_sources) == 2, f"Expected 2 replace calls, got {replace_sources}"
+        assert len(set(replace_sources)) == 2, (
+            "Concurrent same-session saves must use distinct temp files; "
+            f"got {replace_sources}"
+        )
+        data = json.loads(s.path.read_text(encoding="utf-8"))
+        assert data["session_id"] == "same_sid"
+
+    def test_success_path_joins_checkpoint_before_session_mutation(self):
+        """Static guard: success path must stop/join checkpoint thread before mutating.
+
+        This keeps the post-run_conversation session rewrite serialized relative to the
+        periodic checkpoint worker.
+        """
+        src = (Path(__file__).parent.parent / "api" / "streaming.py").read_text(
+            encoding="utf-8"
+        )
+        stop_idx = src.find("if _checkpoint_stop is not None:\n                _checkpoint_stop.set()")
+        join_idx = src.find("if _ckpt_thread is not None:\n                _ckpt_thread.join(timeout=15)")
+        lock_idx = src.find("with _agent_lock:\n                s.messages = _restore_reasoning_metadata(")
+        save_idx = src.find("s.messages = _restore_reasoning_metadata(")
+
+        assert stop_idx != -1, "Success path must stop the checkpoint thread"
+        assert join_idx != -1, "Success path must join the checkpoint thread"
+        assert lock_idx != -1, "Success path must serialize mutation with _agent_lock"
+        assert save_idx != -1, "Success path restore/mutation block not found"
+        assert stop_idx < join_idx < lock_idx <= save_idx, (
+            "Checkpoint stop/join must happen before the success-path session mutation block"
+        )
+
+    def test_silent_failure_path_does_not_reacquire_agent_lock(self):
+        """Silent-failure path must not nest `_agent_lock` inside the success lock.
+
+        Reacquiring the same per-session lock inside the post-run_conversation block
+        deadlocks because `_get_session_agent_lock()` returns a non-reentrant Lock.
+        """
+        src = (Path(__file__).parent.parent / "api" / "streaming.py").read_text(
+            encoding="utf-8"
+        )
+        outer_lock_idx = src.find("with _agent_lock:\n                s.messages = _restore_reasoning_metadata(")
+        silent_failure_idx = src.find("if not _assistant_added and not _token_sent:")
+        inner_lock_idx = src.find("with _agent_lock:", outer_lock_idx + 1)
+        compression_idx = src.find("# ── Handle context compression side effects ──")
+
+        assert outer_lock_idx != -1, "Outer success-path _agent_lock block not found"
+        assert silent_failure_idx != -1, "Silent-failure branch not found"
+        assert compression_idx != -1, "Compression marker not found"
+        assert not (
+            inner_lock_idx != -1 and silent_failure_idx < inner_lock_idx < compression_idx
+        ), "Silent-failure path must not reacquire _agent_lock inside the outer lock"
+
    def test_checkpoint_stop_initialised_before_any_raiseable_code(self):
        """Static check: `_checkpoint_stop = None` must appear before any code
        that could raise inside _run_agent_streaming's outer try."""
@@ -271,7 +359,11 @@ class TestCheckpointVariableLifecycle:
        lines = src.splitlines()
        try_line = next(
            i for i, ln in enumerate(lines, 1)
-            if ln.rstrip().endswith("try:") and lines[i - 2].strip().startswith("_checkpoint_stop")
+            if ln.rstrip().endswith("try:")
+            and any(
+                lines[j].strip().startswith("_checkpoint_stop = None")
+                for j in range(max(0, i - 4), i - 1)
+            )
        )
        # The assignment must precede the `try:` — not sit inside the nested
        # block where an earlier line could raise before it runs.
@@ -302,3 +394,446 @@ class TestCheckpointVariableLifecycle:

        with pytest.raises(ValueError, match="early failure"):
            mimic_run_agent_streaming()
+
+    def test_agent_lock_null_guard_in_except_block(self):
+        """The except block must not crash with AttributeError when _agent_lock
+        is None (e.g. when get_session succeeds but _get_session_agent_lock
+        hasn't been called yet, or _get_session_agent_lock itself raised).
+
+        The code must use a nullcontext fallback rather than unconditionally
+        entering `with _agent_lock:`."""
+        src = (Path(__file__).parent.parent / "api" / "streaming.py").read_text(
+            encoding="utf-8"
+        )
+        # Verify contextlib.nullcontext is used as a fallback
+        assert "contextlib.nullcontext()" in src, (
+            "The except block must guard _agent_lock being None by falling "
+            "back to contextlib.nullcontext() instead of unconditionally "
+            "entering `with _agent_lock:`"
+        )
+        # Verify the except block uses _lock_ctx (the guarded variable)
+        assert "_lock_ctx" in src, (
+            "The except block must assign _agent_lock / nullcontext to a "
+            "variable and use it, not enter `with _agent_lock:` directly"
+        )
+
+    def test_periodic_checkpoint_uses_agent_lock(self):
+        """The periodic checkpoint thread must hold _agent_lock while saving
+        to prevent concurrent mutation races with other endpoints."""
+        src = (Path(__file__).parent.parent / "api" / "streaming.py").read_text(
+            encoding="utf-8"
+        )
+        # Find the _periodic_checkpoint function
+        ckpt_idx = src.find("def _periodic_checkpoint():")
+        assert ckpt_idx != -1, "_periodic_checkpoint function not found"
+        ckpt_block = src[ckpt_idx:ckpt_idx + 600]
+        assert "with _agent_lock:" in ckpt_block, (
+            "_periodic_checkpoint must hold _agent_lock while calling s.save() "
+            "to prevent race conditions with other session-mutating endpoints"
+        )
+
+    def test_background_title_update_rebinds_to_canonical_session_instance(self):
+        """Guard against stale Session object mutation after LLM round-trip.
+
+        _run_background_title_update must re-bind `s` to SESSIONS.get(session_id,
+        s) under LOCK before deciding whether a manual rename should block the
+        generated title write.
+        """
+        src = (Path(__file__).parent.parent / "api" / "streaming.py").read_text(
+            encoding="utf-8"
+        )
+        fn_idx = src.find("def _run_background_title_update(")
+        assert fn_idx != -1, "_run_background_title_update not found"
+        fn_block = src[fn_idx:fn_idx + 3200]
+        assert "with LOCK:" in fn_block, (
+            "_run_background_title_update must acquire LOCK before rebinding "
+            "to canonical cached session instance"
+        )
+        assert "s = SESSIONS.get(session_id, s)" in fn_block, (
+            "_run_background_title_update must rebind to canonical cached "
+            "session instance under LOCK"
+        )
+
+    def test_cancel_stream_uses_agent_lock(self):
+        """cancel_stream must hold _agent_lock during session cleanup to
+        prevent races with checkpoint saves and other writers."""
+        src = (Path(__file__).parent.parent / "api" / "streaming.py").read_text(
+            encoding="utf-8"
+        )
+        cancel_idx = src.find("def cancel_stream(")
+        assert cancel_idx != -1, "cancel_stream function not found"
+        cancel_block = src[cancel_idx:]
+        # Find the session cleanup section
+        cleanup_idx = cancel_block.find("Session cleanup outside STREAMS_LOCK")
+        assert cleanup_idx != -1, "Session cleanup comment not found in cancel_stream"
+        cleanup_section = cancel_block[cleanup_idx:cleanup_idx + 800]
+        assert "_get_session_agent_lock" in cleanup_section, (
+            "cancel_stream must acquire _get_session_agent_lock during "
+            "session cleanup to serialise with the checkpoint thread and "
+            "other session-mutating endpoints"
+        )
+
+    def test_session_ops_retry_undo_hold_agent_lock(self):
+        """retry_last and undo_last must hold _get_session_agent_lock for the
+        entire read-modify-save cycle."""
+        src = (Path(__file__).parent.parent / "api" / "session_ops.py").read_text(
+            encoding="utf-8"
+        )
+        assert "_get_session_agent_lock" in src, (
+            "session_ops must import _get_session_agent_lock"
+        )
+        # Both functions must use with _get_session_agent_lock(session_id):
+        for func_name in ("retry_last", "undo_last"):
+            func_idx = src.find(f"def {func_name}(")
+            assert func_idx != -1, f"{func_name} not found in session_ops.py"
+            func_block = src[func_idx:func_idx + 1200]
+            assert "with _get_session_agent_lock" in func_block, (
+                f"{func_name} must wrap its read-modify-save cycle in "
+                f"with _get_session_agent_lock(session_id)"
+            )
+
+    def test_periodic_checkpoint_mutation_race_with_undo_last(self, tmp_path, monkeypatch):
+        """Run _periodic_checkpoint against a session whose messages list is
+        concurrently truncated by undo_last; the on-disk JSON must remain
+        parseable and internally consistent.
+
+        The simulated checkpoint mirrors production by acquiring
+        _get_session_agent_lock around s.save(), and we assert that every
+        on-disk snapshot's messages list is one of the allowed snapshots
+        (never an interleaving of fields from two different saves).
+        """
+        session_dir = tmp_path / "sessions_undo_race"
+        session_dir.mkdir()
+        index_file = session_dir / "_index.json"
+        monkeypatch.setattr(models, "SESSION_DIR", session_dir)
+        monkeypatch.setattr(models, "SESSION_INDEX_FILE", index_file)
+        models.SESSIONS.clear()
+        try:
+            s = Session(
+                session_id="race_test",
+                title="Race Test",
+                messages=[
+                    {"role": "user", "content": "first"},
+                    {"role": "assistant", "content": "reply 1"},
+                    {"role": "user", "content": "second"},
+                    {"role": "assistant", "content": "reply 2"},
+                    {"role": "user", "content": "third"},
+                    {"role": "assistant", "content": "reply 3"},
+                ],
+            )
+            s.save()
+            models.SESSIONS[s.session_id] = s
+
+            _checkpoint_stop = threading.Event()
+            _checkpoint_activity = [0]
+            errors = []
+            # Collect every on-disk messages snapshot observed by the
+            # checkpoint thread so we can assert atomicity after the run.
+            checkpoint_snapshots = []
+            _lock = threading.Lock()
+
+            from api.config import _get_session_agent_lock
+            _agent_lock = _get_session_agent_lock("race_test")
+
+            def _periodic_checkpoint():
+                last = 0
+                while not _checkpoint_stop.wait(0.01):
+                    try:
+                        cur = _checkpoint_activity[0]
+                        if cur > last:
+                            with _agent_lock:
+                                s.save(skip_index=True)
+                            # Read back the on-disk JSON to verify atomicity
+                            try:
+                                snap = json.loads(s.path.read_text())
+                                with _lock:
+                                    checkpoint_snapshots.append(snap.get("messages"))
+                            except Exception:
+                                pass
+                            last = cur
+                    except Exception as e:
+                        errors.append(e)
+
+            t = threading.Thread(target=_periodic_checkpoint, daemon=True)
+            t.start()
+
+            from api.session_ops import undo_last
+            # Collect the allowed message snapshots (each state the session
+            # is in at a point where a checkpoint might observe it).
+            allowed_message_snapshots = []
+            # The initial state (before any undo) is a valid checkpoint target.
+            allowed_message_snapshots.append(
+                [dict(m) if isinstance(m, dict) else m for m in s.messages]
+            )
+            for _ in range(5):
+                _checkpoint_activity[0] += 1
+                time.sleep(0.02)
+                try:
+                    undo_last("race_test")
+                except ValueError:
+                    pass
+                # Record the post-undo state (before appending new messages)
+                # as an allowed snapshot — the checkpoint may observe this.
+                allowed_message_snapshots.append(
+                    [dict(m) if isinstance(m, dict) else m for m in s.messages]
+                )
+                # Wrap mutation + save in _agent_lock to mirror production
+                # paths and prevent the checkpoint from observing an
+                # intermediate +1-message snapshot.
+                with _agent_lock:
+                    s.messages.append({"role": "user", "content": f"msg-{_}"})
+                    s.messages.append({"role": "assistant", "content": f"ans-{_}"})
+                    # Record the in-memory messages list *before* save so we
+                    # can verify that every checkpoint snapshot matches one
+                    # of these.
+                    allowed_message_snapshots.append(
+                        [dict(m) if isinstance(m, dict) else m for m in s.messages]
+                    )
+                    s.save()
+
+            _checkpoint_stop.set()
+            t.join(timeout=2)
+
+            assert not errors, f"Checkpoint thread encountered errors: {errors}"
+            # Verify the on-disk JSON is parseable
+            data = json.loads(s.path.read_text())
+            assert data["session_id"] == "race_test"
+            # Messages must be a list (not corrupted by concurrent mutation)
+            assert isinstance(data["messages"], list)
+            # Contract assertion: every checkpoint snapshot's messages must
+            # equal one of the allowed in-memory snapshots, never an
+            # interleaving of fields from two different saves.  This assertion
+            # has teeth: if the _agent_lock were removed from the checkpoint
+            # or the undo path, concurrent mutations would produce snapshots
+            # that match no allowed state (e.g. a list with some messages
+            # from before undo and some from after).
+            for snap_msgs in checkpoint_snapshots:
+                if snap_msgs is None:
+                    continue
+                # Normalize for comparison (strip display-only metadata)
+                normalized = [
+                    {k: v for k, v in m.items() if k in ("role", "content")}
+                    if isinstance(m, dict) else m
+                    for m in snap_msgs
+                ]
+                matched = False
+                for allowed in allowed_message_snapshots:
+                    norm_allowed = [
+                        {k: v for k, v in m.items() if k in ("role", "content")}
+                        if isinstance(m, dict) else m
+                        for m in allowed
+                    ]
+                    if normalized == norm_allowed:
+                        matched = True
+                        break
+                assert matched, (
+                    f"Checkpoint snapshot {normalized!r} does not match any "
+                    f"allowed state — this indicates a serialization failure "
+                    f"(the _agent_lock is not preventing interleaved writes)."
+                )
+        finally:
+            models.SESSIONS.clear()
+
+    def test_cancel_stream_concurrent_checkpoint_produces_valid_json(self, tmp_path, monkeypatch):
+        """Run cancel_stream while a _periodic_checkpoint thread is concurrently
+        saving the same session; the resulting on-disk JSON must be parseable
+        and active_stream_id must be None.
+
+        The simulated checkpoint mirrors production by acquiring
+        _get_session_agent_lock around s.save(), and we assert that every
+        on-disk snapshot is internally consistent (never an interleaving
+        of fields from two different saves).
+        """
+        session_dir = tmp_path / "sessions_cancel_race"
+        session_dir.mkdir()
+        index_file = session_dir / "_index.json"
+        monkeypatch.setattr(models, "SESSION_DIR", session_dir)
+        monkeypatch.setattr(models, "SESSION_INDEX_FILE", index_file)
+        models.SESSIONS.clear()
+        try:
+            s = Session(
+                session_id="cancel_race",
+                title="Cancel Race Test",
+                messages=[
+                    {"role": "user", "content": "hello"},
+                    {"role": "assistant", "content": "world"},
+                ],
+                active_stream_id="stream-abc",
+            )
+            s.save()
+            models.SESSIONS[s.session_id] = s
+
+            _checkpoint_stop = threading.Event()
+            _checkpoint_activity = [0]
+            errors = []
+            # Collect every on-disk snapshot observed by the checkpoint thread.
+            checkpoint_snapshots = []
+            _snap_lock = threading.Lock()
+
+            from api.config import _get_session_agent_lock
+            _agent_lock = _get_session_agent_lock("cancel_race")
+
+            def _periodic_checkpoint():
+                last = 0
+                while not _checkpoint_stop.wait(0.01):
+                    try:
+                        cur = _checkpoint_activity[0]
+                        if cur > last:
+                            with _agent_lock:
+                                s.save(skip_index=True)
+                            # Read back the on-disk JSON to verify atomicity
+                            try:
+                                snap = json.loads(s.path.read_text())
+                                with _snap_lock:
+                                    checkpoint_snapshots.append(snap)
+                            except Exception:
+                                pass
+                            last = cur
+                    except Exception as e:
+                        errors.append(e)
+
+            t = threading.Thread(target=_periodic_checkpoint, daemon=True)
+            t.start()
+
+            # Simulate cancel_stream session cleanup directly
+            for i in range(10):
+                _checkpoint_activity[0] += 1
+                time.sleep(0.01)
+                with _get_session_agent_lock("cancel_race"):
+                    s.active_stream_id = None
+                    s.pending_user_message = None
+                    s.pending_attachments = []
+                    s.pending_started_at = None
+                    s.save()
+
+            _checkpoint_stop.set()
+            t.join(timeout=2)
+
+            assert not errors, f"Checkpoint thread encountered errors: {errors}"
+            data = json.loads(s.path.read_text())
+            assert data["session_id"] == "cancel_race"
+            assert data["active_stream_id"] is None, (
+                "active_stream_id must be None after cancel cleanup"
+            )
+            assert isinstance(data["messages"], list)
+            # Contract assertion: every checkpoint snapshot must be
+            # internally consistent (no interleaving of fields from two
+            # different saves).  Because both the cancel cleanup and the
+            # checkpoint hold the same _agent_lock, they are serialized —
+            # but ordering is nondeterministic, so a snapshot taken
+            # *before* cancel will see active_stream_id="stream-abc" and
+            # one taken *after* will see None.  The guarantee is that
+            # each snapshot is self-consistent, never a partial mix.
+            #
+            # This assertion has teeth: if the _agent_lock were removed
+            # from either the checkpoint or the cancel path, a snapshot
+            # could see active_stream_id=None while pending_user_message
+            # still holds the pre-cancel value — a partial state that
+            # violates the atomicity contract.
+            for snap in checkpoint_snapshots:
+                assert isinstance(snap.get("messages"), list), (
+                    "Checkpoint snapshot messages must be a list"
+                )
+                assert snap.get("active_stream_id") in ("stream-abc", None), (
+                    "Checkpoint snapshot active_stream_id must be either "
+                    "the initial value or None (serialized, not interleaved), "
+                    f"got {snap.get('active_stream_id')!r}"
+                )
+                # When active_stream_id is None, the cancel cleanup must
+                # have run — so all four cancel fields must be cleared
+                # atomically.  A partial state (e.g. active_stream_id=None
+                # but pending_user_message still set) would indicate a
+                # serialization failure.
+                if snap.get("active_stream_id") is None:
+                    assert snap.get("pending_user_message") is None, (
+                        "Snapshot with active_stream_id=None must also have "
+                        "pending_user_message=None (atomic cancel cleanup "
+                        "under _agent_lock)"
+                    )
+                    assert snap.get("pending_attachments") == [] or snap.get("pending_attachments") is None, (
+                        "Snapshot with active_stream_id=None must also have "
+                        "empty pending_attachments (atomic cancel cleanup "
+                        "under _agent_lock)"
+                    )
+                    assert snap.get("pending_started_at") is None, (
+                        "Snapshot with active_stream_id=None must also have "
+                        "pending_started_at=None (atomic cancel cleanup "
+                        "under _agent_lock)"
+                    )
+        finally:
+            models.SESSIONS.clear()
+
+    def test_lock_identity_preserved_after_session_id_rotation(self):
+        """When compression rotates session_id, the per-session lock must be
+        aliased so that _get_session_agent_lock(new_sid) returns the *same*
+        Lock object as _get_session_agent_lock(old_sid).
+
+        This is a static guard: it directly simulates the migration that
+        streaming.py performs inside the compression rotation block.
+        """
+        from api.config import (
+            _get_session_agent_lock,
+            SESSION_AGENT_LOCKS,
+            SESSION_AGENT_LOCKS_LOCK,
+        )
+        old_sid = "pre-rotation-id"
+        new_sid = "post-rotation-id"
+
+        # Acquire the lock under the old ID
+        old_lock = _get_session_agent_lock(old_sid)
+
+        # Simulate the migration that streaming.py does during compression:
+        # alias new_sid → held _agent_lock reference, then pop old_sid.
+        _agent_lock = old_lock
+        with SESSION_AGENT_LOCKS_LOCK:
+            SESSION_AGENT_LOCKS[new_sid] = _agent_lock
+            SESSION_AGENT_LOCKS.pop(old_sid, None)
+
+        # Now looking up the new ID must return the exact same Lock object
+        new_lock = _get_session_agent_lock(new_sid)
+        assert new_lock is old_lock, (
+            f"After rotation, _get_session_agent_lock({new_sid!r}) must "
+            f"return the same Lock object as _get_session_agent_lock({old_sid!r}); "
+            f"got {new_lock!r} vs {old_lock!r}"
+        )
+
+        # The old ID entry must no longer exist (it was popped)
+        with SESSION_AGENT_LOCKS_LOCK:
+            assert old_sid not in SESSION_AGENT_LOCKS, (
+                f"Old session ID {old_sid!r} must be removed from "
+                f"SESSION_AGENT_LOCKS after rotation"
+            )
+
+        # Cleanup
+        with SESSION_AGENT_LOCKS_LOCK:
+            SESSION_AGENT_LOCKS.pop(new_sid, None)
+
+    def test_lock_rotation_migration_survives_old_id_already_pruned(self):
+        """Compression lock migration must not require old_sid to exist in dict.
+
+        A concurrent /api/session/delete can prune old_sid before rotation code
+        runs. The migration must still succeed by assigning the held _agent_lock
+        reference directly.
+        """
+        from api.config import (
+            _get_session_agent_lock,
+            SESSION_AGENT_LOCKS,
+            SESSION_AGENT_LOCKS_LOCK,
+        )
+        old_sid = "pre-rotation-pruned"
+        new_sid = "post-rotation-pruned"
+
+        _agent_lock = _get_session_agent_lock(old_sid)
+        with SESSION_AGENT_LOCKS_LOCK:
+            SESSION_AGENT_LOCKS.pop(old_sid, None)  # simulate concurrent prune
+
+        # Must not raise KeyError even though old_sid is absent.
+        with SESSION_AGENT_LOCKS_LOCK:
+            SESSION_AGENT_LOCKS[new_sid] = _agent_lock
+            SESSION_AGENT_LOCKS.pop(old_sid, None)
+
+        new_lock = _get_session_agent_lock(new_sid)
+        assert new_lock is _agent_lock
+
+        with SESSION_AGENT_LOCKS_LOCK:
+            SESSION_AGENT_LOCKS.pop(new_sid, None)
--- a/tests/test_session_index.py
+++ b/tests/test_session_index.py
@@ -382,6 +382,56 @@ def test_deadlock_guard_on_fallback():
    assert isinstance(index, list)


+def test_incremental_index_disk_io_runs_outside_lock(monkeypatch):
+    """Fast-path index write (updates=[...]) must not hold LOCK while fsync runs."""
+
+    # Seed an on-disk session and an initial index before the updates=[sA] call.
+    sA = _make_session("sess_a", "Alpha", updated_at=100.0)
+    sA.path.write_text(json.dumps(sA.__dict__, ensure_ascii=False, indent=2), encoding="utf-8")
+    _write_session_index(updates=None)  # seed index
+
+    sA.title = "Alpha V2"
+    sA.updated_at = 200.0
+
+    fsync_lock_states = []  # one models.LOCK.locked() sample per observed fsync call
+    original_fsync = models.os.fsync
+
+    def _observing_fsync(fd):
+        fsync_lock_states.append(models.LOCK.locked())  # True => fsync ran while LOCK was held
+        return original_fsync(fd)  # still perform the real flush
+
+    monkeypatch.setattr(models.os, "fsync", _observing_fsync)  # installed after seeding
+
+    _write_session_index(updates=[sA])
+
+    assert fsync_lock_states, "Expected at least one fsync call during index write"
+    assert not any(fsync_lock_states), (
+        "_write_session_index fast path must not hold LOCK during fsync/disk I/O"
+    )
+
+
+def test_full_rebuild_index_disk_io_runs_outside_lock(monkeypatch):
+    """Full-rebuild index write (updates=None) must not hold LOCK while fsync runs."""
+    sA = _make_session("sess_a", "Alpha", updated_at=100.0)
+    sA.path.write_text(json.dumps(sA.__dict__, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    fsync_lock_states = []  # one models.LOCK.locked() sample per observed fsync call
+    original_fsync = models.os.fsync
+
+    def _observing_fsync(fd):
+        fsync_lock_states.append(models.LOCK.locked())  # True => fsync ran while LOCK was held
+        return original_fsync(fd)  # still perform the real flush
+
+    monkeypatch.setattr(models.os, "fsync", _observing_fsync)
+
+    _write_session_index(updates=None)  # the call under observation
+
+    assert fsync_lock_states, "Expected at least one fsync call during index write"
+    assert not any(fsync_lock_states), (
+        "_write_session_index full rebuild must not hold LOCK during fsync/disk I/O"
+    )
+
+
 def test_all_sessions_ignores_stale_index_entries():
    """Reading via all_sessions() must not surface ghost rows from _index.json."""
    index_file = models.SESSION_INDEX_FILE
--- a/tests/test_sprint41.py
+++ b/tests/test_sprint41.py
@@ -164,7 +164,7 @@ class TestIssue495TitleStreaming(unittest.TestCase):
        # After the stream_end fix, title uses original session_id param (not s.session_id
        # which can be rotated during context compression — see #652 fix)
        self.assertIn(
-            "put_event('title', {'session_id': session_id, 'title': s.title})",
+            "put_event('title', {'session_id': session_id, 'title': effective_title})",
            STREAMING_PY,
            "streaming.py should emit a title SSE event when title is updated",
        )