fix: persist onboarding_completed for CLI-configured users on first chat_ready (#922)

* fix: persist onboarding_completed for CLI-configured users on first chat_ready (v0.50.179, #921) Co-authored-by: bsgdigital * fix(onboarding): don't 500 the status endpoint if save_settings fails The #921 persist call `save_settings({"onboarding_completed": True})` in get_onboarding_status() raises if the settings.json write fails (read-only filesystem, disk full, permission error). That turns every /api/onboarding/status call into a 500 until the disk is writable, which is much worse UX than losing the persistence-across-restart guard. Wrapped in try/except so persistence becomes best-effort. The function still sets settings["onboarding_completed"] = True in memory on success, and `completed` reflects `config_auto_completed` on this request either way, so the user sees the right state even when the write fails — only the next-restart protection degrades. Added regression test that patches save_settings to raise OSError and asserts the endpoint still returns completed=True without raising. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: nesquena-hermes <nesquena-hermes@users.noreply.github.com> Co-authored-by: Nathan Esquenazi <nesquena@gmail.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 15:46:02 -07:00
parent 1011918d50
commit a3647570fb
3 changed files with 93 additions and 14 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,6 +29,11 @@
  workspace subtree) and never enumerate blocked system roots. (`api/routes.py`,
  `api/workspace.py`, `static/panels.js`, `static/style.css`) (partial for #616)
 ## [v0.50.179] — 2026-04-23
 ### Fixed
 - **Onboarding wizard clobbering CLI users' config after server restart** — CLI-configured users (who set up via `hermes model` / `hermes auth`) had no `onboarding_completed` flag in `settings.json`. After a git branch switch or server restart, `verify_hermes_imports()` could momentarily return `imports_ok=False`, making `chat_ready=False` and causing the wizard to reappear with a destructive dropdown default (openrouter). Fixed by writing `onboarding_completed: True` to `settings.json` the first time `config_auto_completed` evaluates to `True`, so the flag survives future transient import failures. (`api/onboarding.py`) Co-authored by @bsgdigital.
 ## [v0.50.177] — 2026-04-23
 ### Fixed
--- a/api/onboarding.py
+++ b/api/onboarding.py
@@ -435,6 +435,25 @@ def get_onboarding_status() -> dict:
    config_exists = Path(_get_config_path()).exists()
    config_auto_completed = config_exists and bool(runtime.get("chat_ready"))
    # Persist the flag so it survives future transient import failures (e.g. after
    # a git branch switch in the hermes-agent repo).  Without this, a CLI-configured
    # user who never ran the wizard has no onboarding_completed flag — any momentary
    # imports_ok=False during restart makes chat_ready=False, config_auto_completed=False,
    # and the wizard reappears with a broken dropdown that clobbers their config.
    #
    # Best-effort: if save_settings raises (read-only FS, disk full, permission error),
    # log and continue.  The `config_auto_completed` branch of `completed=` below still
    # returns True for this request, so the user sees the correct state — only the
    # persistence-across-restart guarantee is degraded.  Raising here would turn every
    # /api/onboarding/status call into a 500 until disk was writable, which is worse UX
    # than losing the next-restart protection.
    if config_auto_completed and not settings.get("onboarding_completed"):
        try:
            save_settings({"onboarding_completed": True})
            settings["onboarding_completed"] = True
        except Exception:
            logger.debug("Failed to persist onboarding_completed", exc_info=True)
    return {
        "completed": bool(settings.get("onboarding_completed")) or auto_completed or config_auto_completed,
        "settings": {
--- a/tests/test_onboarding_existing_config.py
+++ b/tests/test_onboarding_existing_config.py
@@ -116,6 +116,50 @@ class TestOnboardingGate:
        assert "config_exists" in result["system"]
        assert result["system"]["config_exists"] is True
    def test_persist_failure_does_not_break_status_endpoint(self):
        """save_settings() failure (read-only FS, disk full) must not turn the
        status endpoint into a 500.  The persistence-across-restart guarantee
        degrades but `completed` still reflects the live `config_auto_completed`
        signal so the user isn't blocked from using the UI.
        """
        import api.onboarding as mod
        settings = {"onboarding_completed": False}
        runtime = {
            "chat_ready": True,
            "provider_configured": True,
            "provider_ready": True,
            "setup_state": "ready",
            "provider_note": "test",
            "current_provider": "openrouter",
            "current_model": "anthropic/claude-sonnet-4.6",
            "current_base_url": None,
            "env_path": "/tmp/.hermes_test/.env",
        }
        fake_config_path = pathlib.Path("/tmp/_test_config.yaml")
        with (
            mock.patch.object(mod, "load_settings", return_value=settings),
            mock.patch.object(mod, "get_config", return_value={}),
            mock.patch.object(mod, "verify_hermes_imports", return_value=(True, [], {})),
            mock.patch.object(mod, "_status_from_runtime", return_value=runtime),
            mock.patch.object(mod, "load_workspaces", return_value=[]),
            mock.patch.object(mod, "get_last_workspace", return_value=None),
            mock.patch.object(mod, "get_available_models", return_value=[]),
            mock.patch.object(mod, "_get_config_path", return_value=fake_config_path),
            mock.patch.object(pathlib.Path, "exists", return_value=True),
            mock.patch.object(
                mod, "save_settings", side_effect=OSError("read-only filesystem")
            ),
        ):
            # Must not raise — persistence failure is best-effort.
            result = mod.get_onboarding_status()
        # completed still reflects the live signal via config_auto_completed
        assert result["completed"] is True, (
            "Status endpoint must still return completed=True via the live "
            "config_auto_completed signal when persistence fails"
        )
 class TestApplyOnboardingSetupGuard:
    """Fix #2: apply_onboarding_setup must not silently overwrite config.yaml."""
@@ -303,20 +347,27 @@ class TestOnboardingGateIntegration:
        # Write a fake API key so provider_ready (and thus chat_ready) fires
        # — but only when hermes_cli imports are available
        data, _ = _http_get("/api/onboarding/status")
-        if data["system"]["hermes_found"] and data["system"]["imports_ok"]:
+        try:
-            (hermes_home / ".env").write_text(
+            if data["system"]["hermes_found"] and data["system"]["imports_ok"]:
-                "OPENROUTER_API_KEY=test-existing-key\n", encoding="utf-8"
+                (hermes_home / ".env").write_text(
-            )
+                    "OPENROUTER_API_KEY=test-e...\n", encoding="utf-8"
-            data, status = _http_get("/api/onboarding/status")
+                )
-            assert status == 200
+                data, status = _http_get("/api/onboarding/status")
-            assert data["completed"] is True, (
+                assert status == 200
-                "Existing config + chat_ready must auto-complete onboarding."
+                assert data["completed"] is True, (
-            )
+                    "Existing config + chat_ready must auto-complete onboarding."
-        else:
+                )
-            # Agent not installed: chat_ready is always False, so wizard still
+            else:
-            # fires — that is the correct behaviour (can't verify readiness).
+                # Agent not installed: chat_ready is always False, so wizard still
-            assert data["completed"] is False
+                # fires — that is the correct behaviour (can't verify readiness).
-
+                assert data["completed"] is False
        finally:
            # Clean up: the auto-persist in get_onboarding_status() (#921) writes
            # onboarding_completed=True to settings.json when config_auto_completed fires.
            # Reset to avoid contaminating subsequent tests.
            (hermes_home / "config.yaml").unlink(missing_ok=True)
            (hermes_home / ".env").unlink(missing_ok=True)
            _http_post("/api/settings", {"onboarding_completed": False})
    @_needs_yaml
    def test_setup_blocked_for_existing_config(self):
        """POST /api/onboarding/setup must return config_exists error if config.yaml exists."""
@@ -366,3 +417,7 @@ class TestOnboardingGateIntegration:
        assert data.get("error") != "config_exists", (
            "confirm_overwrite=True must bypass the guard."
        )
        # Clean up so onboarding_completed=True left by this test's setup call
        # does not contaminate subsequent tests (#921 test isolation).
        (hermes_home / "config.yaml").unlink(missing_ok=True)
        _http_post("/api/settings", {"onboarding_completed": False})