fix: persist onboarding_completed for CLI-configured users on first chat_ready (#922)

* fix: persist onboarding_completed for CLI-configured users on first chat_ready (v0.50.179, #921)

Co-authored-by: bsgdigital

* fix(onboarding): don't 500 the status endpoint if save_settings fails

The #921 persist call `save_settings({"onboarding_completed": True})` in
get_onboarding_status() raises if the settings.json write fails
(read-only filesystem, disk full, permission error). That turns every
/api/onboarding/status call into a 500 until the disk is writable,
which is much worse UX than losing the persistence-across-restart guard.

The call is now wrapped in try/except so persistence is best-effort. On
success the function still sets settings["onboarding_completed"] = True
in memory; on failure it logs the exception and continues. Either way
`completed` reflects `config_auto_completed` on this request, so the
user sees the right state even when the write fails — only the
next-restart protection degrades.

Added regression test that patches save_settings to raise OSError and
asserts the endpoint still returns completed=True without raising.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: nesquena-hermes <nesquena-hermes@users.noreply.github.com>
Co-authored-by: Nathan Esquenazi <nesquena@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
nesquena-hermes
2026-04-23 15:46:02 -07:00
committed by GitHub
parent 1011918d50
commit a3647570fb
3 changed files with 93 additions and 14 deletions

View File

@@ -29,6 +29,11 @@
workspace subtree) and never enumerate blocked system roots. (`api/routes.py`, workspace subtree) and never enumerate blocked system roots. (`api/routes.py`,
`api/workspace.py`, `static/panels.js`, `static/style.css`) (partial for #616) `api/workspace.py`, `static/panels.js`, `static/style.css`) (partial for #616)
## [v0.50.179] — 2026-04-23
### Fixed
- **Onboarding wizard clobbering CLI users' config after server restart** — CLI-configured users (who set up via `hermes model` / `hermes auth`) had no `onboarding_completed` flag in `settings.json`. After a git branch switch or server restart, `verify_hermes_imports()` could momentarily return `imports_ok=False`, making `chat_ready=False` and causing the wizard to reappear with a destructive dropdown default (openrouter). Fixed by writing `onboarding_completed: True` to `settings.json` the first time `config_auto_completed` evaluates to `True`, so the flag survives future transient import failures. (`api/onboarding.py`) Co-authored by @bsgdigital.
## [v0.50.177] — 2026-04-23 ## [v0.50.177] — 2026-04-23
### Fixed ### Fixed

View File

@@ -435,6 +435,25 @@ def get_onboarding_status() -> dict:
config_exists = Path(_get_config_path()).exists() config_exists = Path(_get_config_path()).exists()
config_auto_completed = config_exists and bool(runtime.get("chat_ready")) config_auto_completed = config_exists and bool(runtime.get("chat_ready"))
# Persist the flag so it survives future transient import failures (e.g. after
# a git branch switch in the hermes-agent repo). Without this, a CLI-configured
# user who never ran the wizard has no onboarding_completed flag — any momentary
# imports_ok=False during restart makes chat_ready=False, config_auto_completed=False,
# and the wizard reappears with a broken dropdown that clobbers their config.
#
# Best-effort: if save_settings raises (read-only FS, disk full, permission error),
# log and continue. The `config_auto_completed` branch of `completed=` below still
# returns True for this request, so the user sees the correct state — only the
# persistence-across-restart guarantee is degraded. Raising here would turn every
# /api/onboarding/status call into a 500 until disk was writable, which is worse UX
# than losing the next-restart protection.
if config_auto_completed and not settings.get("onboarding_completed"):
try:
save_settings({"onboarding_completed": True})
settings["onboarding_completed"] = True
except Exception:
logger.debug("Failed to persist onboarding_completed", exc_info=True)
return { return {
"completed": bool(settings.get("onboarding_completed")) or auto_completed or config_auto_completed, "completed": bool(settings.get("onboarding_completed")) or auto_completed or config_auto_completed,
"settings": { "settings": {

View File

@@ -116,6 +116,50 @@ class TestOnboardingGate:
assert "config_exists" in result["system"] assert "config_exists" in result["system"]
assert result["system"]["config_exists"] is True assert result["system"]["config_exists"] is True
def test_persist_failure_does_not_break_status_endpoint(self):
    """save_settings() failure (read-only FS, disk full) must not turn the
    status endpoint into a 500. The persistence-across-restart guarantee
    degrades but `completed` still reflects the live `config_auto_completed`
    signal so the user isn't blocked from using the UI.

    The test also verifies that the persist attempt was actually made, so
    it cannot pass vacuously if the persist branch is never reached.
    """
    import api.onboarding as mod

    # Flag is absent/False and chat_ready is True, so the persist branch
    # in get_onboarding_status() is expected to run.
    settings = {"onboarding_completed": False}
    runtime = {
        "chat_ready": True,
        "provider_configured": True,
        "provider_ready": True,
        "setup_state": "ready",
        "provider_note": "test",
        "current_provider": "openrouter",
        "current_model": "anthropic/claude-sonnet-4.6",
        "current_base_url": None,
        "env_path": "/tmp/.hermes_test/.env",
    }
    fake_config_path = pathlib.Path("/tmp/_test_config.yaml")
    with (
        mock.patch.object(mod, "load_settings", return_value=settings),
        mock.patch.object(mod, "get_config", return_value={}),
        mock.patch.object(mod, "verify_hermes_imports", return_value=(True, [], {})),
        mock.patch.object(mod, "_status_from_runtime", return_value=runtime),
        mock.patch.object(mod, "load_workspaces", return_value=[]),
        mock.patch.object(mod, "get_last_workspace", return_value=None),
        mock.patch.object(mod, "get_available_models", return_value=[]),
        mock.patch.object(mod, "_get_config_path", return_value=fake_config_path),
        # Makes config_exists True without touching the real filesystem.
        mock.patch.object(pathlib.Path, "exists", return_value=True),
        mock.patch.object(
            mod, "save_settings", side_effect=OSError("read-only filesystem")
        ) as failing_save,
    ):
        # Must not raise — persistence failure is best-effort.
        result = mod.get_onboarding_status()
    # Guard against a vacuous pass: the failing write must have been attempted,
    # otherwise this test says nothing about the try/except around it.
    failing_save.assert_any_call({"onboarding_completed": True})
    # completed still reflects the live signal via config_auto_completed
    assert result["completed"] is True, (
        "Status endpoint must still return completed=True via the live "
        "config_auto_completed signal when persistence fails"
    )
class TestApplyOnboardingSetupGuard: class TestApplyOnboardingSetupGuard:
"""Fix #2: apply_onboarding_setup must not silently overwrite config.yaml.""" """Fix #2: apply_onboarding_setup must not silently overwrite config.yaml."""
@@ -303,20 +347,27 @@ class TestOnboardingGateIntegration:
# Write a fake API key so provider_ready (and thus chat_ready) fires # Write a fake API key so provider_ready (and thus chat_ready) fires
# — but only when hermes_cli imports are available # — but only when hermes_cli imports are available
data, _ = _http_get("/api/onboarding/status") data, _ = _http_get("/api/onboarding/status")
if data["system"]["hermes_found"] and data["system"]["imports_ok"]: try:
(hermes_home / ".env").write_text( if data["system"]["hermes_found"] and data["system"]["imports_ok"]:
"OPENROUTER_API_KEY=test-existing-key\n", encoding="utf-8" (hermes_home / ".env").write_text(
) "OPENROUTER_API_KEY=test-e...\n", encoding="utf-8"
data, status = _http_get("/api/onboarding/status") )
assert status == 200 data, status = _http_get("/api/onboarding/status")
assert data["completed"] is True, ( assert status == 200
"Existing config + chat_ready must auto-complete onboarding." assert data["completed"] is True, (
) "Existing config + chat_ready must auto-complete onboarding."
else: )
# Agent not installed: chat_ready is always False, so wizard still else:
# fires — that is the correct behaviour (can't verify readiness). # Agent not installed: chat_ready is always False, so wizard still
assert data["completed"] is False # fires — that is the correct behaviour (can't verify readiness).
assert data["completed"] is False
finally:
# Clean up: the auto-persist in get_onboarding_status() (#921) writes
# onboarding_completed=True to settings.json when config_auto_completed fires.
# Reset to avoid contaminating subsequent tests.
(hermes_home / "config.yaml").unlink(missing_ok=True)
(hermes_home / ".env").unlink(missing_ok=True)
_http_post("/api/settings", {"onboarding_completed": False})
@_needs_yaml @_needs_yaml
def test_setup_blocked_for_existing_config(self): def test_setup_blocked_for_existing_config(self):
"""POST /api/onboarding/setup must return config_exists error if config.yaml exists.""" """POST /api/onboarding/setup must return config_exists error if config.yaml exists."""
@@ -366,3 +417,7 @@ class TestOnboardingGateIntegration:
assert data.get("error") != "config_exists", ( assert data.get("error") != "config_exists", (
"confirm_overwrite=True must bypass the guard." "confirm_overwrite=True must bypass the guard."
) )
# Clean up so onboarding_completed=True left by this test's setup call
# does not contaminate subsequent tests (#921 test isolation).
(hermes_home / "config.yaml").unlink(missing_ok=True)
_http_post("/api/settings", {"onboarding_completed": False})