From cd01e4d5bab0e976baa0935b6b22ea63e58a7449 Mon Sep 17 00:00:00 2001
From: bergeouss <48155732+bergeouss@users.noreply.github.com>
Date: Thu, 23 Apr 2026 18:45:46 +0200
Subject: [PATCH] feat(models): live-first model fetching for all OpenAI-compat
 providers (#892)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat(models): live-first model fetching for all OpenAI-compat providers (#871)

The WebUI model picker relied on hardcoded _PROVIDER_MODELS as primary
source for providers like zai, minimax, mistralai, xai, openai-codex,
deepseek, and gemini. These lists go stale — new models don't appear
until someone manually updates the dict.

Add an OpenAI-compat /v1/models fetch fallback in _handle_live_models()
that fires when provider_model_ids() is unavailable or returns []. The
resolution chain is now:

  1. hermes_cli.provider_model_ids() (agent's live fetch)
  2. Custom providers from config.yaml
  3. Direct /v1/models fetch for known OpenAI-compat endpoints
  4. Static _PROVIDER_MODELS as last-resort offline fallback

Covers: zai, minimax, mistralai, xai, openai-codex, deepseek, gemini.

Uses urllib (stdlib) — no new dependencies. Static lists remain as
offline fallback so the UI always shows something.

Closes #871

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* refactor(models): address review feedback on live fetch (#892)

Five changes from nesquena-hermes review:

1. Move _OPENAI_COMPAT_ENDPOINTS to module level — avoid dict
   reconstruction per request
2. Document urllib blocking behavior — 8s timeout acceptable because
   server is threaded and frontend enriches in background
3. Add TODO comment for TTL-based caching follow-up
4. Remove openai-codex from endpoint map — same endpoint as base
   openai provider, already covered by provider_model_ids()
5. Restrict API key lookup to provider-scoped and model.api_key only
   — remove top-level api_key fallback to prevent cross-provider
   key leakage

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
---
 api/routes.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 65 insertions(+), 5 deletions(-)

diff --git a/api/routes.py b/api/routes.py
index ca5c78e..b8c0c05 100644
--- a/api/routes.py
+++ b/api/routes.py
@@ -24,6 +24,27 @@ _PROVIDER_ALIASES = {
     "openai-codex": "openai",
 }
 
+# OpenAI-compatible /v1/models endpoints for live model discovery.
+# Used as fallback when hermes_cli.provider_model_ids() is unavailable or
+# returns [] for a provider (#871).  Kept at module level so the dict is
+# built once, not reconstructed per request.
+_OPENAI_COMPAT_ENDPOINTS = {
+    "zai": "https://api.z.ai/v1",
+    "minimax": "https://api.minimax.chat/v1",
+    "mistralai": "https://api.mistral.ai/v1",
+    "xai": "https://api.x.ai/v1",
+    "deepseek": "https://api.deepseek.com/v1",
+    "gemini": "https://generativelanguage.googleapis.com/v1beta/openai",
+}
+# NOTE: "openai-codex" is excluded because it maps to the same endpoint as
+# the base "openai" provider (api.openai.com/v1).  When both are configured
+# the openai provider is already wired through provider_model_ids(); codex-
+# specific model filtering happens downstream in hermes_cli.
+#
+# TODO: Add TTL-based caching (e.g. 60s) so repeated model-list requests
+# don't hit provider APIs.  The frontend already caches via _liveModelCache
+# but the backend re-fetches on every /api/models/live call.
+
 from api.config import (
     STATE_DIR,
     SESSION_DIR,
@@ -2168,9 +2189,7 @@ def _handle_live_models(handler, parsed):
             ids = _pmi(provider)
         except Exception as _import_err:
             logger.debug("provider_model_ids import failed for %s: %s", provider, _import_err)
-            # Last resort: return the WebUI's own static catalog
-            from api.config import _PROVIDER_MODELS as _pm
-            ids = [m["id"] for m in _pm.get(provider, [])]
+            ids = []
 
         if not ids:
             # For 'custom' provider, provider_model_ids() returns [] because
@@ -2188,8 +2207,49 @@ def _handle_live_models(handler, parsed):
                         ]
                 except Exception:
                     pass
-            if not ids:
-                return j(handler, {"provider": provider, "models": [], "count": 0})
+
+        # ── OpenAI-compat live fetch fallback ──────────────────────────────────
+        # When provider_model_ids() is unavailable or returns [] for a provider
+        # that exposes a standard /v1/models endpoint, fetch directly.  This
+        # eliminates the need to keep _PROVIDER_MODELS in sync for providers
+        # that have a discoverable API (#871).
+        #
+        # WARNING: This uses synchronous urllib.request which blocks the worker
+        # thread for up to 8 seconds on timeout. This is acceptable because:
+        #  (a) the server uses threading (not async), so other requests continue;
+        #  (b) the frontend shows the static list immediately and enriches in
+        #      the background via _fetchLiveModels(), so the user never waits.
+        if not ids:
+            _ep = _OPENAI_COMPAT_ENDPOINTS.get(provider)
+            if _ep:
+                try:
+                    import urllib.request
+                    _providers_cfg = cfg.get("providers", {})
+                    _prov = _providers_cfg.get(provider, {}) if isinstance(_providers_cfg, dict) else {}
+                    # Only use provider-scoped key — never fall back to a top-level
+                    # api_key which may belong to a different provider.
+                    _key = _prov.get("api_key") if isinstance(_prov, dict) else None
+                    if not _key:
+                        _key = cfg.get("model", {}).get("api_key")
+                    if _key:
+                        _req = urllib.request.Request(
+                            f"{_ep}/models",
+                            headers={"Authorization": f"Bearer {_key}"},
+                        )
+                        with urllib.request.urlopen(_req, timeout=8) as _resp:
+                            _body = json.loads(_resp.read())
+                        ids = [m.get("id", "") for m in _body.get("data", []) if m.get("id")]
+                        logger.debug("Live-fetched %d models from %s /v1/models", len(ids), provider)
+                except Exception as _fetch_err:
+                    logger.debug("Live fetch from %s failed: %s", provider, _fetch_err)
+                    # Fall through to static list below
+
+        # Static fallback — only reached when live fetch also failed.
+        if not ids:
+            from api.config import _PROVIDER_MODELS as _pm
+            ids = [m["id"] for m in _pm.get(provider, [])]
+        if not ids:
+            return j(handler, {"provider": provider, "models": [], "count": 0})
 
         # Normalise to {id, label} — provider_model_ids() returns plain string IDs.
         # For ollama-cloud use the shared Ollama formatter (handles `:variant` suffix).