fix: modernize live tests and gemini ids

2026-01-12 06:58:31 +00:00 · 2026-01-12 06:58:31 +00:00 · 1850013cae
commit 1850013cae
parent 79cbb20988
11 changed files with 1053 additions and 593 deletions
--- a/docs/concepts/model-providers.md
+++ b/docs/concepts/model-providers.md
@ -76,7 +76,7 @@ Clawdbot ships with the pi‑ai catalog. These providers require **no**
 - Provider: `google`
 - Auth: `GEMINI_API_KEY`
- Example model: `google/gemini-3-pro`
+- Example model: `google/gemini-3-pro-preview`
 - CLI: `clawdbot onboard --auth-choice gemini-api-key`
 ### Google Vertex / Antigravity / Gemini CLI
--- a/docs/testing.md
+++ b/docs/testing.md
@ -25,9 +25,8 @@ When you touch tests or want extra confidence:
 - Coverage gate: `pnpm test:coverage`
 - E2E suite: `pnpm test:e2e`
-When debugging real providers/models (requires real creds; skipped by default):
+When debugging real providers/models (requires real creds):
- Live suite (models only): `CLAWDBOT_LIVE_TEST=1 pnpm test:live`
+- Live suite (models + gateway tool/image probes): `pnpm test:live`
 - Live suite (models + providers): `LIVE=1 pnpm test:live`
 Tip: when you only need one failing case, prefer narrowing live tests via the allowlist env vars described below.
@ -67,7 +66,7 @@ Think of the suites as “increasing realism” (and increasing flakiness/cost):
 - Command: `pnpm test:live`
 - Config: `vitest.live.config.ts`
 - Files: `src/**/*.live.test.ts`
- Default: **skipped** unless `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
+- Default: **enabled** by `pnpm test:live` (sets `CLAWDBOT_LIVE_TEST=1`)
 - Scope:
  - “Does this provider/model actually work *today* with real creds?”
  - Catch provider format changes, tool-calling quirks, auth issues, and rate limit behavior
@ -75,6 +74,8 @@ Think of the suites as “increasing realism” (and increasing flakiness/cost):
  - Not CI-stable by design (real networks, real provider policies, quotas, outages)
  - Costs money / uses rate limits
  - Prefer running narrowed subsets instead of “everything”
  - Live runs will source `~/.profile` to pick up missing API keys
  - Anthropic key rotation: set `CLAWDBOT_LIVE_ANTHROPIC_KEYS="sk-...,sk-..."` (or `CLAWDBOT_LIVE_ANTHROPIC_KEY=sk-...`) or multiple `ANTHROPIC_API_KEY*` vars; tests will retry on rate limits
 ## Which suite should I run?
@ -97,10 +98,11 @@ Live tests are split into two layers so we can isolate failures:
  - Use `getApiKeyForModel` to select models you have creds for
  - Run a small completion per model (and targeted regressions where needed)
 - How to enable:
-  - `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
+  - `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly)
-  - `CLAWDBOT_LIVE_ALL_MODELS=1` (required for this test to run)
+- Set `CLAWDBOT_LIVE_MODELS=modern` (or `all`, alias for modern) to actually run this suite; otherwise it skips to keep `pnpm test:live` focused on gateway smoke
 - How to select models:
-  - `CLAWDBOT_LIVE_MODELS=all` to run everything with keys
+  - `CLAWDBOT_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet/Haiku 4.5, GPT-5.x + Codex, Gemini 3, GLM 4.7, MiniMax M2.1, Grok 4)
  - `CLAWDBOT_LIVE_MODELS=all` is an alias for the modern allowlist
  - or `CLAWDBOT_LIVE_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,..."` (comma allowlist)
 - How to select providers:
  - `CLAWDBOT_LIVE_PROVIDERS="google,google-antigravity,google-gemini-cli"` (comma allowlist)
@ -128,18 +130,16 @@ Live tests are split into two layers so we can isolate failures:
  - image probe: the test attaches a generated PNG (cat + randomized code) and expects the model to return `cat <CODE>`.
  - Implementation reference: `src/gateway/gateway-models.profiles.live.test.ts` and `src/gateway/live-image-probe.ts`.
 - How to enable:
-  - `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
+  - `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly)
  - `CLAWDBOT_LIVE_GATEWAY=1` (required for this test to run)
 - How to select models:
-  - `CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1` to scan all discovered models with keys
+  - Default: modern allowlist (Opus/Sonnet/Haiku 4.5, GPT-5.x + Codex, Gemini 3, GLM 4.7, MiniMax M2.1, Grok 4)
-  - or set `CLAWDBOT_LIVE_GATEWAY_MODELS="provider/model,provider/model,..."` to narrow quickly
+  - `CLAWDBOT_LIVE_GATEWAY_MODELS=all` is an alias for the modern allowlist
  - Or set `CLAWDBOT_LIVE_GATEWAY_MODELS="provider/model"` (or comma list) to narrow
 - How to select providers (avoid “OpenRouter everything”):
  - `CLAWDBOT_LIVE_GATEWAY_PROVIDERS="google,google-antigravity,google-gemini-cli,openai,anthropic,zai,minimax"` (comma allowlist)
- Optional tool-calling stress:
+- Tool + image probes are always on in this live test:
-  - `CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1` enables an extra “exec writes file → read reads it back → echo nonce” check.
+  - `read` probe + `exec+read` probe (tool stress)
-  - This is specifically meant to catch tool-calling compatibility issues across providers (formatting, history replay, tool_result pairing, etc.).
+  - image probe runs when the model advertises image input support
 - Optional image send smoke:
  - `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` sends a real image attachment through the gateway agent pipeline (multimodal message) and asserts the model can read back a per-run code from the image.
  - Flow (high level):
    - Test generates a tiny PNG with “CAT” + random code (`src/gateway/live-image-probe.ts`)
    - Sends it via `agent` `attachments: [{ mimeType: "image/png", content: "<base64>" }]`
@ -159,7 +159,7 @@ pnpm clawdbot models list --json
 - Test: `src/agents/anthropic.setup-token.live.test.ts`
 - Goal: verify Claude CLI setup-token (or a pasted setup-token profile) can complete an Anthropic prompt.
 - Enable:
-  - `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
+  - `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly)
  - `CLAWDBOT_LIVE_SETUP_TOKEN=1`
 - Token sources (pick one):
  - Profile: `CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test`
@ -171,7 +171,7 @@ Setup example:
 ```bash
 clawdbot models auth paste-token --provider anthropic --profile-id anthropic:setup-token-test
-CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test pnpm test:live src/agents/anthropic.setup-token.live.test.ts
+CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test pnpm test:live src/agents/anthropic.setup-token.live.test.ts
 ```
 ## Live: CLI backend smoke (Claude CLI or other local CLIs)
@ -179,7 +179,7 @@ CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFI
 - Test: `src/gateway/gateway-cli-backend.live.test.ts`
 - Goal: validate the Gateway + agent pipeline using a local CLI backend, without touching your default config.
 - Enable:
-  - `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
+  - `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly)
  - `CLAWDBOT_LIVE_CLI_BACKEND=1`
 - Defaults:
  - Model: `claude-cli/claude-sonnet-4-5`
@ -200,7 +200,7 @@ CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFI
 Example:
 ```bash
-CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_CLI_BACKEND=1 \
+CLAWDBOT_LIVE_CLI_BACKEND=1 \
  CLAWDBOT_LIVE_CLI_BACKEND_MODEL="claude-cli/claude-sonnet-4-5" \
  pnpm test:live src/gateway/gateway-cli-backend.live.test.ts
 ```
@ -210,17 +210,17 @@ CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_CLI_BACKEND=1 \
 Narrow, explicit allowlists are fastest and least flaky:
 - Single model, direct (no gateway):
-  - `CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_ALL_MODELS=1 CLAWDBOT_LIVE_MODELS="openai/gpt-5.2" pnpm test:live src/agents/models.profiles.live.test.ts`
+  - `CLAWDBOT_LIVE_MODELS="openai/gpt-5.2" pnpm test:live src/agents/models.profiles.live.test.ts`
 - Single model, gateway smoke:
-  - `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
+  - `CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
- Tool calling across several providers (exec + read probe):
+- Tool calling across several providers:
-  - `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-flash,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
+  - `CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-flash-preview,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
 - Google focus (Gemini API key + Antigravity):
-  - Gemini (API key): `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="google/gemini-3-flash" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
+  - Gemini (API key): `CLAWDBOT_LIVE_GATEWAY_MODELS="google/gemini-3-flash-preview" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
-  - Antigravity (OAuth): `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-pro-high" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
+  - Antigravity (OAuth): `CLAWDBOT_LIVE_GATEWAY_MODELS="google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-pro-high" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
 Notes:
 - `google/...` uses the Gemini API (API key).
@ -240,20 +240,20 @@ This is the “common models” run we expect to keep working:
 - OpenAI (non-Codex): `openai/gpt-5.2` (optional: `openai/gpt-5.1`)
 - OpenAI Codex: `openai-codex/gpt-5.2` (optional: `openai-codex/gpt-5.2-codex`)
 - Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`)
- Google (Gemini API): `google/gemini-3-pro` and `google/gemini-3-flash` (avoid older Gemini 2.x models)
+- Google (Gemini API): `google/gemini-3-pro-preview` and `google/gemini-3-flash-preview` (avoid older Gemini 2.x models)
 - Google (Antigravity): `google-antigravity/claude-opus-4-5-thinking` and `google-antigravity/gemini-3-flash`
 - Z.AI (GLM): `zai/glm-4.7`
 - MiniMax: `minimax/minimax-m2.1`
 Run gateway smoke with tools + image:
-`LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-pro,google/gemini-3-flash,google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
+`CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
 ### Baseline: tool calling (Read + Exec)
 Pick at least one per provider family:
 - OpenAI: `openai/gpt-5.2` (or `openai/gpt-5-mini`)
 - Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`)
- Google: `google/gemini-3-flash` (or `google/gemini-3-pro`)
+- Google: `google/gemini-3-flash-preview` (or `google/gemini-3-pro-preview`)
 - Z.AI (GLM): `zai/glm-4.7`
 - MiniMax: `minimax/minimax-m2.1`
@ -265,7 +265,7 @@ Optional additional coverage (nice to have):
 ### Vision: image send (attachment → multimodal message)
-Run with `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` and include at least one image-capable model in `CLAWDBOT_LIVE_GATEWAY_MODELS` (Claude/Gemini/OpenAI vision-capable variants, etc.).
+Include at least one image-capable model in `CLAWDBOT_LIVE_GATEWAY_MODELS` (Claude/Gemini/OpenAI vision-capable variants, etc.) to exercise the image probe.
 ### Aggregators / alternate gateways
--- a/package.json
+++ b/package.json
@ -97,7 +97,7 @@
    "test:force": "tsx scripts/test-force.ts",
    "test:coverage": "vitest run --coverage",
    "test:e2e": "vitest run --config vitest.e2e.config.ts",
-    "test:live": "vitest run --config vitest.live.config.ts",
+    "test:live": "CLAWDBOT_LIVE_TEST=1 vitest run --config vitest.live.config.ts",
    "test:docker:onboard": "bash scripts/e2e/onboard-docker.sh",
    "test:docker:gateway-network": "bash scripts/e2e/gateway-network-docker.sh",
    "test:docker:live-models": "bash scripts/test-live-models-docker.sh",
--- a/scripts/test-live-gateway-models-docker.sh
+++ b/scripts/test-live-gateway-models-docker.sh
@ -20,10 +20,6 @@ docker run --rm -t \
  --entrypoint bash \
  -e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \
  -e HOME=/home/node \
  -e CLAWDBOT_LIVE_TEST=1 \
  -e CLAWDBOT_LIVE_GATEWAY=1 \
  -e CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 \
  -e CLAWDBOT_LIVE_GATEWAY_MODELS="${CLAWDBOT_LIVE_GATEWAY_MODELS:-all}" \
  -v "$CONFIG_DIR":/home/node/.clawdbot \
  -v "$WORKSPACE_DIR":/home/node/clawd \
  "${PROFILE_MOUNT[@]}" \
--- a/src/agents/live-auth-keys.ts
+++ b/src/agents/live-auth-keys.ts
@ -0,0 +1,50 @@
 /** Separator between keys: any run of whitespace, commas, or semicolons. */
 const KEY_SPLIT_RE = /[\s,;]+/g;
 /**
  * Breaks a delimited key string into its individual entries.
  *
  * @param raw - Keys separated by whitespace, `,` or `;`; may be absent.
  * @returns The trimmed, non-empty entries in their original order, or an
  *   empty array when `raw` is null, undefined, or empty.
  */
 function parseKeyList(raw?: string | null): string[] {
  const keys: string[] = [];
  if (raw) {
    for (const piece of raw.split(KEY_SPLIT_RE)) {
      const key = piece.trim();
      // Leading/trailing separators produce empty pieces; drop them.
      if (key) keys.push(key);
    }
  }
  return keys;
 }
 /**
  * Gathers the values of every environment variable whose name starts with
  * `prefix`.
  *
  * @param prefix - Case-sensitive name prefix to match against `process.env`.
  * @returns Trimmed values of the matching variables; variables that are unset
  *   or contain only whitespace are left out.
  */
 function collectEnvPrefixedKeys(prefix: string): string[] {
  return Object.keys(process.env)
    .filter((name) => name.startsWith(prefix))
    .map((name) => process.env[name]?.trim())
    .filter((value): value is string => Boolean(value));
 }
 /**
  * Resolves the Anthropic API keys to use, in priority order and without
  * duplicates.
  *
  * If `CLAWDBOT_LIVE_ANTHROPIC_KEY` is set it is returned as the only key.
  * Otherwise keys are taken, in this order, from the
  * `CLAWDBOT_LIVE_ANTHROPIC_KEYS` list, from `ANTHROPIC_API_KEY`, and from
  * every environment variable whose name starts with `ANTHROPIC_API_KEY`.
  *
  * @returns The distinct keys, first occurrence first; empty when none are set.
  */
 export function collectAnthropicApiKeys(): string[] {
  const override = process.env.CLAWDBOT_LIVE_ANTHROPIC_KEY?.trim();
  if (override) return [override];
  const ordered = [
    ...parseKeyList(process.env.CLAWDBOT_LIVE_ANTHROPIC_KEYS),
    process.env.ANTHROPIC_API_KEY?.trim(),
    ...collectEnvPrefixedKeys("ANTHROPIC_API_KEY"),
  ];
  // A Set keeps first-insertion order, which gives de-duplication for free.
  const unique = new Set<string>();
  for (const key of ordered) {
    if (key) unique.add(key);
  }
  return [...unique];
 }
 /**
  * Reports whether an error message looks like a rate-limit response.
  *
  * The check is a case-insensitive substring match on `rate_limit`,
  * `rate limit`, or `429`.
  *
  * @param message - Error text to inspect.
  * @returns `true` when any of the markers occurs in `message`.
  */
 export function isAnthropicRateLimitError(message: string): boolean {
  const haystack = message.toLowerCase();
  const markers = ["rate_limit", "rate limit", "429"];
  return markers.some((marker) => haystack.includes(marker));
 }
--- a/src/agents/live-model-filter.ts
+++ b/src/agents/live-model-filter.ts
@ -0,0 +1,89 @@
 /**
  * Provider/model-id pair accepted by `isModernModelRef`.
  * Both fields may be missing or null; such refs are treated as not modern.
  */
 export type ModelRef = {
  provider?: string | null;
  id?: string | null;
 };
 // Allowlists consulted by isModernModelRef. Ids are compared after
 // lower-casing, so every entry here is lower-case.

 // Claude ids; matched as prefixes for `anthropic` and `google-antigravity`.
 const ANTHROPIC_PREFIXES = [
  "claude-opus-4-5",
  "claude-sonnet-4-5",
  "claude-haiku-4-5",
 ];
 // Ids for the `openai` provider (exact or prefix match).
 const OPENAI_MODELS = ["gpt-5.2", "gpt-5.0"];
 // Ids for the `openai-codex` provider (exact or prefix match).
 const CODEX_MODELS = [
  "gpt-5.2",
  "gpt-5.2-codex",
  "gpt-5.1-codex",
  "gpt-5.1-codex-mini",
  "gpt-5.1-codex-max",
 ];
 // Prefix for `google`, `google-gemini-cli` and `google-antigravity` ids.
 const GOOGLE_PREFIXES = ["gemini-3"];
 // Prefix for `zai` ids.
 const ZAI_PREFIXES = ["glm-4.7"];
 // Prefix for `minimax` ids.
 const MINIMAX_PREFIXES = ["minimax-m2.1"];
 // Prefix for `xai` ids.
 const XAI_PREFIXES = ["grok-4"];
 /**
  * Tests whether `id` begins with at least one of `prefixes`.
  *
  * @param id - Model id to test.
  * @param prefixes - Candidate prefixes.
  * @returns `true` on the first prefix that matches, otherwise `false`.
  */
 function matchesPrefix(id: string, prefixes: string[]): boolean {
  for (const candidate of prefixes) {
    if (id.startsWith(candidate)) return true;
  }
  return false;
 }
 /**
  * Tests whether `id` equals, or begins with, at least one of `values`.
  *
  * @param id - Model id to test.
  * @param values - Candidate ids.
  * @returns `true` on the first candidate that matches, otherwise `false`.
  */
 function matchesExactOrPrefix(id: string, values: string[]): boolean {
  for (const candidate of values) {
    if (id === candidate) return true;
    if (id.startsWith(candidate)) return true;
  }
  return false;
 }
 /**
  * Tests whether `id` contains at least one of `values` as a substring.
  *
  * @param id - Model id to test.
  * @param values - Candidate substrings.
  * @returns `true` on the first candidate found anywhere in `id`.
  */
 function matchesAny(id: string, values: string[]): boolean {
  for (const needle of values) {
    if (id.includes(needle)) return true;
  }
  return false;
 }
 /**
  * Decides whether a provider/model pair belongs to the "modern" allowlist.
  *
  * Provider and id are trimmed and lower-cased before comparison. Each known
  * provider is checked against its own allowlist; `openrouter` and `opencode`
  * accept an id that contains any allowlisted entry as a substring.
  *
  * @param ref - Provider and model id to classify.
  * @returns `true` when the pair is allowlisted; `false` for unknown providers
  *   or when either field is missing or blank.
  */
 export function isModernModelRef(ref: ModelRef): boolean {
  const provider = (ref.provider ?? "").trim().toLowerCase();
  const id = (ref.id ?? "").trim().toLowerCase();
  if (provider === "" || id === "") return false;

  switch (provider) {
    case "anthropic":
      return matchesPrefix(id, ANTHROPIC_PREFIXES);
    case "openai":
      return matchesExactOrPrefix(id, OPENAI_MODELS);
    case "openai-codex":
      return matchesExactOrPrefix(id, CODEX_MODELS);
    case "google":
    case "google-gemini-cli":
      return matchesPrefix(id, GOOGLE_PREFIXES);
    case "google-antigravity":
      // Antigravity serves both Gemini and Claude ids.
      return (
        matchesPrefix(id, GOOGLE_PREFIXES) ||
        matchesPrefix(id, ANTHROPIC_PREFIXES)
      );
    case "zai":
      return matchesPrefix(id, ZAI_PREFIXES);
    case "minimax":
      return matchesPrefix(id, MINIMAX_PREFIXES);
    case "xai":
      return matchesPrefix(id, XAI_PREFIXES);
    case "openrouter":
    case "opencode":
      // Substring match against every allowlist, not a prefix match.
      return matchesAny(id, [
        ...ANTHROPIC_PREFIXES,
        ...OPENAI_MODELS,
        ...CODEX_MODELS,
        ...GOOGLE_PREFIXES,
        ...ZAI_PREFIXES,
        ...MINIMAX_PREFIXES,
        ...XAI_PREFIXES,
      ]);
    default:
      return false;
  }
 }
--- a/src/agents/models-config.test.ts
+++ b/src/agents/models-config.test.ts
@ -117,4 +117,59 @@ describe("models config", () => {
      );
    });
  });
  // Checks that bare Gemini 3 ids configured under the `google` provider are
  // written to models.json with the `-preview` suffix.
  it("normalizes gemini 3 ids to preview for google providers", async () => {
    await withTempHome(async () => {
      // Reset the module registry, then import the modules under test afresh.
      vi.resetModules();
      const { ensureClawdbotModelsJson } = await import("./models-config.js");
      const { resolveClawdbotAgentDir } = await import("./agent-paths.js");
      // Config deliberately uses the un-suffixed ids.
      const cfg: ClawdbotConfig = {
        models: {
          providers: {
            google: {
              baseUrl: "https://generativelanguage.googleapis.com/v1beta",
              apiKey: "GEMINI_KEY",
              api: "google-generative-ai",
              models: [
                {
                  id: "gemini-3-pro",
                  name: "Gemini 3 Pro",
                  api: "google-generative-ai",
                  reasoning: true,
                  input: ["text", "image"],
                  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
                  contextWindow: 1048576,
                  maxTokens: 65536,
                },
                {
                  id: "gemini-3-flash",
                  name: "Gemini 3 Flash",
                  api: "google-generative-ai",
                  reasoning: false,
                  input: ["text", "image"],
                  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
                  contextWindow: 1048576,
                  maxTokens: 65536,
                },
              ],
            },
          },
        },
      };
      await ensureClawdbotModelsJson(cfg);
      // Read back the models.json written into the agent dir.
      const modelPath = path.join(resolveClawdbotAgentDir(), "models.json");
      const raw = await fs.readFile(modelPath, "utf8");
      const parsed = JSON.parse(raw) as {
        providers: Record<string, { models: Array<{ id: string }> }>;
      };
      const ids = parsed.providers.google?.models?.map((model) => model.id);
      // Order must match the config; both ids gain the `-preview` suffix.
      expect(ids).toEqual([
        "gemini-3-pro-preview",
        "gemini-3-flash-preview",
      ]);
    });
  });
 });
--- a/src/agents/models-config.ts
+++ b/src/agents/models-config.ts
@ -5,6 +5,7 @@ import { type ClawdbotConfig, loadConfig } from "../config/config.js";
 import { resolveClawdbotAgentDir } from "./agent-paths.js";
 type ModelsConfig = NonNullable<ClawdbotConfig["models"]>;
 type ProviderConfig = NonNullable<ModelsConfig["providers"]>[string];
 const DEFAULT_MODE: NonNullable<ModelsConfig["mode"]> = "merge";
@ -12,6 +13,38 @@ function isRecord(value: unknown): value is Record<string, unknown> {
  return Boolean(value && typeof value === "object" && !Array.isArray(value));
 }
 /**
  * Maps the bare Gemini 3 ids to their `-preview` ids.
  *
  * @param id - Model id as configured.
  * @returns `gemini-3-pro-preview` / `gemini-3-flash-preview` for the two bare
  *   ids; any other id is returned unchanged.
  */
 function normalizeGoogleModelId(id: string): string {
  switch (id) {
    case "gemini-3-pro":
      return "gemini-3-pro-preview";
    case "gemini-3-flash":
      return "gemini-3-flash-preview";
    default:
      return id;
  }
 }
 /**
  * Rewrites bare Gemini 3 model ids on a provider entry to their `-preview`
  * form.
  *
  * @param provider - Provider entry whose `models` should be normalized.
  * @returns A shallow copy with the rewritten `models` when at least one id
  *   changed; otherwise the very same `provider` object, so callers can detect
  *   a no-op by identity.
  */
 function normalizeGoogleProvider(provider: ProviderConfig): ProviderConfig {
  // Defensive: an entry without a `models` array is passed through untouched
  // rather than throwing on `.map` (presumably possible when the entry did not
  // come from a validated config; verify against callers).
  if (!Array.isArray(provider.models)) return provider;
  let mutated = false;
  const models = provider.models.map((model) => {
    const nextId = normalizeGoogleModelId(model.id);
    // Keep the original model object when its id is already normalized.
    if (nextId === model.id) return model;
    mutated = true;
    return { ...model, id: nextId };
  });
  return mutated ? { ...provider, models } : provider;
 }
 /**
  * Applies provider-specific normalization to a providers map. Only the
  * `google` entry is normalized; every other entry is carried over as-is.
  *
  * @param providers - Providers map, possibly undefined.
  * @returns A new map when some entry was replaced by normalization; otherwise
  *   the `providers` argument itself (including when it is falsy).
  */
 function normalizeProviders(
  providers: ModelsConfig["providers"],
 ): ModelsConfig["providers"] {
  if (!providers) return providers;
  const result: Record<string, ProviderConfig> = {};
  let changed = false;
  Object.entries(providers).forEach(([name, original]) => {
    const candidate =
      name === "google" ? normalizeGoogleProvider(original) : original;
    // Identity comparison: the normalizer returns its input when unchanged.
    changed = changed || candidate !== original;
    result[name] = candidate;
  });
  return changed ? result : providers;
 }
 async function readJson(pathname: string): Promise<unknown> {
  try {
    const raw = await fs.readFile(pathname, "utf8");
@ -53,7 +86,8 @@ export async function ensureClawdbotModelsJson(
    }
  }
-  const next = `${JSON.stringify({ providers: mergedProviders }, null, 2)}\n`;
+  const normalizedProviders = normalizeProviders(mergedProviders);
  const next = `${JSON.stringify({ providers: normalizedProviders }, null, 2)}\n`;
  try {
    existingRaw = await fs.readFile(targetPath, "utf8");
  } catch {
--- a/src/agents/models.profiles.live.test.ts
+++ b/src/agents/models.profiles.live.test.ts
@ -7,24 +7,20 @@ import { Type } from "@sinclair/typebox";
 import { describe, expect, it } from "vitest";
 import { loadConfig } from "../config/config.js";
 import { resolveClawdbotAgentDir } from "./agent-paths.js";
 import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
 import { getApiKeyForModel } from "./model-auth.js";
 import {
-  buildModelAliasIndex,
+  collectAnthropicApiKeys,
-  parseModelRef,
+  isAnthropicRateLimitError,
-  resolveConfiguredModelRef,
+} from "./live-auth-keys.js";
-  resolveModelRefFromString,
+import { isModernModelRef } from "./live-model-filter.js";
-} from "./model-selection.js";
+import { getApiKeyForModel } from "./model-auth.js";
 import { ensureClawdbotModelsJson } from "./models-config.js";
 const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
-const ALL_MODELS =
+const DIRECT_ENABLED = Boolean(process.env.CLAWDBOT_LIVE_MODELS?.trim());
  process.env.CLAWDBOT_LIVE_ALL_MODELS === "1" ||
  process.env.CLAWDBOT_LIVE_MODELS === "all";
 const REQUIRE_PROFILE_KEYS =
  process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1";
-const describeLive = LIVE && ALL_MODELS ? describe : describe.skip;
+const describeLive = LIVE ? describe : describe.skip;
 function parseProviderFilter(raw?: string): Set<string> | null {
  const trimmed = raw?.trim();
@ -46,6 +42,10 @@ function parseModelFilter(raw?: string): Set<string> | null {
  return ids.length ? new Set(ids) : null;
 }
 /**
  * Prints a progress line to stdout, prefixed with `[live] `.
  *
  * @param message - Text to print after the prefix.
  */
 function logProgress(message: string): void {
  const line = "[live] " + message;
  console.log(line);
 }
 function isGoogleModelNotFoundError(err: unknown): boolean {
  const msg = String(err);
  if (!/not found/i.test(msg)) return false;
@ -127,75 +127,25 @@ async function completeOkWithRetry(params: {
  return await runOnce();
 }
 /**
  * Builds the ordered, de-duplicated list of `provider/model` keys referenced
  * by the config.
  *
  * Keys are appended in this order: the configured default model ref, the
  * `agents.defaults.model` primary and fallbacks, the
  * `agents.defaults.imageModel` primary and fallbacks, then every key of
  * `agents.defaults.models`. Entries that fail to resolve or parse are skipped.
  *
  * @param cfg - Loaded config.
  * @returns Keys in first-seen order.
  */
 function resolveConfiguredModelKeys(
  cfg: ReturnType<typeof loadConfig>,
 ): string[] {
  const aliasIndex = buildModelAliasIndex({
    cfg,
    defaultProvider: DEFAULT_PROVIDER,
  });
  // `order` preserves first-seen order; `seen` guards against duplicates.
  const order: string[] = [];
  const seen = new Set<string>();
  const addKey = (key: string) => {
    const normalized = key.trim();
    if (!normalized || seen.has(normalized)) return;
    seen.add(normalized);
    order.push(normalized);
  };
  const addRef = (ref: { provider: string; model: string }) => {
    addKey(`${ref.provider}/${ref.model}`);
  };
  // The configured default model always comes first.
  addRef(
    resolveConfiguredModelRef({
      cfg,
      defaultProvider: DEFAULT_PROVIDER,
      defaultModel: DEFAULT_MODEL,
    }),
  );
  const modelConfig = cfg.agents?.defaults?.model as
    | { primary?: string; fallbacks?: string[] }
    | undefined;
  const imageModelConfig = cfg.agents?.defaults?.imageModel as
    | { primary?: string; fallbacks?: string[] }
    | undefined;
  const primary = modelConfig?.primary?.trim() ?? "";
  const fallbacks = modelConfig?.fallbacks ?? [];
  const imagePrimary = imageModelConfig?.primary?.trim() ?? "";
  const imageFallbacks = imageModelConfig?.fallbacks ?? [];
  // Resolves a raw string via the alias index; unresolved strings are dropped.
  const addRaw = (raw: string) => {
    const resolved = resolveModelRefFromString({
      raw,
      defaultProvider: DEFAULT_PROVIDER,
      aliasIndex,
    });
    if (resolved) addRef(resolved.ref);
  };
  if (primary) addRaw(primary);
  for (const raw of fallbacks) addRaw(String(raw ?? ""));
  if (imagePrimary) addRaw(imagePrimary);
  for (const raw of imageFallbacks) addRaw(String(raw ?? ""));
  // Keys of the models map are parsed directly, without the alias index.
  for (const key of Object.keys(cfg.agents?.defaults?.models ?? {})) {
    const parsed = parseModelRef(String(key ?? ""), DEFAULT_PROVIDER);
    if (parsed) addRef(parsed);
  }
  return order;
 }
 describeLive("live models (profile keys)", () => {
  it(
-    "completes across configured models",
+    "completes across selected models",
    async () => {
      const cfg = loadConfig();
      await ensureClawdbotModelsJson(cfg);
      if (!DIRECT_ENABLED) {
        logProgress(
          "[live-models] skipping (set CLAWDBOT_LIVE_MODELS=modern|all|<list>; all=modern)",
        );
        return;
      }
      const anthropicKeys = collectAnthropicApiKeys();
      if (anthropicKeys.length > 0) {
        process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
        logProgress(
          `[live-models] anthropic keys loaded: ${anthropicKeys.length}`,
        );
      }
      const agentDir = resolveClawdbotAgentDir();
      const authStorage = discoverAuthStorage(agentDir);
@ -205,7 +155,11 @@ describeLive("live models (profile keys)", () => {
        models.map((model) => [`${model.provider}/${model.id}`, model]),
      );
-      const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);
+      const rawModels = process.env.CLAWDBOT_LIVE_MODELS?.trim();
      const useModern = rawModels === "modern" || rawModels === "all";
      const useExplicit = Boolean(rawModels) && !useModern;
      const filter = useExplicit ? parseModelFilter(rawModels) : null;
      const allowNotFoundSkip = useModern;
      const providers = parseProviderFilter(
        process.env.CLAWDBOT_LIVE_PROVIDERS,
      );
@ -216,149 +170,196 @@ describeLive("live models (profile keys)", () => {
      const failures: Array<{ model: string; error: string }> = [];
      const skipped: Array<{ model: string; reason: string }> = [];
      const candidates: Array<{
        model: Model<Api>;
        apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
      }> = [];
-      const configuredKeys = resolveConfiguredModelKeys(cfg);
+      for (const model of models) {
      for (const key of configuredKeys) {
        const model = modelByKey.get(key);
        if (!model) {
          skipped.push({
            model: key,
            reason: "configured model missing in registry",
          });
          continue;
        }
        if (providers && !providers.has(model.provider)) continue;
        const id = `${model.provider}/${model.id}`;
        if (filter && !filter.has(id)) continue;
-
+        if (!filter && useModern) {
-        let apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
+          if (!isModernModelRef({ provider: model.provider, id: model.id })) {
        try {
          apiKeyInfo = await getApiKeyForModel({ model, cfg });
        } catch (err) {
          skipped.push({ model: id, reason: String(err) });
          continue;
        }
        if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) {
          skipped.push({
            model: id,
            reason: `non-profile credential source: ${apiKeyInfo.source}`,
          });
          continue;
        }
        try {
          // Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
          if (
            model.provider === "openai" &&
            model.api === "openai-responses" &&
            model.id === "gpt-5.2"
          ) {
            const noopTool = {
              name: "noop",
              description: "Return ok.",
              parameters: Type.Object({}, { additionalProperties: false }),
            };
            const first = await completeSimpleWithTimeout(
              model,
              {
                messages: [
                  {
                    role: "user",
                    content:
                      "Call the tool `noop` with {}. Do not write any other text.",
                    timestamp: Date.now(),
                  },
                ],
                tools: [noopTool],
              },
              {
                apiKey: apiKeyInfo.apiKey,
                reasoning: model.reasoning ? "low" : undefined,
                maxTokens: 128,
              },
              perModelTimeoutMs,
            );
            const toolCall = first.content.find((b) => b.type === "toolCall");
            expect(toolCall).toBeTruthy();
            if (!toolCall || toolCall.type !== "toolCall") {
              throw new Error("expected tool call");
            }
            const second = await completeSimpleWithTimeout(
              model,
              {
                messages: [
                  {
                    role: "user",
                    content:
                      "Call the tool `noop` with {}. Do not write any other text.",
                    timestamp: Date.now(),
                  },
                  first,
                  {
                    role: "toolResult",
                    toolCallId: toolCall.id,
                    toolName: "noop",
                    content: [{ type: "text", text: "ok" }],
                    isError: false,
                    timestamp: Date.now(),
                  },
                  {
                    role: "user",
                    content: "Reply with the word ok.",
                    timestamp: Date.now(),
                  },
                ],
              },
              {
                apiKey: apiKeyInfo.apiKey,
                reasoning: model.reasoning ? "low" : undefined,
                maxTokens: 64,
              },
              perModelTimeoutMs,
            );
            const secondText = second.content
              .filter((b) => b.type === "text")
              .map((b) => b.text.trim())
              .join(" ");
            expect(secondText.length).toBeGreaterThan(0);
            continue;
          }
-
+        }
-          const ok = await completeOkWithRetry({
+        try {
-            model,
+          const apiKeyInfo = await getApiKeyForModel({ model, cfg });
-            apiKey: apiKeyInfo.apiKey,
+          if (
-            timeoutMs: perModelTimeoutMs,
+            REQUIRE_PROFILE_KEYS &&
-          });
+            !apiKeyInfo.source.startsWith("profile:")
-
+          ) {
          if (ok.res.stopReason === "error") {
            const msg = ok.res.errorMessage ?? "";
            if (ALL_MODELS && isModelNotFoundErrorMessage(msg)) {
              skipped.push({ model: id, reason: msg });
              continue;
            }
            throw new Error(msg || "model returned error with no message");
          }
          if (ok.text.length === 0 && model.provider === "google") {
            skipped.push({
              model: id,
-              reason: "no text returned (likely unavailable model id)",
+              reason: `non-profile credential source: ${apiKeyInfo.source}`,
            });
            continue;
          }
-          expect(ok.text.length).toBeGreaterThan(0);
+          candidates.push({ model, apiKeyInfo });
        } catch (err) {
-          if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
+          skipped.push({ model: id, reason: String(err) });
-            skipped.push({ model: id, reason: String(err) });
+        }
-            continue;
+      }
      if (candidates.length === 0) {
        logProgress("[live-models] no API keys found; skipping");
        return;
      }
      logProgress(
        `[live-models] selection=${useExplicit ? "explicit" : "modern"}`,
      );
      logProgress(`[live-models] running ${candidates.length} models`);
      const total = candidates.length;
      for (const [index, entry] of candidates.entries()) {
        const { model, apiKeyInfo } = entry;
        const id = `${model.provider}/${model.id}`;
        const progressLabel = `[live-models] ${index + 1}/${total} ${id}`;
        const attemptMax =
          model.provider === "anthropic" && anthropicKeys.length > 0
            ? anthropicKeys.length
            : 1;
        for (let attempt = 0; attempt < attemptMax; attempt += 1) {
          if (model.provider === "anthropic" && anthropicKeys.length > 0) {
            process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
          }
          const apiKey =
            model.provider === "anthropic" && anthropicKeys.length > 0
              ? anthropicKeys[attempt]
              : apiKeyInfo.apiKey;
          try {
            // Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
            if (
              model.provider === "openai" &&
              model.api === "openai-responses" &&
              model.id === "gpt-5.2"
            ) {
              logProgress(`${progressLabel}: tool-only regression`);
              const noopTool = {
                name: "noop",
                description: "Return ok.",
                parameters: Type.Object({}, { additionalProperties: false }),
              };
              const first = await completeSimpleWithTimeout(
                model,
                {
                  messages: [
                    {
                      role: "user",
                      content:
                        "Call the tool `noop` with {}. Do not write any other text.",
                      timestamp: Date.now(),
                    },
                  ],
                  tools: [noopTool],
                },
                {
                  apiKey,
                  reasoning: model.reasoning ? "low" : undefined,
                  maxTokens: 128,
                },
                perModelTimeoutMs,
              );
              const toolCall = first.content.find((b) => b.type === "toolCall");
              expect(toolCall).toBeTruthy();
              if (!toolCall || toolCall.type !== "toolCall") {
                throw new Error("expected tool call");
              }
              const second = await completeSimpleWithTimeout(
                model,
                {
                  messages: [
                    {
                      role: "user",
                      content:
                        "Call the tool `noop` with {}. Do not write any other text.",
                      timestamp: Date.now(),
                    },
                    first,
                    {
                      role: "toolResult",
                      toolCallId: toolCall.id,
                      toolName: "noop",
                      content: [{ type: "text", text: "ok" }],
                      isError: false,
                      timestamp: Date.now(),
                    },
                    {
                      role: "user",
                      content: "Reply with the word ok.",
                      timestamp: Date.now(),
                    },
                  ],
                },
                {
                  apiKey,
                  reasoning: model.reasoning ? "low" : undefined,
                  maxTokens: 64,
                },
                perModelTimeoutMs,
              );
              const secondText = second.content
                .filter((b) => b.type === "text")
                .map((b) => b.text.trim())
                .join(" ");
              expect(secondText.length).toBeGreaterThan(0);
              logProgress(`${progressLabel}: done`);
              break;
            }
            logProgress(`${progressLabel}: prompt`);
            const ok = await completeOkWithRetry({
              model,
              apiKey,
              timeoutMs: perModelTimeoutMs,
            });
            if (ok.res.stopReason === "error") {
              const msg = ok.res.errorMessage ?? "";
              if (allowNotFoundSkip && isModelNotFoundErrorMessage(msg)) {
                skipped.push({ model: id, reason: msg });
                logProgress(`${progressLabel}: skip (model not found)`);
                break;
              }
              throw new Error(msg || "model returned error with no message");
            }
            if (ok.text.length === 0 && model.provider === "google") {
              skipped.push({
                model: id,
                reason: "no text returned (likely unavailable model id)",
              });
              logProgress(`${progressLabel}: skip (google model not found)`);
              break;
            }
            expect(ok.text.length).toBeGreaterThan(0);
            logProgress(`${progressLabel}: done`);
            break;
          } catch (err) {
            const message = String(err);
            if (
              model.provider === "anthropic" &&
              isAnthropicRateLimitError(message) &&
              attempt + 1 < attemptMax
            ) {
              logProgress(`${progressLabel}: rate limit, retrying with next key`);
              continue;
            }
            if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
              skipped.push({ model: id, reason: message });
              logProgress(`${progressLabel}: skip (google model not found)`);
              break;
            }
            logProgress(`${progressLabel}: failed`);
            failures.push({ model: id, error: message });
            break;
          }
          failures.push({ model: id, error: String(err) });
        }
      }
@ -372,8 +373,6 @@ describeLive("live models (profile keys)", () => {
        );
      }
      // Keep one assertion so the test fails loudly if we somehow ran nothing.
      expect(models.length).toBeGreaterThan(0);
      void skipped;
    },
    15 * 60 * 1000,
--- a/src/gateway/gateway-models.profiles.live.test.ts
+++ b/src/gateway/gateway-models.profiles.live.test.ts
@ -11,9 +11,15 @@ import {
 } from "@mariozechner/pi-coding-agent";
 import { describe, expect, it } from "vitest";
 import { resolveClawdbotAgentDir } from "../agents/agent-paths.js";
 import {
  collectAnthropicApiKeys,
  isAnthropicRateLimitError,
 } from "../agents/live-auth-keys.js";
 import { isModernModelRef } from "../agents/live-model-filter.js";
 import { getApiKeyForModel } from "../agents/model-auth.js";
 import { ensureClawdbotModelsJson } from "../agents/models-config.js";
 import { loadConfig } from "../config/config.js";
 import type { ClawdbotConfig, ModelProviderConfig } from "../config/types.js";
 import {
  GATEWAY_CLIENT_MODES,
  GATEWAY_CLIENT_NAMES,
@ -25,16 +31,14 @@ import { startGatewayServer } from "./server.js";
 // Environment-driven switches for the live gateway suite. Every flag is read
 // once at module load and is only "on" when the variable equals the exact
 // string shown in its comparison.
 // LIVE: general live-test switch (either LIVE=1 or CLAWDBOT_LIVE_TEST=1).
 const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
 // GATEWAY_LIVE: gateway-specific live switch (CLAWDBOT_LIVE_GATEWAY=1).
 const GATEWAY_LIVE = process.env.CLAWDBOT_LIVE_GATEWAY === "1";
 // ALL_MODELS: set by CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 or by
 // CLAWDBOT_LIVE_GATEWAY_MODELS=all.
 const ALL_MODELS =
  process.env.CLAWDBOT_LIVE_GATEWAY_ALL_MODELS === "1" ||
  process.env.CLAWDBOT_LIVE_GATEWAY_MODELS === "all";
 // Opt-in flags for the additional tool (exec) and image probes.
 const EXTRA_TOOL_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE === "1";
 const EXTRA_IMAGE_PROBES =
  process.env.CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE === "1";
 // ZAI_FALLBACK: CLAWDBOT_LIVE_GATEWAY_ZAI_FALLBACK=1.
 // NOTE(review): its consumer is outside this section — confirm where it is used.
 const ZAI_FALLBACK = process.env.CLAWDBOT_LIVE_GATEWAY_ZAI_FALLBACK === "1";
 // PROVIDERS: provider filter parsed from CLAWDBOT_LIVE_GATEWAY_PROVIDERS;
 // parseFilter yields null when no ids remain, in which case no provider
 // filtering is applied.
 const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS);
 // Thinking level handed to the gateway suite for every agent request.
 const THINKING_LEVEL = "high";
 // Matches an opening or closing <think>, <thinking>, <thought> or
 // <antthinking> tag, case-insensitively and tolerating whitespace inside the
 // angle brackets.
 const THINKING_TAG_RE =
  /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
 // Matches an opening or closing <final> tag with the same tolerance.
 const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
-const describeLive = LIVE && GATEWAY_LIVE ? describe : describe.skip;
+const describeLive = LIVE || GATEWAY_LIVE ? describe : describe.skip;
 function parseFilter(raw?: string): Set<string> | null {
  const trimmed = raw?.trim();
@ -46,6 +50,26 @@ function parseFilter(raw?: string): Set<string> | null {
  return ids.length ? new Set(ids) : null;
 }
 /** Prints one progress line to the console, tagged with the `[live]` prefix. */
 function logProgress(message: string): void {
  const line = "[live] " + message;
  console.log(line);
 }
 /**
 * Fails the current probe when model output still contains reasoning markup.
 *
 * Empty text is accepted as-is. Otherwise the text is checked against
 * THINKING_TAG_RE and FINAL_TAG_RE; on a match an Error is thrown whose message
 * names the suite label, model and phase and quotes at most the first 200
 * characters of the text (followed by an ellipsis when truncated).
 *
 * @param params.text  Text extracted from the agent payload.
 * @param params.model Model key used in the error message.
 * @param params.phase Probe phase used in the error message.
 * @param params.label Suite label used in the error message.
 * @throws Error when a thinking/final tag is found in `params.text`.
 */
 function assertNoReasoningTags(params: {
  text: string;
  model: string;
  phase: string;
  label: string;
 }): void {
  const { text, model, phase, label } = params;
  if (!text) return;
  const leaked = THINKING_TAG_RE.test(text) || FINAL_TAG_RE.test(text);
  if (!leaked) return;
  const snippet = text.length > 200 ? `${text.slice(0, 200)}…` : text;
  throw new Error(
    `[${label}] reasoning tag leak (${model} / ${phase}): ${snippet}`,
  );
 }
 function extractPayloadText(result: unknown): string {
  const record = result as Record<string, unknown>;
  const payloads = Array.isArray(record.payloads) ? record.payloads : [];
@ -200,61 +224,470 @@ async function connectClient(params: { url: string; token: string }) {
  });
 }
 /** Inputs for one run of the live gateway model suite. */
 type GatewayModelSuiteParams = {
  // Suite name; appears in progress logs, error messages and the session key.
  label: string;
  // Base config from which the temporary live gateway config is derived.
  cfg: ClawdbotConfig;
  // Models to exercise, in order.
  candidates: Array<Model<Api>>;
  // When true, also runs the exec + read tool probe for every model.
  extraToolProbes: boolean;
  // When true, also runs the image probe for models whose input includes "image".
  extraImageProbes: boolean;
  // Sent as the `thinking` value on every agent request.
  thinkingLevel: string;
  // Provider entries merged over the configured providers (overrides win).
  providerOverrides?: Record<string, ModelProviderConfig>;
 };
 /**
 * Derives the config used by the live gateway from the caller's config.
 *
 * - Providers: configured providers, then the lmstudio entry (if any) with its
 *   `api` forced to "openai-completions", then `providerOverrides` on top.
 * - Agents: sandboxing is switched off for every listed agent and for the
 *   defaults, and `defaults.models` is replaced by an allowlist holding one
 *   empty entry per candidate, keyed `provider/id`.
 *
 * @param params.cfg               Base config; not mutated.
 * @param params.candidates        Models that must be allowed by the result.
 * @param params.providerOverrides Optional provider entries that win over the rest.
 * @returns A new config object; `models` is left as in `cfg` when no provider
 *          ends up configured.
 */
 function buildLiveGatewayConfig(params: {
  cfg: ClawdbotConfig;
  candidates: Array<Model<Api>>;
  providerOverrides?: Record<string, ModelProviderConfig>;
 }): ClawdbotConfig {
  const { cfg, candidates, providerOverrides } = params;
  const configuredProviders = cfg.models?.providers ?? {};
  const lmstudio = cfg.models?.providers?.lmstudio;
  const lmstudioPatch = lmstudio
    ? { lmstudio: { ...lmstudio, api: "openai-completions" } }
    : {};
  const mergedProviders = {
    ...configuredProviders,
    ...lmstudioPatch,
    ...(providerOverrides ?? {}),
  };
  const providers =
    Object.keys(mergedProviders).length === 0
      ? configuredProviders
      : mergedProviders;
  const models =
    Object.keys(providers).length === 0
      ? cfg.models
      : { ...cfg.models, providers };
  return {
    ...cfg,
    agents: {
      ...cfg.agents,
      list: (cfg.agents?.list ?? []).map((agent) => ({
        ...agent,
        sandbox: { mode: "off" },
      })),
      defaults: {
        ...cfg.agents?.defaults,
        // Live tests should avoid Docker sandboxing so tool probes can
        // operate on the temporary probe files we create in the host workspace.
        sandbox: { mode: "off" },
        models: Object.fromEntries(
          candidates.map((candidate) => [
            `${candidate.provider}/${candidate.id}`,
            {},
          ]),
        ),
      },
    },
    models,
  };
 }
 /**
 * Builds a provider override for the configured `minimax` provider that swaps
 * in the given API flavour and base URL while keeping every other field.
 *
 * @param params.cfg     Config that may contain a `minimax` provider entry.
 * @param params.api     API flavour to set on the override.
 * @param params.baseUrl Base URL to set on the override.
 * @returns The override, or null when there is no `minimax` provider or its
 *          `models` is not a non-empty array.
 */
 function buildMinimaxProviderOverride(params: {
  cfg: ClawdbotConfig;
  api: "openai-completions" | "anthropic-messages";
  baseUrl: string;
 }): ModelProviderConfig | null {
  const { cfg, api, baseUrl } = params;
  const minimax = cfg.models?.providers?.minimax;
  if (minimax && Array.isArray(minimax.models) && minimax.models.length > 0) {
    return { ...minimax, api, baseUrl };
  }
  return null;
 }
 /**
 * Starts a loopback gateway on a temporary config and runs every candidate
 * model through the live probe sequence over a websocket client.
 *
 * Per model: patch + reset the session, run a plain prompt, run a `read` tool
 * probe, then (when enabled) an exec + read probe, an image probe for
 * image-capable models, and a tool-only regression for the OpenAI Responses and
 * Codex Responses APIs. Anthropic models rotate through the collected API keys
 * when a rate-limit error is seen.
 *
 * Teardown (client, server, probe files, environment variables) runs even when
 * setup fails part-way, and environment variables that were unset before the
 * run are unset again afterwards.
 *
 * @param params Suite label, base config, candidate models and probe toggles.
 * @throws Error listing up to 20 failing models when any model fails a probe.
 */
 async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
  const previous = {
    configPath: process.env.CLAWDBOT_CONFIG_PATH,
    token: process.env.CLAWDBOT_GATEWAY_TOKEN,
    skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS,
    skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
    skipCron: process.env.CLAWDBOT_SKIP_CRON,
    skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
  };
  // Assigning to process.env coerces the value to a string, so writing back a
  // variable that was originally unset would store the literal "undefined".
  // Delete the key in that case instead.
  const restoreEnv = (name: string, value: string | undefined): void => {
    if (value === undefined) {
      delete process.env[name];
    } else {
      process.env[name] = value;
    }
  };
  // Teardown steps are registered as each resource is acquired and run in
  // reverse order, so a failure during setup still releases what was created.
  const cleanups: Array<() => unknown> = [];
  try {
    process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
    process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
    process.env.CLAWDBOT_SKIP_CRON = "1";
    process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
    const token = `test-${randomUUID()}`;
    process.env.CLAWDBOT_GATEWAY_TOKEN = token;
    const workspaceDir = resolveUserPath(
      params.cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
    );
    await fs.mkdir(workspaceDir, { recursive: true });
    const nonceA = randomUUID();
    const nonceB = randomUUID();
    const toolProbePath = path.join(
      workspaceDir,
      `.clawdbot-live-tool-probe.${nonceA}.txt`,
    );
    cleanups.push(() => fs.rm(toolProbePath, { force: true }));
    await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
    const nextCfg = buildLiveGatewayConfig({
      cfg: params.cfg,
      candidates: params.candidates,
      providerOverrides: params.providerOverrides,
    });
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-live-"));
    cleanups.push(() => fs.rm(tempDir, { recursive: true, force: true }));
    const tempConfigPath = path.join(tempDir, "clawdbot.json");
    await fs.writeFile(tempConfigPath, `${JSON.stringify(nextCfg, null, 2)}\n`);
    process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
    await ensureClawdbotModelsJson(nextCfg);
    const port = await getFreeGatewayPort();
    const server = await startGatewayServer(port, {
      bind: "loopback",
      auth: { mode: "token", token },
      controlUiEnabled: false,
    });
    cleanups.push(() => server.close({ reason: "live test complete" }));
    const client = await connectClient({
      url: `ws://127.0.0.1:${port}`,
      token,
    });
    cleanups.push(() => {
      client.stop();
    });
    logProgress(
      `[${params.label}] running ${params.candidates.length} models (thinking=${params.thinkingLevel})`,
    );
    const anthropicKeys = collectAnthropicApiKeys();
    if (anthropicKeys.length > 0) {
      process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
      logProgress(`[${params.label}] anthropic keys loaded: ${anthropicKeys.length}`);
    }
    const sessionKey = `agent:dev:${params.label}`;
    const failures: Array<{ model: string; error: string }> = [];
    const total = params.candidates.length;
    for (const [index, model] of params.candidates.entries()) {
      const modelKey = `${model.provider}/${model.id}`;
      const progressLabel = `[${params.label}] ${index + 1}/${total} ${modelKey}`;
      // Only Anthropic gets more than one attempt: one per available API key.
      const attemptMax =
        model.provider === "anthropic" && anthropicKeys.length > 0
          ? anthropicKeys.length
          : 1;
      for (let attempt = 0; attempt < attemptMax; attempt += 1) {
        if (model.provider === "anthropic" && anthropicKeys.length > 0) {
          process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
        }
        try {
          // Ensure session exists + override model for this run.
          await client.request<Record<string, unknown>>("sessions.patch", {
            key: sessionKey,
            model: modelKey,
          });
          // Reset between models: avoids cross-provider transcript incompatibilities
          // (notably OpenAI Responses requiring reasoning replay for function_call items).
          await client.request<Record<string, unknown>>("sessions.reset", {
            key: sessionKey,
          });
          logProgress(`${progressLabel}: prompt`);
          const runId = randomUUID();
          const payload = await client.request<AgentFinalPayload>(
            "agent",
            {
              sessionKey,
              idempotencyKey: `idem-${runId}`,
              message:
                "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
              thinking: params.thinkingLevel,
              deliver: false,
            },
            { expectFinal: true },
          );
          if (payload?.status !== "ok") {
            throw new Error(`agent status=${String(payload?.status)}`);
          }
          const text = extractPayloadText(payload?.result);
          if (model.provider === "google" && isGoogleModelNotFoundText(text)) {
            // Catalog drift: model IDs can disappear or become unavailable on the API.
            // Treat as skip when scanning "all models" for Google.
            logProgress(`${progressLabel}: skip (google model not found)`);
            break;
          }
          assertNoReasoningTags({
            text,
            model: modelKey,
            phase: "prompt",
            label: params.label,
          });
          if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
          if (
            !/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
            !/\bmacro\s*-?\s*tasks?\b/i.test(text)
          ) {
            throw new Error(`missing required keywords: ${text}`);
          }
          // Real tool invocation: force the agent to Read a local file and echo a nonce.
          logProgress(`${progressLabel}: tool-read`);
          const runIdTool = randomUUID();
          const toolProbe = await client.request<AgentFinalPayload>(
            "agent",
            {
              sessionKey,
              idempotencyKey: `idem-${runIdTool}-tool`,
              message:
                "Clawdbot live tool probe (local, safe): " +
                `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
                "Then reply with the two nonce values you read (include both).",
              thinking: params.thinkingLevel,
              deliver: false,
            },
            { expectFinal: true },
          );
          if (toolProbe?.status !== "ok") {
            throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`);
          }
          const toolText = extractPayloadText(toolProbe?.result);
          assertNoReasoningTags({
            text: toolText,
            model: modelKey,
            phase: "tool-read",
            label: params.label,
          });
          if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
            throw new Error(`tool probe missing nonce: ${toolText}`);
          }
          if (params.extraToolProbes) {
            logProgress(`${progressLabel}: tool-exec`);
            const nonceC = randomUUID();
            const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);
            const execReadProbe = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runIdTool}-exec-read`,
                message:
                  "Clawdbot live tool probe (local, safe): " +
                  "use the tool named `exec` (or `Exec`) to run this command: " +
                  `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
                  `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
                  "Finally reply including the nonce text you read back.",
                thinking: params.thinkingLevel,
                deliver: false,
              },
              { expectFinal: true },
            );
            if (execReadProbe?.status !== "ok") {
              throw new Error(
                `exec+read probe failed: status=${String(execReadProbe?.status)}`,
              );
            }
            const execReadText = extractPayloadText(execReadProbe?.result);
            assertNoReasoningTags({
              text: execReadText,
              model: modelKey,
              phase: "tool-exec",
              label: params.label,
            });
            if (!execReadText.includes(nonceC)) {
              throw new Error(`exec+read probe missing nonce: ${execReadText}`);
            }
            await fs.rm(toolWritePath, { force: true });
          }
          if (params.extraImageProbes && model.input?.includes("image")) {
            logProgress(`${progressLabel}: image`);
            const imageCode = randomImageProbeCode(10);
            const imageBase64 = renderCatNoncePngBase64(imageCode);
            const runIdImage = randomUUID();
            const imageProbe = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runIdImage}-image`,
                message:
                  "Look at the attached image. Reply with exactly two tokens separated by a single space: " +
                  "(1) the animal shown or written in the image, lowercase; " +
                  "(2) the code printed in the image, uppercase. No extra text.",
                attachments: [
                  {
                    mimeType: "image/png",
                    fileName: `probe-${runIdImage}.png`,
                    content: imageBase64,
                  },
                ],
                thinking: params.thinkingLevel,
                deliver: false,
              },
              { expectFinal: true },
            );
            if (imageProbe?.status !== "ok") {
              throw new Error(
                `image probe failed: status=${String(imageProbe?.status)}`,
              );
            }
            const imageText = extractPayloadText(imageProbe?.result);
            assertNoReasoningTags({
              text: imageText,
              model: modelKey,
              phase: "image",
              label: params.label,
            });
            if (!/\bcat\b/i.test(imageText)) {
              throw new Error(`image probe missing 'cat': ${imageText}`);
            }
            // Accept near-misses: any 6-20 char alphanumeric token within edit
            // distance 2 of the rendered code (and within 2 chars of its length).
            const candidates =
              imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
            const bestDistance = candidates.reduce((best, cand) => {
              if (Math.abs(cand.length - imageCode.length) > 2) return best;
              return Math.min(best, editDistance(cand, imageCode));
            }, Number.POSITIVE_INFINITY);
            if (!(bestDistance <= 2)) {
              throw new Error(
                `image probe missing code (${imageCode}): ${imageText}`,
              );
            }
          }
          // Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
          if (
            (model.provider === "openai" && model.api === "openai-responses") ||
            (model.provider === "openai-codex" &&
              model.api === "openai-codex-responses")
          ) {
            logProgress(`${progressLabel}: tool-only regression`);
            const runId2 = randomUUID();
            const first = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runId2}-1`,
                message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
                thinking: params.thinkingLevel,
                deliver: false,
              },
              { expectFinal: true },
            );
            if (first?.status !== "ok") {
              throw new Error(`tool-only turn failed: status=${String(first?.status)}`);
            }
            const firstText = extractPayloadText(first?.result);
            assertNoReasoningTags({
              text: firstText,
              model: modelKey,
              phase: "tool-only",
              label: params.label,
            });
            const second = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runId2}-2`,
                message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
                thinking: params.thinkingLevel,
                deliver: false,
              },
              { expectFinal: true },
            );
            if (second?.status !== "ok") {
              throw new Error(
                `post-tool message failed: status=${String(second?.status)}`,
              );
            }
            const reply = extractPayloadText(second?.result);
            assertNoReasoningTags({
              text: reply,
              model: modelKey,
              phase: "tool-only-followup",
              label: params.label,
            });
            if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
              throw new Error(`unexpected reply: ${reply}`);
            }
          }
          logProgress(`${progressLabel}: done`);
          break;
        } catch (err) {
          const message = String(err);
          if (
            model.provider === "anthropic" &&
            isAnthropicRateLimitError(message) &&
            attempt + 1 < attemptMax
          ) {
            logProgress(`${progressLabel}: rate limit, retrying with next key`);
            continue;
          }
          // OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
          if (
            model.provider === "openai-codex" &&
            isRefreshTokenReused(message)
          ) {
            logProgress(`${progressLabel}: skip (codex refresh token reused)`);
            break;
          }
          logProgress(`${progressLabel}: failed`);
          failures.push({ model: modelKey, error: message });
          break;
        }
      }
    }
    if (failures.length > 0) {
      const preview = failures
        .slice(0, 20)
        .map((f) => `- ${f.model}: ${f.error}`)
        .join("\n");
      throw new Error(
        `gateway live model failures (${failures.length}):\n${preview}`,
      );
    }
  } finally {
    try {
      // Last acquired, first released: client, server, temp dir, probe file.
      for (const cleanup of cleanups.reverse()) {
        await cleanup();
      }
    } finally {
      // Restore the environment even if a teardown step above threw.
      restoreEnv("CLAWDBOT_CONFIG_PATH", previous.configPath);
      restoreEnv("CLAWDBOT_GATEWAY_TOKEN", previous.token);
      restoreEnv("CLAWDBOT_SKIP_PROVIDERS", previous.skipProviders);
      restoreEnv("CLAWDBOT_SKIP_GMAIL_WATCHER", previous.skipGmail);
      restoreEnv("CLAWDBOT_SKIP_CRON", previous.skipCron);
      restoreEnv("CLAWDBOT_SKIP_CANVAS_HOST", previous.skipCanvas);
    }
  }
 }
 describeLive("gateway live (dev agent, profile keys)", () => {
  it(
    "runs meaningful prompts across models with available keys",
    async () => {
      const previous = {
        configPath: process.env.CLAWDBOT_CONFIG_PATH,
        token: process.env.CLAWDBOT_GATEWAY_TOKEN,
        skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS,
        skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
        skipCron: process.env.CLAWDBOT_SKIP_CRON,
        skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
      };
      process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
      process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
      process.env.CLAWDBOT_SKIP_CRON = "1";
      process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
      const token = `test-${randomUUID()}`;
      process.env.CLAWDBOT_GATEWAY_TOKEN = token;
      const cfg = loadConfig();
      await ensureClawdbotModelsJson(cfg);
      const workspaceDir = resolveUserPath(
        cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
      );
      await fs.mkdir(workspaceDir, { recursive: true });
      const nonceA = randomUUID();
      const nonceB = randomUUID();
      const toolProbePath = path.join(
        workspaceDir,
        `.clawdbot-live-tool-probe.${nonceA}.txt`,
      );
      await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
      const agentDir = resolveClawdbotAgentDir();
      const authStorage = discoverAuthStorage(agentDir);
      const modelRegistry = discoverModels(authStorage, agentDir);
      const all = modelRegistry.getAll() as Array<Model<Api>>;
-      const filter = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_MODELS);
+      const rawModels = process.env.CLAWDBOT_LIVE_GATEWAY_MODELS?.trim();
-
+      const useModern =
-      // Default: honor user allowlist. Opt-in: scan all models with keys.
+        !rawModels || rawModels === "modern" || rawModels === "all";
-      const allowlistKeys = Object.keys(cfg.agents?.defaults?.models ?? {});
+      const useExplicit = Boolean(rawModels) && !useModern;
-      const wanted =
+      const filter = useExplicit ? parseFilter(rawModels) : null;
-        ALL_MODELS || allowlistKeys.length === 0
+      const wanted = filter
-          ? all
+        ? all.filter((m) => filter.has(`${m.provider}/${m.id}`))
-          : all.filter((m) => allowlistKeys.includes(`${m.provider}/${m.id}`));
+        : all.filter((m) =>
            isModernModelRef({ provider: m.provider, id: m.id }),
          );
      const candidates: Array<Model<Api>> = [];
      for (const model of wanted) {
        const id = `${model.provider}/${model.id}`;
        if (PROVIDERS && !PROVIDERS.has(model.provider)) continue;
        if (filter && !filter.has(id)) continue;
        try {
          // eslint-disable-next-line no-await-in-loop
          await getApiKeyForModel({ model, cfg });
@ -264,315 +697,72 @@ describeLive("gateway live (dev agent, profile keys)", () => {
        }
      }
-      expect(candidates.length).toBeGreaterThan(0);
+      if (candidates.length === 0) {
-      const imageCandidates = EXTRA_IMAGE_PROBES
+        logProgress("[all-models] no API keys found; skipping");
-        ? candidates.filter((m) => m.input?.includes("image"))
+        return;
-        : [];
+      }
-      if (EXTRA_IMAGE_PROBES && imageCandidates.length === 0) {
+      logProgress(
-        throw new Error(
+        `[all-models] selection=${useExplicit ? "explicit" : "modern"}`,
-          "image probe enabled but no selected models advertise image support; set CLAWDBOT_LIVE_GATEWAY_MODELS to include an image-capable model",
+      );
      const imageCandidates = candidates.filter((m) =>
        m.input?.includes("image"),
      );
      if (imageCandidates.length === 0) {
        logProgress(
          "[all-models] no image-capable models selected; image probe will be skipped",
        );
      }
-
+      await runGatewayModelSuite({
-      // Build a temp config that allows all selected models, so session overrides stick.
+        label: "all-models",
-      const lmstudioProvider = cfg.models?.providers?.lmstudio;
+        cfg,
-      const nextCfg = {
+        candidates,
-        ...cfg,
+        extraToolProbes: true,
-        agents: {
+        extraImageProbes: true,
-          ...cfg.agents,
+        thinkingLevel: THINKING_LEVEL,
          list: (cfg.agents?.list ?? []).map((entry) => ({
            ...entry,
            sandbox: { mode: "off" },
          })),
          defaults: {
            ...cfg.agents?.defaults,
            // Live tests should avoid Docker sandboxing so tool probes can
            // operate on the temporary probe files we create in the host workspace.
            sandbox: { mode: "off" },
            models: Object.fromEntries(
              candidates.map((m) => [`${m.provider}/${m.id}`, {}]),
            ),
          },
        },
        models: {
          ...cfg.models,
          providers: {
            ...cfg.models?.providers,
            // LM Studio is most reliable via Chat Completions; its Responses API
            // tool-calling behavior is inconsistent across releases.
            ...(lmstudioProvider
              ? {
                  lmstudio: {
                    ...lmstudioProvider,
                    api: "openai-completions",
                  },
                }
              : {}),
          },
        },
      };
      const tempDir = await fs.mkdtemp(
        path.join(os.tmpdir(), "clawdbot-live-"),
      );
      const tempConfigPath = path.join(tempDir, "clawdbot.json");
      await fs.writeFile(
        tempConfigPath,
        `${JSON.stringify(nextCfg, null, 2)}\n`,
      );
      process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
      const port = await getFreeGatewayPort();
      const server = await startGatewayServer(port, {
        bind: "loopback",
        auth: { mode: "token", token },
        controlUiEnabled: false,
      });
-      const client = await connectClient({
+      const minimaxCandidates = candidates.filter((model) => model.provider === "minimax");
-        url: `ws://127.0.0.1:${port}`,
+      if (minimaxCandidates.length === 0) {
-        token,
+        logProgress("[minimax] no candidates with keys; skipping dual endpoint probes");
        return;
      }
      const minimaxOpenAi = buildMinimaxProviderOverride({
        cfg,
        api: "openai-completions",
        baseUrl: "https://api.minimax.io/v1",
      });
      if (minimaxOpenAi) {
        await runGatewayModelSuite({
          label: "minimax-openai",
          cfg,
          candidates: minimaxCandidates,
          extraToolProbes: true,
          extraImageProbes: true,
          thinkingLevel: THINKING_LEVEL,
          providerOverrides: { minimax: minimaxOpenAi },
        });
      } else {
        logProgress("[minimax-openai] missing minimax provider config; skipping");
      }
-      try {
+      const minimaxAnthropic = buildMinimaxProviderOverride({
-        const sessionKey = "agent:dev:live-gateway";
+        cfg,
-
+        api: "anthropic-messages",
-        const failures: Array<{ model: string; error: string }> = [];
+        baseUrl: "https://api.minimax.io/anthropic",
-
+      });
-        for (const model of candidates) {
+      if (minimaxAnthropic) {
-          const modelKey = `${model.provider}/${model.id}`;
+        await runGatewayModelSuite({
-
+          label: "minimax-anthropic",
-          try {
+          cfg,
-            // Ensure session exists + override model for this run.
+          candidates: minimaxCandidates,
-            await client.request<Record<string, unknown>>("sessions.patch", {
+          extraToolProbes: true,
-              key: sessionKey,
+          extraImageProbes: true,
-              model: modelKey,
+          thinkingLevel: THINKING_LEVEL,
-            });
+          providerOverrides: { minimax: minimaxAnthropic },
-            // Reset between models: avoids cross-provider transcript incompatibilities
+        });
-            // (notably OpenAI Responses requiring reasoning replay for function_call items).
+      } else {
-            await client.request<Record<string, unknown>>("sessions.reset", {
+        logProgress("[minimax-anthropic] missing minimax provider config; skipping");
              key: sessionKey,
            });
            // “Meaningful” direct prompt (no tools).
            const runId = randomUUID();
            const payload = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runId}`,
                message:
                  "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
                deliver: false,
              },
              { expectFinal: true },
            );
            if (payload?.status !== "ok") {
              throw new Error(`agent status=${String(payload?.status)}`);
            }
            const text = extractPayloadText(payload?.result);
            if (
              model.provider === "google" &&
              isGoogleModelNotFoundText(text)
            ) {
              // Catalog drift: model IDs can disappear or become unavailable on the API.
              // Treat as skip when scanning "all models" for Google.
              continue;
            }
            if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
            if (
              !/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
              !/\bmacro\s*-?\s*tasks?\b/i.test(text)
            ) {
              throw new Error(`missing required keywords: ${text}`);
            }
            // Real tool invocation: force the agent to Read a local file and echo a nonce.
            const runIdTool = randomUUID();
            const toolProbe = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runIdTool}-tool`,
                message:
                  "Clawdbot live tool probe (local, safe): " +
                  `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
                  "Then reply with the two nonce values you read (include both).",
                deliver: false,
              },
              { expectFinal: true },
            );
            if (toolProbe?.status !== "ok") {
              throw new Error(
                `tool probe failed: status=${String(toolProbe?.status)}`,
              );
            }
            const toolText = extractPayloadText(toolProbe?.result);
            if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
              throw new Error(`tool probe missing nonce: ${toolText}`);
            }
            if (EXTRA_TOOL_PROBES) {
              const nonceC = randomUUID();
              const toolWritePath = path.join(
                tempDir,
                `write-${runIdTool}.txt`,
              );
              const execReadProbe = await client.request<AgentFinalPayload>(
                "agent",
                {
                  sessionKey,
                  idempotencyKey: `idem-${runIdTool}-exec-read`,
                  message:
                    "Clawdbot live tool probe (local, safe): " +
                    "use the tool named `exec` (or `Exec`) to run this command: " +
                    `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
                    `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
                    "Finally reply including the nonce text you read back.",
                  deliver: false,
                },
                { expectFinal: true },
              );
              if (execReadProbe?.status !== "ok") {
                throw new Error(
                  `exec+read probe failed: status=${String(execReadProbe?.status)}`,
                );
              }
              const execReadText = extractPayloadText(execReadProbe?.result);
              if (!execReadText.includes(nonceC)) {
                throw new Error(
                  `exec+read probe missing nonce: ${execReadText}`,
                );
              }
              await fs.rm(toolWritePath, { force: true });
            }
            if (EXTRA_IMAGE_PROBES && model.input?.includes("image")) {
              const imageCode = randomImageProbeCode(10);
              const imageBase64 = renderCatNoncePngBase64(imageCode);
              const runIdImage = randomUUID();
              const imageProbe = await client.request<AgentFinalPayload>(
                "agent",
                {
                  sessionKey,
                  idempotencyKey: `idem-${runIdImage}-image`,
                  message:
                    "Look at the attached image. Reply with exactly two tokens separated by a single space: " +
                    "(1) the animal shown or written in the image, lowercase; " +
                    "(2) the code printed in the image, uppercase. No extra text.",
                  attachments: [
                    {
                      mimeType: "image/png",
                      fileName: `probe-${runIdImage}.png`,
                      content: imageBase64,
                    },
                  ],
                  deliver: false,
                },
                { expectFinal: true },
              );
              if (imageProbe?.status !== "ok") {
                throw new Error(
                  `image probe failed: status=${String(imageProbe?.status)}`,
                );
              }
              const imageText = extractPayloadText(imageProbe?.result);
              if (!/\bcat\b/i.test(imageText)) {
                throw new Error(`image probe missing 'cat': ${imageText}`);
              }
              const candidates =
                imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
              const bestDistance = candidates.reduce((best, cand) => {
                if (Math.abs(cand.length - imageCode.length) > 2) return best;
                return Math.min(best, editDistance(cand, imageCode));
              }, Number.POSITIVE_INFINITY);
              if (!(bestDistance <= 2)) {
                throw new Error(
                  `image probe missing code (${imageCode}): ${imageText}`,
                );
              }
            }
            // Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
            if (
              (model.provider === "openai" &&
                model.api === "openai-responses") ||
              (model.provider === "openai-codex" &&
                model.api === "openai-codex-responses")
            ) {
              const runId2 = randomUUID();
              const first = await client.request<AgentFinalPayload>(
                "agent",
                {
                  sessionKey,
                  idempotencyKey: `idem-${runId2}-1`,
                  message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
                  deliver: false,
                },
                { expectFinal: true },
              );
              if (first?.status !== "ok") {
                throw new Error(
                  `tool-only turn failed: status=${String(first?.status)}`,
                );
              }
              const second = await client.request<AgentFinalPayload>(
                "agent",
                {
                  sessionKey,
                  idempotencyKey: `idem-${runId2}-2`,
                  message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
                  deliver: false,
                },
                { expectFinal: true },
              );
              if (second?.status !== "ok") {
                throw new Error(
                  `post-tool message failed: status=${String(second?.status)}`,
                );
              }
              const reply = extractPayloadText(second?.result);
              if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
                throw new Error(`unexpected reply: ${reply}`);
              }
            }
          } catch (err) {
            const message = String(err);
            // OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
            if (
              model.provider === "openai-codex" &&
              isRefreshTokenReused(message)
            ) {
              continue;
            }
            failures.push({ model: modelKey, error: message });
          }
        }
        if (failures.length > 0) {
          const preview = failures
            .slice(0, 20)
            .map((f) => `- ${f.model}: ${f.error}`)
            .join("\n");
          throw new Error(
            `gateway live model failures (${failures.length}):\n${preview}`,
          );
        }
      } finally {
        client.stop();
        await server.close({ reason: "live test complete" });
        await fs.rm(toolProbePath, { force: true });
        await fs.rm(tempDir, { recursive: true, force: true });
        process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
        process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
        process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders;
        process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
        process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
        process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
      }
    },
    20 * 60 * 1000,
@ -661,6 +851,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
          message:
            `Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
            `Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
          thinking: THINKING_LEVEL,
          deliver: false,
        },
        { expectFinal: true },
@ -671,6 +862,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
        );
      }
      const toolText = extractPayloadText(toolProbe?.result);
      assertNoReasoningTags({
        text: toolText,
        model: "anthropic/claude-opus-4-5",
        phase: "zai-fallback-tool",
        label: "zai-fallback",
      });
      if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
        throw new Error(`anthropic tool probe missing nonce: ${toolText}`);
      }
@ -689,6 +886,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
          message:
            `What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
            `Reply with exactly: ${nonceA} ${nonceB}.`,
          thinking: THINKING_LEVEL,
          deliver: false,
        },
        { expectFinal: true },
@ -699,6 +897,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
        );
      }
      const followupText = extractPayloadText(followup?.result);
      assertNoReasoningTags({
        text: followupText,
        model: "zai/glm-4.7",
        phase: "zai-fallback-followup",
        label: "zai-fallback",
      });
      if (!followupText.includes(nonceA) || !followupText.includes(nonceB)) {
        throw new Error(`zai followup missing nonce: ${followupText}`);
      }
--- a/test/test-env.ts
+++ b/test/test-env.ts
@ -1,3 +1,4 @@
 import { execFileSync } from "node:child_process";
 import fs from "node:fs";
 import os from "node:os";
 import path from "node:path";
@ -11,6 +12,37 @@ function restoreEnv(entries: RestoreEntry[]): void {
  }
 }
 /**
  * Best-effort import of variables defined in `~/.profile` into `process.env`.
  *
  * Runs a login bash shell that sources the profile with `set -a` (so every
  * assignment is exported) and dumps the resulting environment NUL-separated.
  * Only keys that are currently unset or empty in `process.env` are filled in;
  * values already present in the process environment are never overwritten.
  *
  * Does nothing when `~/.profile` does not exist, and deliberately ignores any
  * failure while spawning bash or sourcing the profile.
  */
 function loadProfileEnv(): void {
  const profilePath = path.join(os.homedir(), ".profile");
  if (!fs.existsSync(profilePath)) return;
  try {
    const output = execFileSync(
      "/bin/bash",
      [
        "-lc",
        // The path is handed over as positional argument $1 instead of being
        // spliced into the script text, so quotes, `$` or backticks in the
        // home directory cannot break (or be evaluated by) the shell command.
        'set -a; source "$1" >/dev/null 2>&1; env -0',
        "bash", // becomes $0 of the -c script
        profilePath,
      ],
      { encoding: "utf8" },
    );
    let applied = 0;
    // `env -0` separates entries with NUL so values may safely contain newlines.
    for (const entry of output.split("\0")) {
      if (!entry) continue;
      const idx = entry.indexOf("=");
      // Skip malformed entries: no "=" at all, or an empty key.
      if (idx <= 0) continue;
      const key = entry.slice(0, idx);
      // Never clobber a value the caller already provided.
      if ((process.env[key] ?? "") !== "") continue;
      process.env[key] = entry.slice(idx + 1);
      applied += 1;
    }
    if (applied > 0) {
      console.log(`[live] loaded ${applied} env vars from ~/.profile`);
    }
  } catch {
    // ignore profile load failures
  }
 }
 export function installTestEnv(): { cleanup: () => void; tempHome: string } {
  const live =
    process.env.LIVE === "1" ||
@ -20,6 +52,7 @@ export function installTestEnv(): { cleanup: () => void; tempHome: string } {
  // Live tests must use the real user environment (keys, profiles, config).
  // The default test env isolates HOME to avoid touching real state.
  if (live) {
    loadProfileEnv();
    return { cleanup: () => {}, tempHome: process.env.HOME ?? "" };
  }