fix: modernize live tests and gemini ids
This commit is contained in:
parent
79cbb20988
commit
1850013cae
11 changed files with 1053 additions and 593 deletions
|
|
@ -76,7 +76,7 @@ Clawdbot ships with the pi‑ai catalog. These providers require **no**
|
||||||
|
|
||||||
- Provider: `google`
|
- Provider: `google`
|
||||||
- Auth: `GEMINI_API_KEY`
|
- Auth: `GEMINI_API_KEY`
|
||||||
- Example model: `google/gemini-3-pro`
|
- Example model: `google/gemini-3-pro-preview`
|
||||||
- CLI: `clawdbot onboard --auth-choice gemini-api-key`
|
- CLI: `clawdbot onboard --auth-choice gemini-api-key`
|
||||||
|
|
||||||
### Google Vertex / Antigravity / Gemini CLI
|
### Google Vertex / Antigravity / Gemini CLI
|
||||||
|
|
|
||||||
|
|
@ -25,9 +25,8 @@ When you touch tests or want extra confidence:
|
||||||
- Coverage gate: `pnpm test:coverage`
|
- Coverage gate: `pnpm test:coverage`
|
||||||
- E2E suite: `pnpm test:e2e`
|
- E2E suite: `pnpm test:e2e`
|
||||||
|
|
||||||
When debugging real providers/models (requires real creds; skipped by default):
|
When debugging real providers/models (requires real creds):
|
||||||
- Live suite (models only): `CLAWDBOT_LIVE_TEST=1 pnpm test:live`
|
- Live suite (models + gateway tool/image probes): `pnpm test:live`
|
||||||
- Live suite (models + providers): `LIVE=1 pnpm test:live`
|
|
||||||
|
|
||||||
Tip: when you only need one failing case, prefer narrowing live tests via the allowlist env vars described below.
|
Tip: when you only need one failing case, prefer narrowing live tests via the allowlist env vars described below.
|
||||||
|
|
||||||
|
|
@ -67,7 +66,7 @@ Think of the suites as “increasing realism” (and increasing flakiness/cost):
|
||||||
- Command: `pnpm test:live`
|
- Command: `pnpm test:live`
|
||||||
- Config: `vitest.live.config.ts`
|
- Config: `vitest.live.config.ts`
|
||||||
- Files: `src/**/*.live.test.ts`
|
- Files: `src/**/*.live.test.ts`
|
||||||
- Default: **skipped** unless `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
|
- Default: **enabled** by `pnpm test:live` (sets `CLAWDBOT_LIVE_TEST=1`)
|
||||||
- Scope:
|
- Scope:
|
||||||
- “Does this provider/model actually work *today* with real creds?”
|
- “Does this provider/model actually work *today* with real creds?”
|
||||||
- Catch provider format changes, tool-calling quirks, auth issues, and rate limit behavior
|
- Catch provider format changes, tool-calling quirks, auth issues, and rate limit behavior
|
||||||
|
|
@ -75,6 +74,8 @@ Think of the suites as “increasing realism” (and increasing flakiness/cost):
|
||||||
- Not CI-stable by design (real networks, real provider policies, quotas, outages)
|
- Not CI-stable by design (real networks, real provider policies, quotas, outages)
|
||||||
- Costs money / uses rate limits
|
- Costs money / uses rate limits
|
||||||
- Prefer running narrowed subsets instead of “everything”
|
- Prefer running narrowed subsets instead of “everything”
|
||||||
|
- Live runs will source `~/.profile` to pick up missing API keys
|
||||||
|
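- Anthropic key rotation: set `CLAWDBOT_LIVE_ANTHROPIC_KEYS="sk-...,sk-..."` (comma-, semicolon-, or whitespace-separated) and/or multiple `ANTHROPIC_API_KEY*` vars; tests will retry on rate limits. Setting `CLAWDBOT_LIVE_ANTHROPIC_KEY=sk-...` forces that single key and ignores all other sources.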
|
||||||
|
|
||||||
## Which suite should I run?
|
## Which suite should I run?
|
||||||
|
|
||||||
|
|
@ -97,10 +98,11 @@ Live tests are split into two layers so we can isolate failures:
|
||||||
- Use `getApiKeyForModel` to select models you have creds for
|
- Use `getApiKeyForModel` to select models you have creds for
|
||||||
- Run a small completion per model (and targeted regressions where needed)
|
- Run a small completion per model (and targeted regressions where needed)
|
||||||
- How to enable:
|
- How to enable:
|
||||||
- `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
|
- `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly)
|
||||||
- `CLAWDBOT_LIVE_ALL_MODELS=1` (required for this test to run)
|
- Set `CLAWDBOT_LIVE_MODELS=modern` (or `all`, alias for modern) to actually run this suite; otherwise it skips to keep `pnpm test:live` focused on gateway smoke
|
||||||
- How to select models:
|
- How to select models:
|
||||||
- `CLAWDBOT_LIVE_MODELS=all` to run everything with keys
|
- `CLAWDBOT_LIVE_MODELS=modern` to run the modern allowlist (Opus/Sonnet/Haiku 4.5, GPT-5.x + Codex, Gemini 3, GLM 4.7, MiniMax M2.1, Grok 4)
|
||||||
|
- `CLAWDBOT_LIVE_MODELS=all` is an alias for the modern allowlist
|
||||||
- or `CLAWDBOT_LIVE_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,..."` (comma allowlist)
|
- or `CLAWDBOT_LIVE_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,..."` (comma allowlist)
|
||||||
- How to select providers:
|
- How to select providers:
|
||||||
- `CLAWDBOT_LIVE_PROVIDERS="google,google-antigravity,google-gemini-cli"` (comma allowlist)
|
- `CLAWDBOT_LIVE_PROVIDERS="google,google-antigravity,google-gemini-cli"` (comma allowlist)
|
||||||
|
|
@ -128,18 +130,16 @@ Live tests are split into two layers so we can isolate failures:
|
||||||
- image probe: the test attaches a generated PNG (cat + randomized code) and expects the model to return `cat <CODE>`.
|
- image probe: the test attaches a generated PNG (cat + randomized code) and expects the model to return `cat <CODE>`.
|
||||||
- Implementation reference: `src/gateway/gateway-models.profiles.live.test.ts` and `src/gateway/live-image-probe.ts`.
|
- Implementation reference: `src/gateway/gateway-models.profiles.live.test.ts` and `src/gateway/live-image-probe.ts`.
|
||||||
- How to enable:
|
- How to enable:
|
||||||
- `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
|
- `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly)
|
||||||
- `CLAWDBOT_LIVE_GATEWAY=1` (required for this test to run)
|
|
||||||
- How to select models:
|
- How to select models:
|
||||||
- `CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1` to scan all discovered models with keys
|
- Default: modern allowlist (Opus/Sonnet/Haiku 4.5, GPT-5.x + Codex, Gemini 3, GLM 4.7, MiniMax M2.1, Grok 4)
|
||||||
- or set `CLAWDBOT_LIVE_GATEWAY_MODELS="provider/model,provider/model,..."` to narrow quickly
|
- `CLAWDBOT_LIVE_GATEWAY_MODELS=all` is an alias for the modern allowlist
|
||||||
|
- Or set `CLAWDBOT_LIVE_GATEWAY_MODELS="provider/model"` (or comma list) to narrow
|
||||||
- How to select providers (avoid “OpenRouter everything”):
|
- How to select providers (avoid “OpenRouter everything”):
|
||||||
- `CLAWDBOT_LIVE_GATEWAY_PROVIDERS="google,google-antigravity,google-gemini-cli,openai,anthropic,zai,minimax"` (comma allowlist)
|
- `CLAWDBOT_LIVE_GATEWAY_PROVIDERS="google,google-antigravity,google-gemini-cli,openai,anthropic,zai,minimax"` (comma allowlist)
|
||||||
- Optional tool-calling stress:
|
- Tool + image probes are always on in this live test:
|
||||||
- `CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1` enables an extra “exec writes file → read reads it back → echo nonce” check.
|
- `read` probe + `exec+read` probe (tool stress)
|
||||||
- This is specifically meant to catch tool-calling compatibility issues across providers (formatting, history replay, tool_result pairing, etc.).
|
- image probe runs when the model advertises image input support
|
||||||
- Optional image send smoke:
|
|
||||||
- `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` sends a real image attachment through the gateway agent pipeline (multimodal message) and asserts the model can read back a per-run code from the image.
|
|
||||||
- Flow (high level):
|
- Flow (high level):
|
||||||
- Test generates a tiny PNG with “CAT” + random code (`src/gateway/live-image-probe.ts`)
|
- Test generates a tiny PNG with “CAT” + random code (`src/gateway/live-image-probe.ts`)
|
||||||
- Sends it via `agent` `attachments: [{ mimeType: "image/png", content: "<base64>" }]`
|
- Sends it via `agent` `attachments: [{ mimeType: "image/png", content: "<base64>" }]`
|
||||||
|
|
@ -159,7 +159,7 @@ pnpm clawdbot models list --json
|
||||||
- Test: `src/agents/anthropic.setup-token.live.test.ts`
|
- Test: `src/agents/anthropic.setup-token.live.test.ts`
|
||||||
- Goal: verify Claude CLI setup-token (or a pasted setup-token profile) can complete an Anthropic prompt.
|
- Goal: verify Claude CLI setup-token (or a pasted setup-token profile) can complete an Anthropic prompt.
|
||||||
- Enable:
|
- Enable:
|
||||||
- `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
|
- `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly)
|
||||||
- `CLAWDBOT_LIVE_SETUP_TOKEN=1`
|
- `CLAWDBOT_LIVE_SETUP_TOKEN=1`
|
||||||
- Token sources (pick one):
|
- Token sources (pick one):
|
||||||
- Profile: `CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test`
|
- Profile: `CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test`
|
||||||
|
|
@ -171,7 +171,7 @@ Setup example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
clawdbot models auth paste-token --provider anthropic --profile-id anthropic:setup-token-test
|
clawdbot models auth paste-token --provider anthropic --profile-id anthropic:setup-token-test
|
||||||
CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test pnpm test:live src/agents/anthropic.setup-token.live.test.ts
|
CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFILE=anthropic:setup-token-test pnpm test:live src/agents/anthropic.setup-token.live.test.ts
|
||||||
```
|
```
|
||||||
|
|
||||||
## Live: CLI backend smoke (Claude CLI or other local CLIs)
|
## Live: CLI backend smoke (Claude CLI or other local CLIs)
|
||||||
|
|
@ -179,7 +179,7 @@ CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFI
|
||||||
- Test: `src/gateway/gateway-cli-backend.live.test.ts`
|
- Test: `src/gateway/gateway-cli-backend.live.test.ts`
|
||||||
- Goal: validate the Gateway + agent pipeline using a local CLI backend, without touching your default config.
|
- Goal: validate the Gateway + agent pipeline using a local CLI backend, without touching your default config.
|
||||||
- Enable:
|
- Enable:
|
||||||
- `CLAWDBOT_LIVE_TEST=1` or `LIVE=1`
|
- `pnpm test:live` (or `CLAWDBOT_LIVE_TEST=1` if invoking Vitest directly)
|
||||||
- `CLAWDBOT_LIVE_CLI_BACKEND=1`
|
- `CLAWDBOT_LIVE_CLI_BACKEND=1`
|
||||||
- Defaults:
|
- Defaults:
|
||||||
- Model: `claude-cli/claude-sonnet-4-5`
|
- Model: `claude-cli/claude-sonnet-4-5`
|
||||||
|
|
@ -200,7 +200,7 @@ CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_SETUP_TOKEN=1 CLAWDBOT_LIVE_SETUP_TOKEN_PROFI
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_CLI_BACKEND=1 \
|
CLAWDBOT_LIVE_CLI_BACKEND=1 \
|
||||||
CLAWDBOT_LIVE_CLI_BACKEND_MODEL="claude-cli/claude-sonnet-4-5" \
|
CLAWDBOT_LIVE_CLI_BACKEND_MODEL="claude-cli/claude-sonnet-4-5" \
|
||||||
pnpm test:live src/gateway/gateway-cli-backend.live.test.ts
|
pnpm test:live src/gateway/gateway-cli-backend.live.test.ts
|
||||||
```
|
```
|
||||||
|
|
@ -210,17 +210,17 @@ CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_CLI_BACKEND=1 \
|
||||||
Narrow, explicit allowlists are fastest and least flaky:
|
Narrow, explicit allowlists are fastest and least flaky:
|
||||||
|
|
||||||
- Single model, direct (no gateway):
|
- Single model, direct (no gateway):
|
||||||
- `CLAWDBOT_LIVE_TEST=1 CLAWDBOT_LIVE_ALL_MODELS=1 CLAWDBOT_LIVE_MODELS="openai/gpt-5.2" pnpm test:live src/agents/models.profiles.live.test.ts`
|
- `CLAWDBOT_LIVE_MODELS="openai/gpt-5.2" pnpm test:live src/agents/models.profiles.live.test.ts`
|
||||||
|
|
||||||
- Single model, gateway smoke:
|
- Single model, gateway smoke:
|
||||||
- `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
- `CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
||||||
|
|
||||||
- Tool calling across several providers (exec + read probe):
|
- Tool calling across several providers:
|
||||||
- `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-flash,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
- `CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-flash-preview,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
||||||
|
|
||||||
- Google focus (Gemini API key + Antigravity):
|
- Google focus (Gemini API key + Antigravity):
|
||||||
- Gemini (API key): `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="google/gemini-3-flash" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
- Gemini (API key): `CLAWDBOT_LIVE_GATEWAY_MODELS="google/gemini-3-flash-preview" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
||||||
- Antigravity (OAuth): `LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-pro-high" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
- Antigravity (OAuth): `CLAWDBOT_LIVE_GATEWAY_MODELS="google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-pro-high" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
- `google/...` uses the Gemini API (API key).
|
- `google/...` uses the Gemini API (API key).
|
||||||
|
|
@ -240,20 +240,20 @@ This is the “common models” run we expect to keep working:
|
||||||
- OpenAI (non-Codex): `openai/gpt-5.2` (optional: `openai/gpt-5.1`)
|
- OpenAI (non-Codex): `openai/gpt-5.2` (optional: `openai/gpt-5.1`)
|
||||||
- OpenAI Codex: `openai-codex/gpt-5.2` (optional: `openai-codex/gpt-5.2-codex`)
|
- OpenAI Codex: `openai-codex/gpt-5.2` (optional: `openai-codex/gpt-5.2-codex`)
|
||||||
- Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`)
|
- Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`)
|
||||||
- Google (Gemini API): `google/gemini-3-pro` and `google/gemini-3-flash` (avoid older Gemini 2.x models)
|
- Google (Gemini API): `google/gemini-3-pro-preview` and `google/gemini-3-flash-preview` (avoid older Gemini 2.x models)
|
||||||
- Google (Antigravity): `google-antigravity/claude-opus-4-5-thinking` and `google-antigravity/gemini-3-flash`
|
- Google (Antigravity): `google-antigravity/claude-opus-4-5-thinking` and `google-antigravity/gemini-3-flash`
|
||||||
- Z.AI (GLM): `zai/glm-4.7`
|
- Z.AI (GLM): `zai/glm-4.7`
|
||||||
- MiniMax: `minimax/minimax-m2.1`
|
- MiniMax: `minimax/minimax-m2.1`
|
||||||
|
|
||||||
Run gateway smoke with tools + image:
|
Run gateway smoke with tools + image:
|
||||||
`LIVE=1 CLAWDBOT_LIVE_GATEWAY=1 CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE=1 CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1 CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-pro,google/gemini-3-flash,google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
`CLAWDBOT_LIVE_GATEWAY_MODELS="openai/gpt-5.2,openai-codex/gpt-5.2,anthropic/claude-opus-4-5,google/gemini-3-pro-preview,google/gemini-3-flash-preview,google-antigravity/claude-opus-4-5-thinking,google-antigravity/gemini-3-flash,zai/glm-4.7,minimax/minimax-m2.1" pnpm test:live src/gateway/gateway-models.profiles.live.test.ts`
|
||||||
|
|
||||||
### Baseline: tool calling (Read + optional Exec)
|
### Baseline: tool calling (Read + optional Exec)
|
||||||
|
|
||||||
Pick at least one per provider family:
|
Pick at least one per provider family:
|
||||||
- OpenAI: `openai/gpt-5.2` (or `openai/gpt-5-mini`)
|
- OpenAI: `openai/gpt-5.2` (or `openai/gpt-5-mini`)
|
||||||
- Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`)
|
- Anthropic: `anthropic/claude-opus-4-5` (or `anthropic/claude-sonnet-4-5`)
|
||||||
- Google: `google/gemini-3-flash` (or `google/gemini-3-pro`)
|
- Google: `google/gemini-3-flash-preview` (or `google/gemini-3-pro-preview`)
|
||||||
- Z.AI (GLM): `zai/glm-4.7`
|
- Z.AI (GLM): `zai/glm-4.7`
|
||||||
- MiniMax: `minimax/minimax-m2.1`
|
- MiniMax: `minimax/minimax-m2.1`
|
||||||
|
|
||||||
|
|
@ -265,7 +265,7 @@ Optional additional coverage (nice to have):
|
||||||
|
|
||||||
### Vision: image send (attachment → multimodal message)
|
### Vision: image send (attachment → multimodal message)
|
||||||
|
|
||||||
Run with `CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE=1` and include at least one image-capable model in `CLAWDBOT_LIVE_GATEWAY_MODELS` (Claude/Gemini/OpenAI vision-capable variants, etc.).
|
Include at least one image-capable model in `CLAWDBOT_LIVE_GATEWAY_MODELS` (Claude/Gemini/OpenAI vision-capable variants, etc.) to exercise the image probe.
|
||||||
|
|
||||||
### Aggregators / alternate gateways
|
### Aggregators / alternate gateways
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -97,7 +97,7 @@
|
||||||
"test:force": "tsx scripts/test-force.ts",
|
"test:force": "tsx scripts/test-force.ts",
|
||||||
"test:coverage": "vitest run --coverage",
|
"test:coverage": "vitest run --coverage",
|
||||||
"test:e2e": "vitest run --config vitest.e2e.config.ts",
|
"test:e2e": "vitest run --config vitest.e2e.config.ts",
|
||||||
"test:live": "vitest run --config vitest.live.config.ts",
|
"test:live": "CLAWDBOT_LIVE_TEST=1 vitest run --config vitest.live.config.ts",
|
||||||
"test:docker:onboard": "bash scripts/e2e/onboard-docker.sh",
|
"test:docker:onboard": "bash scripts/e2e/onboard-docker.sh",
|
||||||
"test:docker:gateway-network": "bash scripts/e2e/gateway-network-docker.sh",
|
"test:docker:gateway-network": "bash scripts/e2e/gateway-network-docker.sh",
|
||||||
"test:docker:live-models": "bash scripts/test-live-models-docker.sh",
|
"test:docker:live-models": "bash scripts/test-live-models-docker.sh",
|
||||||
|
|
|
||||||
|
|
@ -20,10 +20,6 @@ docker run --rm -t \
|
||||||
--entrypoint bash \
|
--entrypoint bash \
|
||||||
-e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \
|
-e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \
|
||||||
-e HOME=/home/node \
|
-e HOME=/home/node \
|
||||||
-e CLAWDBOT_LIVE_TEST=1 \
|
|
||||||
-e CLAWDBOT_LIVE_GATEWAY=1 \
|
|
||||||
-e CLAWDBOT_LIVE_GATEWAY_ALL_MODELS=1 \
|
|
||||||
-e CLAWDBOT_LIVE_GATEWAY_MODELS="${CLAWDBOT_LIVE_GATEWAY_MODELS:-all}" \
|
|
||||||
-v "$CONFIG_DIR":/home/node/.clawdbot \
|
-v "$CONFIG_DIR":/home/node/.clawdbot \
|
||||||
-v "$WORKSPACE_DIR":/home/node/clawd \
|
-v "$WORKSPACE_DIR":/home/node/clawd \
|
||||||
"${PROFILE_MOUNT[@]}" \
|
"${PROFILE_MOUNT[@]}" \
|
||||||
|
|
|
||||||
50
src/agents/live-auth-keys.ts
Normal file
50
src/agents/live-auth-keys.ts
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
// Keys may be separated by any run of whitespace, commas, or semicolons.
const KEY_SPLIT_RE = /[\s,;]+/g;

/**
 * Breaks a delimited string into its individual, non-empty entries.
 *
 * @param raw - Delimited text; `undefined`, `null`, and `""` all yield `[]`.
 * @returns The trimmed entries in their original order (duplicates are kept).
 */
function parseKeyList(raw?: string | null): string[] {
  if (!raw) return [];

  const entries: string[] = [];
  for (const piece of raw.split(KEY_SPLIT_RE)) {
    const entry = piece.trim();
    // Leading/trailing separators produce empty pieces; drop them.
    if (entry) entries.push(entry);
  }
  return entries;
}
|
||||||
|
|
||||||
|
/**
 * Gathers the values of every environment variable whose name begins with
 * `prefix`.
 *
 * @param prefix - Variable-name prefix to match (compared with `startsWith`).
 * @returns Trimmed, non-empty values in `process.env` enumeration order.
 */
function collectEnvPrefixedKeys(prefix: string): string[] {
  return Object.entries(process.env)
    .filter(([name]) => name.startsWith(prefix))
    .map(([, value]) => value?.trim())
    // Unset or whitespace-only variables contribute nothing.
    .filter((value): value is string => Boolean(value));
}
|
||||||
|
|
||||||
|
/**
 * Builds the ordered, de-duplicated list of Anthropic API keys available to
 * live tests.
 *
 * If `CLAWDBOT_LIVE_ANTHROPIC_KEY` is set (non-blank) it is returned alone and
 * every other source is ignored. Otherwise keys are taken, in this order, from
 * the `CLAWDBOT_LIVE_ANTHROPIC_KEYS` list, then `ANTHROPIC_API_KEY`, then any
 * environment variable whose name starts with `ANTHROPIC_API_KEY`.
 *
 * @returns Unique keys in first-seen order; empty when nothing is configured.
 */
export function collectAnthropicApiKeys(): string[] {
  const pinned = process.env.CLAWDBOT_LIVE_ANTHROPIC_KEY?.trim();
  if (pinned) return [pinned];

  const primary = process.env.ANTHROPIC_API_KEY?.trim();
  const candidates = [
    ...parseKeyList(process.env.CLAWDBOT_LIVE_ANTHROPIC_KEYS),
    // The plain ANTHROPIC_API_KEY goes ahead of its suffixed siblings.
    ...(primary ? [primary] : []),
    ...collectEnvPrefixedKeys("ANTHROPIC_API_KEY"),
  ];

  // A Set keeps insertion order, so the first occurrence of each key wins.
  return Array.from(new Set(candidates.filter(Boolean)));
}
|
||||||
|
|
||||||
|
/**
 * Heuristically decides whether an error message describes a rate-limit
 * failure.
 *
 * The check is case-insensitive and matches the phrases `rate_limit` and
 * `rate limit`, or a standalone `429` status code. The code must not be part of
 * a longer digit run, so identifiers that merely contain "429" (for example a
 * request id such as `req_0114295`) are not misclassified.
 *
 * @param message - Error text to inspect.
 * @returns `true` when the message looks like a rate-limit error.
 */
export function isAnthropicRateLimitError(message: string): boolean {
  const lower = message.toLowerCase();
  if (lower.includes("rate_limit") || lower.includes("rate limit")) return true;
  // Only digits are excluded as neighbours so forms like "status=429" or
  // "http429" still match.
  return /(?<!\d)429(?!\d)/.test(lower);
}
|
||||||
89
src/agents/live-model-filter.ts
Normal file
89
src/agents/live-model-filter.ts
Normal file
|
|
@ -0,0 +1,89 @@
|
||||||
|
/** Provider/model pair; either field may be missing or null. */
export type ModelRef = {
  provider?: string | null;
  id?: string | null;
};

// Allowlist data consulted by isModernModelRef. Entries are lowercase because
// ids are lowercased before comparison.

// Claude model id prefixes.
const ANTHROPIC_PREFIXES = [
  "claude-opus-4-5",
  "claude-sonnet-4-5",
  "claude-haiku-4-5",
];

// OpenAI model ids (an id starting with one of these also matches).
const OPENAI_MODELS = ["gpt-5.2", "gpt-5.0"];

// OpenAI Codex model ids (an id starting with one of these also matches).
const CODEX_MODELS = [
  "gpt-5.2",
  "gpt-5.2-codex",
  "gpt-5.1-codex",
  "gpt-5.1-codex-mini",
  "gpt-5.1-codex-max",
];

// Single-family prefixes for the remaining providers.
const GOOGLE_PREFIXES = ["gemini-3"];
const ZAI_PREFIXES = ["glm-4.7"];
const MINIMAX_PREFIXES = ["minimax-m2.1"];
const XAI_PREFIXES = ["grok-4"];
|
||||||
|
|
||||||
|
/**
 * Reports whether `id` begins with at least one of the given prefixes.
 *
 * @param id - Identifier to test.
 * @param prefixes - Candidate prefixes; an empty list never matches.
 * @returns `true` on the first prefix that `id` starts with.
 */
function matchesPrefix(id: string, prefixes: string[]): boolean {
  for (const candidate of prefixes) {
    if (id.startsWith(candidate)) return true;
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `id` equals, or begins with, at least one of `values`.
 *
 * @param id - Identifier to test.
 * @param values - Candidate ids; an empty list never matches.
 * @returns `true` when some value is an exact match or a prefix of `id`.
 */
function matchesExactOrPrefix(id: string, values: string[]): boolean {
  for (const candidate of values) {
    // An exact match is the degenerate prefix case, so one check covers both.
    if (id.startsWith(candidate)) return true;
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `id` contains at least one of `values` as a substring.
 *
 * @param id - Identifier to search.
 * @param values - Substrings to look for; an empty list never matches.
 * @returns `true` on the first value found anywhere inside `id`.
 */
function matchesAny(id: string, values: string[]): boolean {
  for (const needle of values) {
    if (id.includes(needle)) return true;
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a provider/model pair is on the "modern" model allowlist.
 *
 * Provider and id are trimmed and lowercased before comparison. Each known
 * provider is checked against its own list. `openrouter` and `opencode` match
 * when the id contains any allowlisted entry anywhere in the string. Unknown
 * providers, or a missing provider/id, never match.
 *
 * @param ref - Provider/model pair to classify.
 * @returns `true` when the pair is allowlisted.
 */
export function isModernModelRef(ref: ModelRef): boolean {
  const providerKey = ref.provider?.trim().toLowerCase() ?? "";
  const modelId = ref.id?.trim().toLowerCase() ?? "";
  if (providerKey === "" || modelId === "") return false;

  switch (providerKey) {
    case "anthropic":
      return matchesPrefix(modelId, ANTHROPIC_PREFIXES);

    case "openai":
      return matchesExactOrPrefix(modelId, OPENAI_MODELS);

    case "openai-codex":
      return matchesExactOrPrefix(modelId, CODEX_MODELS);

    case "google":
    case "google-gemini-cli":
      return matchesPrefix(modelId, GOOGLE_PREFIXES);

    case "google-antigravity":
      // This provider is checked against both the Gemini and Claude prefixes.
      return (
        matchesPrefix(modelId, GOOGLE_PREFIXES) ||
        matchesPrefix(modelId, ANTHROPIC_PREFIXES)
      );

    case "zai":
      return matchesPrefix(modelId, ZAI_PREFIXES);

    case "minimax":
      return matchesPrefix(modelId, MINIMAX_PREFIXES);

    case "xai":
      return matchesPrefix(modelId, XAI_PREFIXES);

    case "openrouter":
    case "opencode":
      // Substring match against every allowlisted entry, not a prefix match.
      return matchesAny(modelId, [
        ...ANTHROPIC_PREFIXES,
        ...OPENAI_MODELS,
        ...CODEX_MODELS,
        ...GOOGLE_PREFIXES,
        ...ZAI_PREFIXES,
        ...MINIMAX_PREFIXES,
        ...XAI_PREFIXES,
      ]);

    default:
      return false;
  }
}
|
||||||
|
|
@ -117,4 +117,59 @@ describe("models config", () => {
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("normalizes gemini 3 ids to preview for google providers", async () => {
  await withTempHome(async () => {
    // Reset the module registry before importing the modules under test.
    vi.resetModules();
    const { ensureClawdbotModelsJson } = await import("./models-config.js");
    const { resolveClawdbotAgentDir } = await import("./agent-paths.js");

    // Google provider configured with the bare (non-preview) Gemini 3 ids.
    const config: ClawdbotConfig = {
      models: {
        providers: {
          google: {
            baseUrl: "https://generativelanguage.googleapis.com/v1beta",
            apiKey: "GEMINI_KEY",
            api: "google-generative-ai",
            models: [
              {
                id: "gemini-3-pro",
                name: "Gemini 3 Pro",
                api: "google-generative-ai",
                reasoning: true,
                input: ["text", "image"],
                cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
                contextWindow: 1048576,
                maxTokens: 65536,
              },
              {
                id: "gemini-3-flash",
                name: "Gemini 3 Flash",
                api: "google-generative-ai",
                reasoning: false,
                input: ["text", "image"],
                cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
                contextWindow: 1048576,
                maxTokens: 65536,
              },
            ],
          },
        },
      },
    };

    await ensureClawdbotModelsJson(config);

    // Read back the written models.json and check only the model ids.
    const contents = await fs.readFile(
      path.join(resolveClawdbotAgentDir(), "models.json"),
      "utf8",
    );
    const written = JSON.parse(contents) as {
      providers: Record<string, { models: Array<{ id: string }> }>;
    };
    const googleIds = written.providers.google?.models?.map(
      (entry) => entry.id,
    );
    expect(googleIds).toEqual([
      "gemini-3-pro-preview",
      "gemini-3-flash-preview",
    ]);
  });
});
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ import { type ClawdbotConfig, loadConfig } from "../config/config.js";
|
||||||
import { resolveClawdbotAgentDir } from "./agent-paths.js";
|
import { resolveClawdbotAgentDir } from "./agent-paths.js";
|
||||||
|
|
||||||
type ModelsConfig = NonNullable<ClawdbotConfig["models"]>;
|
type ModelsConfig = NonNullable<ClawdbotConfig["models"]>;
|
||||||
|
type ProviderConfig = NonNullable<ModelsConfig["providers"]>[string];
|
||||||
|
|
||||||
const DEFAULT_MODE: NonNullable<ModelsConfig["mode"]> = "merge";
|
const DEFAULT_MODE: NonNullable<ModelsConfig["mode"]> = "merge";
|
||||||
|
|
||||||
|
|
@ -12,6 +13,38 @@ function isRecord(value: unknown): value is Record<string, unknown> {
|
||||||
return Boolean(value && typeof value === "object" && !Array.isArray(value));
|
return Boolean(value && typeof value === "object" && !Array.isArray(value));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Maps the bare Gemini 3 model ids to their `-preview` ids.
 *
 * @param id - Model id as configured.
 * @returns The `-preview` id for `gemini-3-pro` / `gemini-3-flash`; any other
 *   id is returned unchanged.
 */
function normalizeGoogleModelId(id: string): string {
  switch (id) {
    case "gemini-3-pro":
      return "gemini-3-pro-preview";
    case "gemini-3-flash":
      return "gemini-3-flash-preview";
    default:
      return id;
  }
}
|
||||||
|
|
||||||
|
/**
 * Returns a provider config whose model ids have been passed through
 * `normalizeGoogleModelId`.
 *
 * @param provider - Provider config to normalize; never mutated.
 * @returns The same `provider` object when no id changes, otherwise a shallow
 *   copy with a new `models` array. Unchanged models keep their identity.
 */
function normalizeGoogleProvider(provider: ProviderConfig): ProviderConfig {
  const needsRewrite = provider.models.some(
    (model) => normalizeGoogleModelId(model.id) !== model.id,
  );
  // Hand back the original reference so callers can detect "no change" by identity.
  if (!needsRewrite) return provider;

  return {
    ...provider,
    models: provider.models.map((model) => {
      const nextId = normalizeGoogleModelId(model.id);
      return nextId === model.id ? model : { ...model, id: nextId };
    }),
  };
}
|
||||||
|
|
||||||
|
/**
 * Applies provider-specific normalization to a providers map. Only the entry
 * keyed `google` is normalized; all other entries are passed through.
 *
 * @param providers - Providers map, or a nullish value (returned as-is).
 * @returns The same `providers` object when nothing changed, otherwise a new
 *   map with the same keys in the same order.
 */
function normalizeProviders(
  providers: ModelsConfig["providers"],
): ModelsConfig["providers"] {
  if (!providers) return providers;

  const entries = Object.entries(providers).map(
    ([key, provider]): [string, ProviderConfig] => [
      key,
      key === "google" ? normalizeGoogleProvider(provider) : provider,
    ],
  );

  // Identity comparison: a normalizer returns its input untouched when it has
  // nothing to rewrite.
  const changed = entries.some(
    ([key, provider]) => provider !== providers[key],
  );
  if (!changed) return providers;

  const next: Record<string, ProviderConfig> = {};
  for (const [key, provider] of entries) {
    next[key] = provider;
  }
  return next;
}
|
||||||
|
|
||||||
async function readJson(pathname: string): Promise<unknown> {
|
async function readJson(pathname: string): Promise<unknown> {
|
||||||
try {
|
try {
|
||||||
const raw = await fs.readFile(pathname, "utf8");
|
const raw = await fs.readFile(pathname, "utf8");
|
||||||
|
|
@ -53,7 +86,8 @@ export async function ensureClawdbotModelsJson(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const next = `${JSON.stringify({ providers: mergedProviders }, null, 2)}\n`;
|
const normalizedProviders = normalizeProviders(mergedProviders);
|
||||||
|
const next = `${JSON.stringify({ providers: normalizedProviders }, null, 2)}\n`;
|
||||||
try {
|
try {
|
||||||
existingRaw = await fs.readFile(targetPath, "utf8");
|
existingRaw = await fs.readFile(targetPath, "utf8");
|
||||||
} catch {
|
} catch {
|
||||||
|
|
|
||||||
|
|
@ -7,24 +7,20 @@ import { Type } from "@sinclair/typebox";
|
||||||
import { describe, expect, it } from "vitest";
|
import { describe, expect, it } from "vitest";
|
||||||
import { loadConfig } from "../config/config.js";
|
import { loadConfig } from "../config/config.js";
|
||||||
import { resolveClawdbotAgentDir } from "./agent-paths.js";
|
import { resolveClawdbotAgentDir } from "./agent-paths.js";
|
||||||
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
|
|
||||||
import { getApiKeyForModel } from "./model-auth.js";
|
|
||||||
import {
|
import {
|
||||||
buildModelAliasIndex,
|
collectAnthropicApiKeys,
|
||||||
parseModelRef,
|
isAnthropicRateLimitError,
|
||||||
resolveConfiguredModelRef,
|
} from "./live-auth-keys.js";
|
||||||
resolveModelRefFromString,
|
import { isModernModelRef } from "./live-model-filter.js";
|
||||||
} from "./model-selection.js";
|
import { getApiKeyForModel } from "./model-auth.js";
|
||||||
import { ensureClawdbotModelsJson } from "./models-config.js";
|
import { ensureClawdbotModelsJson } from "./models-config.js";
|
||||||
|
|
||||||
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
|
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
|
||||||
const ALL_MODELS =
|
const DIRECT_ENABLED = Boolean(process.env.CLAWDBOT_LIVE_MODELS?.trim());
|
||||||
process.env.CLAWDBOT_LIVE_ALL_MODELS === "1" ||
|
|
||||||
process.env.CLAWDBOT_LIVE_MODELS === "all";
|
|
||||||
const REQUIRE_PROFILE_KEYS =
|
const REQUIRE_PROFILE_KEYS =
|
||||||
process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1";
|
process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1";
|
||||||
|
|
||||||
const describeLive = LIVE && ALL_MODELS ? describe : describe.skip;
|
const describeLive = LIVE ? describe : describe.skip;
|
||||||
|
|
||||||
function parseProviderFilter(raw?: string): Set<string> | null {
|
function parseProviderFilter(raw?: string): Set<string> | null {
|
||||||
const trimmed = raw?.trim();
|
const trimmed = raw?.trim();
|
||||||
|
|
@ -46,6 +42,10 @@ function parseModelFilter(raw?: string): Set<string> | null {
|
||||||
return ids.length ? new Set(ids) : null;
|
return ids.length ? new Set(ids) : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Writes a progress line to stdout, prefixed with `[live] `.
 *
 * @param message - Text to print after the prefix.
 */
function logProgress(message: string): void {
  const line = `[live] ${message}`;
  console.log(line);
}
|
||||||
|
|
||||||
function isGoogleModelNotFoundError(err: unknown): boolean {
|
function isGoogleModelNotFoundError(err: unknown): boolean {
|
||||||
const msg = String(err);
|
const msg = String(err);
|
||||||
if (!/not found/i.test(msg)) return false;
|
if (!/not found/i.test(msg)) return false;
|
||||||
|
|
@ -127,75 +127,25 @@ async function completeOkWithRetry(params: {
|
||||||
return await runOnce();
|
return await runOnce();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Collect the `provider/model` keys referenced by the config, de-duplicated
 * and in first-seen order.
 *
 * Sources, in order:
 *   1. the ref returned by `resolveConfiguredModelRef` (with the default
 *      provider/model as fallbacks),
 *   2. `agents.defaults.model` (primary, then fallbacks),
 *   3. `agents.defaults.imageModel` (primary, then fallbacks),
 *   4. every key of `agents.defaults.models`.
 *
 * Raw strings that `resolveModelRefFromString` / `parseModelRef` cannot
 * resolve are silently dropped.
 *
 * @param cfg - Loaded config object.
 * @returns Ordered list of unique `provider/model` keys.
 */
function resolveConfiguredModelKeys(
  cfg: ReturnType<typeof loadConfig>,
): string[] {
  type ModelSlot = { primary?: string; fallbacks?: string[] } | undefined;

  const aliasIndex = buildModelAliasIndex({
    cfg,
    defaultProvider: DEFAULT_PROVIDER,
  });

  // A Set iterates in insertion order and ignores repeated adds, so it serves
  // as both the "seen" check and the ordered result.
  const keys = new Set<string>();

  const pushRef = (ref: { provider: string; model: string }): void => {
    const key = `${ref.provider}/${ref.model}`.trim();
    if (key) keys.add(key);
  };

  const pushRaw = (raw: string): void => {
    const hit = resolveModelRefFromString({
      raw,
      defaultProvider: DEFAULT_PROVIDER,
      aliasIndex,
    });
    if (hit) pushRef(hit.ref);
  };

  // Adds a slot's primary (when non-blank) followed by each of its fallbacks.
  const pushSlot = (slot: ModelSlot): void => {
    const primary = slot?.primary?.trim() ?? "";
    if (primary) pushRaw(primary);
    for (const fallback of slot?.fallbacks ?? []) {
      pushRaw(String(fallback ?? ""));
    }
  };

  pushRef(
    resolveConfiguredModelRef({
      cfg,
      defaultProvider: DEFAULT_PROVIDER,
      defaultModel: DEFAULT_MODEL,
    }),
  );

  pushSlot(cfg.agents?.defaults?.model as ModelSlot);
  pushSlot(cfg.agents?.defaults?.imageModel as ModelSlot);

  for (const rawKey of Object.keys(cfg.agents?.defaults?.models ?? {})) {
    const ref = parseModelRef(String(rawKey ?? ""), DEFAULT_PROVIDER);
    if (ref) pushRef(ref);
  }

  return [...keys];
}
|
|
||||||
|
|
||||||
describeLive("live models (profile keys)", () => {
|
describeLive("live models (profile keys)", () => {
|
||||||
it(
|
it(
|
||||||
"completes across configured models",
|
"completes across selected models",
|
||||||
async () => {
|
async () => {
|
||||||
const cfg = loadConfig();
|
const cfg = loadConfig();
|
||||||
await ensureClawdbotModelsJson(cfg);
|
await ensureClawdbotModelsJson(cfg);
|
||||||
|
if (!DIRECT_ENABLED) {
|
||||||
|
logProgress(
|
||||||
|
"[live-models] skipping (set CLAWDBOT_LIVE_MODELS=modern|all|<list>; all=modern)",
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const anthropicKeys = collectAnthropicApiKeys();
|
||||||
|
if (anthropicKeys.length > 0) {
|
||||||
|
process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
|
||||||
|
logProgress(
|
||||||
|
`[live-models] anthropic keys loaded: ${anthropicKeys.length}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
const agentDir = resolveClawdbotAgentDir();
|
const agentDir = resolveClawdbotAgentDir();
|
||||||
const authStorage = discoverAuthStorage(agentDir);
|
const authStorage = discoverAuthStorage(agentDir);
|
||||||
|
|
@ -205,7 +155,11 @@ describeLive("live models (profile keys)", () => {
|
||||||
models.map((model) => [`${model.provider}/${model.id}`, model]),
|
models.map((model) => [`${model.provider}/${model.id}`, model]),
|
||||||
);
|
);
|
||||||
|
|
||||||
const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);
|
const rawModels = process.env.CLAWDBOT_LIVE_MODELS?.trim();
|
||||||
|
const useModern = rawModels === "modern" || rawModels === "all";
|
||||||
|
const useExplicit = Boolean(rawModels) && !useModern;
|
||||||
|
const filter = useExplicit ? parseModelFilter(rawModels) : null;
|
||||||
|
const allowNotFoundSkip = useModern;
|
||||||
const providers = parseProviderFilter(
|
const providers = parseProviderFilter(
|
||||||
process.env.CLAWDBOT_LIVE_PROVIDERS,
|
process.env.CLAWDBOT_LIVE_PROVIDERS,
|
||||||
);
|
);
|
||||||
|
|
@ -216,149 +170,196 @@ describeLive("live models (profile keys)", () => {
|
||||||
|
|
||||||
const failures: Array<{ model: string; error: string }> = [];
|
const failures: Array<{ model: string; error: string }> = [];
|
||||||
const skipped: Array<{ model: string; reason: string }> = [];
|
const skipped: Array<{ model: string; reason: string }> = [];
|
||||||
|
const candidates: Array<{
|
||||||
|
model: Model<Api>;
|
||||||
|
apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
|
||||||
|
}> = [];
|
||||||
|
|
||||||
const configuredKeys = resolveConfiguredModelKeys(cfg);
|
for (const model of models) {
|
||||||
|
|
||||||
for (const key of configuredKeys) {
|
|
||||||
const model = modelByKey.get(key);
|
|
||||||
if (!model) {
|
|
||||||
skipped.push({
|
|
||||||
model: key,
|
|
||||||
reason: "configured model missing in registry",
|
|
||||||
});
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (providers && !providers.has(model.provider)) continue;
|
if (providers && !providers.has(model.provider)) continue;
|
||||||
const id = `${model.provider}/${model.id}`;
|
const id = `${model.provider}/${model.id}`;
|
||||||
if (filter && !filter.has(id)) continue;
|
if (filter && !filter.has(id)) continue;
|
||||||
|
if (!filter && useModern) {
|
||||||
let apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
|
if (!isModernModelRef({ provider: model.provider, id: model.id })) {
|
||||||
try {
|
|
||||||
apiKeyInfo = await getApiKeyForModel({ model, cfg });
|
|
||||||
} catch (err) {
|
|
||||||
skipped.push({ model: id, reason: String(err) });
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) {
|
|
||||||
skipped.push({
|
|
||||||
model: id,
|
|
||||||
reason: `non-profile credential source: ${apiKeyInfo.source}`,
|
|
||||||
});
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
// Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
|
|
||||||
if (
|
|
||||||
model.provider === "openai" &&
|
|
||||||
model.api === "openai-responses" &&
|
|
||||||
model.id === "gpt-5.2"
|
|
||||||
) {
|
|
||||||
const noopTool = {
|
|
||||||
name: "noop",
|
|
||||||
description: "Return ok.",
|
|
||||||
parameters: Type.Object({}, { additionalProperties: false }),
|
|
||||||
};
|
|
||||||
|
|
||||||
const first = await completeSimpleWithTimeout(
|
|
||||||
model,
|
|
||||||
{
|
|
||||||
messages: [
|
|
||||||
{
|
|
||||||
role: "user",
|
|
||||||
content:
|
|
||||||
"Call the tool `noop` with {}. Do not write any other text.",
|
|
||||||
timestamp: Date.now(),
|
|
||||||
},
|
|
||||||
],
|
|
||||||
tools: [noopTool],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
apiKey: apiKeyInfo.apiKey,
|
|
||||||
reasoning: model.reasoning ? "low" : undefined,
|
|
||||||
maxTokens: 128,
|
|
||||||
},
|
|
||||||
perModelTimeoutMs,
|
|
||||||
);
|
|
||||||
|
|
||||||
const toolCall = first.content.find((b) => b.type === "toolCall");
|
|
||||||
expect(toolCall).toBeTruthy();
|
|
||||||
if (!toolCall || toolCall.type !== "toolCall") {
|
|
||||||
throw new Error("expected tool call");
|
|
||||||
}
|
|
||||||
|
|
||||||
const second = await completeSimpleWithTimeout(
|
|
||||||
model,
|
|
||||||
{
|
|
||||||
messages: [
|
|
||||||
{
|
|
||||||
role: "user",
|
|
||||||
content:
|
|
||||||
"Call the tool `noop` with {}. Do not write any other text.",
|
|
||||||
timestamp: Date.now(),
|
|
||||||
},
|
|
||||||
first,
|
|
||||||
{
|
|
||||||
role: "toolResult",
|
|
||||||
toolCallId: toolCall.id,
|
|
||||||
toolName: "noop",
|
|
||||||
content: [{ type: "text", text: "ok" }],
|
|
||||||
isError: false,
|
|
||||||
timestamp: Date.now(),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
role: "user",
|
|
||||||
content: "Reply with the word ok.",
|
|
||||||
timestamp: Date.now(),
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
apiKey: apiKeyInfo.apiKey,
|
|
||||||
reasoning: model.reasoning ? "low" : undefined,
|
|
||||||
maxTokens: 64,
|
|
||||||
},
|
|
||||||
perModelTimeoutMs,
|
|
||||||
);
|
|
||||||
|
|
||||||
const secondText = second.content
|
|
||||||
.filter((b) => b.type === "text")
|
|
||||||
.map((b) => b.text.trim())
|
|
||||||
.join(" ");
|
|
||||||
expect(secondText.length).toBeGreaterThan(0);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
const ok = await completeOkWithRetry({
|
try {
|
||||||
model,
|
const apiKeyInfo = await getApiKeyForModel({ model, cfg });
|
||||||
apiKey: apiKeyInfo.apiKey,
|
if (
|
||||||
timeoutMs: perModelTimeoutMs,
|
REQUIRE_PROFILE_KEYS &&
|
||||||
});
|
!apiKeyInfo.source.startsWith("profile:")
|
||||||
|
) {
|
||||||
if (ok.res.stopReason === "error") {
|
|
||||||
const msg = ok.res.errorMessage ?? "";
|
|
||||||
if (ALL_MODELS && isModelNotFoundErrorMessage(msg)) {
|
|
||||||
skipped.push({ model: id, reason: msg });
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
throw new Error(msg || "model returned error with no message");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ok.text.length === 0 && model.provider === "google") {
|
|
||||||
skipped.push({
|
skipped.push({
|
||||||
model: id,
|
model: id,
|
||||||
reason: "no text returned (likely unavailable model id)",
|
reason: `non-profile credential source: ${apiKeyInfo.source}`,
|
||||||
});
|
});
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
expect(ok.text.length).toBeGreaterThan(0);
|
candidates.push({ model, apiKeyInfo });
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
|
skipped.push({ model: id, reason: String(err) });
|
||||||
skipped.push({ model: id, reason: String(err) });
|
}
|
||||||
continue;
|
}
|
||||||
|
|
||||||
|
if (candidates.length === 0) {
|
||||||
|
logProgress("[live-models] no API keys found; skipping");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
logProgress(
|
||||||
|
`[live-models] selection=${useExplicit ? "explicit" : "modern"}`,
|
||||||
|
);
|
||||||
|
logProgress(`[live-models] running ${candidates.length} models`);
|
||||||
|
const total = candidates.length;
|
||||||
|
|
||||||
|
for (const [index, entry] of candidates.entries()) {
|
||||||
|
const { model, apiKeyInfo } = entry;
|
||||||
|
const id = `${model.provider}/${model.id}`;
|
||||||
|
const progressLabel = `[live-models] ${index + 1}/${total} ${id}`;
|
||||||
|
const attemptMax =
|
||||||
|
model.provider === "anthropic" && anthropicKeys.length > 0
|
||||||
|
? anthropicKeys.length
|
||||||
|
: 1;
|
||||||
|
for (let attempt = 0; attempt < attemptMax; attempt += 1) {
|
||||||
|
if (model.provider === "anthropic" && anthropicKeys.length > 0) {
|
||||||
|
process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
|
||||||
|
}
|
||||||
|
const apiKey =
|
||||||
|
model.provider === "anthropic" && anthropicKeys.length > 0
|
||||||
|
? anthropicKeys[attempt]
|
||||||
|
: apiKeyInfo.apiKey;
|
||||||
|
try {
|
||||||
|
// Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
|
||||||
|
if (
|
||||||
|
model.provider === "openai" &&
|
||||||
|
model.api === "openai-responses" &&
|
||||||
|
model.id === "gpt-5.2"
|
||||||
|
) {
|
||||||
|
logProgress(`${progressLabel}: tool-only regression`);
|
||||||
|
const noopTool = {
|
||||||
|
name: "noop",
|
||||||
|
description: "Return ok.",
|
||||||
|
parameters: Type.Object({}, { additionalProperties: false }),
|
||||||
|
};
|
||||||
|
|
||||||
|
const first = await completeSimpleWithTimeout(
|
||||||
|
model,
|
||||||
|
{
|
||||||
|
messages: [
|
||||||
|
{
|
||||||
|
role: "user",
|
||||||
|
content:
|
||||||
|
"Call the tool `noop` with {}. Do not write any other text.",
|
||||||
|
timestamp: Date.now(),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
tools: [noopTool],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
apiKey,
|
||||||
|
reasoning: model.reasoning ? "low" : undefined,
|
||||||
|
maxTokens: 128,
|
||||||
|
},
|
||||||
|
perModelTimeoutMs,
|
||||||
|
);
|
||||||
|
|
||||||
|
const toolCall = first.content.find((b) => b.type === "toolCall");
|
||||||
|
expect(toolCall).toBeTruthy();
|
||||||
|
if (!toolCall || toolCall.type !== "toolCall") {
|
||||||
|
throw new Error("expected tool call");
|
||||||
|
}
|
||||||
|
|
||||||
|
const second = await completeSimpleWithTimeout(
|
||||||
|
model,
|
||||||
|
{
|
||||||
|
messages: [
|
||||||
|
{
|
||||||
|
role: "user",
|
||||||
|
content:
|
||||||
|
"Call the tool `noop` with {}. Do not write any other text.",
|
||||||
|
timestamp: Date.now(),
|
||||||
|
},
|
||||||
|
first,
|
||||||
|
{
|
||||||
|
role: "toolResult",
|
||||||
|
toolCallId: toolCall.id,
|
||||||
|
toolName: "noop",
|
||||||
|
content: [{ type: "text", text: "ok" }],
|
||||||
|
isError: false,
|
||||||
|
timestamp: Date.now(),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
role: "user",
|
||||||
|
content: "Reply with the word ok.",
|
||||||
|
timestamp: Date.now(),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
apiKey,
|
||||||
|
reasoning: model.reasoning ? "low" : undefined,
|
||||||
|
maxTokens: 64,
|
||||||
|
},
|
||||||
|
perModelTimeoutMs,
|
||||||
|
);
|
||||||
|
|
||||||
|
const secondText = second.content
|
||||||
|
.filter((b) => b.type === "text")
|
||||||
|
.map((b) => b.text.trim())
|
||||||
|
.join(" ");
|
||||||
|
expect(secondText.length).toBeGreaterThan(0);
|
||||||
|
logProgress(`${progressLabel}: done`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
logProgress(`${progressLabel}: prompt`);
|
||||||
|
const ok = await completeOkWithRetry({
|
||||||
|
model,
|
||||||
|
apiKey,
|
||||||
|
timeoutMs: perModelTimeoutMs,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (ok.res.stopReason === "error") {
|
||||||
|
const msg = ok.res.errorMessage ?? "";
|
||||||
|
if (allowNotFoundSkip && isModelNotFoundErrorMessage(msg)) {
|
||||||
|
skipped.push({ model: id, reason: msg });
|
||||||
|
logProgress(`${progressLabel}: skip (model not found)`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
throw new Error(msg || "model returned error with no message");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ok.text.length === 0 && model.provider === "google") {
|
||||||
|
skipped.push({
|
||||||
|
model: id,
|
||||||
|
reason: "no text returned (likely unavailable model id)",
|
||||||
|
});
|
||||||
|
logProgress(`${progressLabel}: skip (google model not found)`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
expect(ok.text.length).toBeGreaterThan(0);
|
||||||
|
logProgress(`${progressLabel}: done`);
|
||||||
|
break;
|
||||||
|
} catch (err) {
|
||||||
|
const message = String(err);
|
||||||
|
if (
|
||||||
|
model.provider === "anthropic" &&
|
||||||
|
isAnthropicRateLimitError(message) &&
|
||||||
|
attempt + 1 < attemptMax
|
||||||
|
) {
|
||||||
|
logProgress(`${progressLabel}: rate limit, retrying with next key`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
|
||||||
|
skipped.push({ model: id, reason: message });
|
||||||
|
logProgress(`${progressLabel}: skip (google model not found)`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
logProgress(`${progressLabel}: failed`);
|
||||||
|
failures.push({ model: id, error: message });
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
failures.push({ model: id, error: String(err) });
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -372,8 +373,6 @@ describeLive("live models (profile keys)", () => {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keep one assertion so the test fails loudly if we somehow ran nothing.
|
|
||||||
expect(models.length).toBeGreaterThan(0);
|
|
||||||
void skipped;
|
void skipped;
|
||||||
},
|
},
|
||||||
15 * 60 * 1000,
|
15 * 60 * 1000,
|
||||||
|
|
|
||||||
|
|
@ -11,9 +11,15 @@ import {
|
||||||
} from "@mariozechner/pi-coding-agent";
|
} from "@mariozechner/pi-coding-agent";
|
||||||
import { describe, expect, it } from "vitest";
|
import { describe, expect, it } from "vitest";
|
||||||
import { resolveClawdbotAgentDir } from "../agents/agent-paths.js";
|
import { resolveClawdbotAgentDir } from "../agents/agent-paths.js";
|
||||||
|
import {
|
||||||
|
collectAnthropicApiKeys,
|
||||||
|
isAnthropicRateLimitError,
|
||||||
|
} from "../agents/live-auth-keys.js";
|
||||||
|
import { isModernModelRef } from "../agents/live-model-filter.js";
|
||||||
import { getApiKeyForModel } from "../agents/model-auth.js";
|
import { getApiKeyForModel } from "../agents/model-auth.js";
|
||||||
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
|
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
|
||||||
import { loadConfig } from "../config/config.js";
|
import { loadConfig } from "../config/config.js";
|
||||||
|
import type { ClawdbotConfig, ModelProviderConfig } from "../config/types.js";
|
||||||
import {
|
import {
|
||||||
GATEWAY_CLIENT_MODES,
|
GATEWAY_CLIENT_MODES,
|
||||||
GATEWAY_CLIENT_NAMES,
|
GATEWAY_CLIENT_NAMES,
|
||||||
|
|
@ -25,16 +31,14 @@ import { startGatewayServer } from "./server.js";
|
||||||
|
|
||||||
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
|
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
|
||||||
const GATEWAY_LIVE = process.env.CLAWDBOT_LIVE_GATEWAY === "1";
|
const GATEWAY_LIVE = process.env.CLAWDBOT_LIVE_GATEWAY === "1";
|
||||||
const ALL_MODELS =
|
|
||||||
process.env.CLAWDBOT_LIVE_GATEWAY_ALL_MODELS === "1" ||
|
|
||||||
process.env.CLAWDBOT_LIVE_GATEWAY_MODELS === "all";
|
|
||||||
const EXTRA_TOOL_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE === "1";
|
|
||||||
const EXTRA_IMAGE_PROBES =
|
|
||||||
process.env.CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE === "1";
|
|
||||||
const ZAI_FALLBACK = process.env.CLAWDBOT_LIVE_GATEWAY_ZAI_FALLBACK === "1";
|
const ZAI_FALLBACK = process.env.CLAWDBOT_LIVE_GATEWAY_ZAI_FALLBACK === "1";
|
||||||
const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS);
|
const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS);
|
||||||
|
const THINKING_LEVEL = "high";
|
||||||
|
const THINKING_TAG_RE =
|
||||||
|
/<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
|
||||||
|
const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
|
||||||
|
|
||||||
const describeLive = LIVE && GATEWAY_LIVE ? describe : describe.skip;
|
const describeLive = LIVE || GATEWAY_LIVE ? describe : describe.skip;
|
||||||
|
|
||||||
function parseFilter(raw?: string): Set<string> | null {
|
function parseFilter(raw?: string): Set<string> | null {
|
||||||
const trimmed = raw?.trim();
|
const trimmed = raw?.trim();
|
||||||
|
|
@ -46,6 +50,26 @@ function parseFilter(raw?: string): Set<string> | null {
|
||||||
return ids.length ? new Set(ids) : null;
|
return ids.length ? new Set(ids) : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Emit a live-test progress message through `console.log`, tagged with a
 * `[live]` prefix.
 *
 * @param message - Text to print after the prefix.
 */
function logProgress(message: string): void {
  const line = "[live] " + message;
  console.log(line);
}
|
||||||
|
|
||||||
|
/**
 * Throw if `text` contains a tag matched by `THINKING_TAG_RE` or
 * `FINAL_TAG_RE`.
 *
 * Empty text is accepted without checking. The error message carries the
 * label, model and phase, plus the offending text truncated to its first
 * 200 characters (followed by an ellipsis when truncated).
 *
 * @param params.text  - Reply text to inspect.
 * @param params.model - Model key included in the error message.
 * @param params.phase - Probe phase name included in the error message.
 * @param params.label - Suite label included in the error message.
 * @throws Error when a matching tag is found.
 */
function assertNoReasoningTags(params: {
  text: string;
  model: string;
  phase: string;
  label: string;
}): void {
  const { text, model, phase, label } = params;
  if (!text) return;

  const leaked = THINKING_TAG_RE.test(text) || FINAL_TAG_RE.test(text);
  if (!leaked) return;

  // Keep the error readable: show at most the first 200 characters.
  const snippet = text.length > 200 ? `${text.slice(0, 200)}…` : text;
  throw new Error(
    `[${label}] reasoning tag leak (${model} / ${phase}): ${snippet}`,
  );
}
|
||||||
|
|
||||||
function extractPayloadText(result: unknown): string {
|
function extractPayloadText(result: unknown): string {
|
||||||
const record = result as Record<string, unknown>;
|
const record = result as Record<string, unknown>;
|
||||||
const payloads = Array.isArray(record.payloads) ? record.payloads : [];
|
const payloads = Array.isArray(record.payloads) ? record.payloads : [];
|
||||||
|
|
@ -200,61 +224,470 @@ async function connectClient(params: { url: string; token: string }) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Inputs for one run of the gateway live-model suite. */
type GatewayModelSuiteParams = {
  /** Suite name; appears in progress log lines and in the session key. */
  label: string;
  /** Base config from which the temporary live-test config is derived. */
  cfg: ClawdbotConfig;
  /** Models to exercise, one after another. */
  candidates: Model<Api>[];
  /** When true, also run the exec+read tool probe for each model. */
  extraToolProbes: boolean;
  /** When true, also run the image probe for models whose input includes "image". */
  extraImageProbes: boolean;
  /** Value sent as `thinking` on every agent request. */
  thinkingLevel: string;
  /** Optional provider entries forwarded to `buildLiveGatewayConfig`. */
  providerOverrides?: Record<string, ModelProviderConfig>;
};
|
||||||
|
|
||||||
|
/**
 * Derive the config used for a gateway live-test run from the base config.
 *
 * Changes relative to `params.cfg`:
 *   - sandbox mode is set to "off" on `agents.defaults` and on every entry
 *     of `agents.list`;
 *   - `agents.defaults.models` is replaced by one empty entry per candidate,
 *     keyed `provider/id`;
 *   - `models.providers` becomes the base providers, with an existing
 *     `lmstudio` provider forced to the "openai-completions" api, then
 *     overlaid with `params.providerOverrides`. When the merged provider map
 *     is empty, `models` is passed through untouched.
 *
 * @param params.cfg               - Base config; not mutated.
 * @param params.candidates        - Models to list under `agents.defaults.models`.
 * @param params.providerOverrides - Optional provider entries that win over the base.
 * @returns A new config object.
 */
function buildLiveGatewayConfig(params: {
  cfg: ClawdbotConfig;
  candidates: Array<Model<Api>>;
  providerOverrides?: Record<string, ModelProviderConfig>;
}): ClawdbotConfig {
  const { cfg, candidates, providerOverrides } = params;

  const configuredProviders = cfg.models?.providers ?? {};
  const lmstudio = cfg.models?.providers?.lmstudio;
  const mergedProviders = {
    ...configuredProviders,
    ...(lmstudio
      ? { lmstudio: { ...lmstudio, api: "openai-completions" } }
      : {}),
    ...(providerOverrides ?? {}),
  };
  const hasProviders = Object.keys(mergedProviders).length > 0;

  return {
    ...cfg,
    agents: {
      ...cfg.agents,
      list: (cfg.agents?.list ?? []).map((agent) => ({
        ...agent,
        sandbox: { mode: "off" },
      })),
      defaults: {
        ...cfg.agents?.defaults,
        // Keep Docker sandboxing off in live runs: the tool probes need to
        // reach the temporary probe files created in the host workspace.
        sandbox: { mode: "off" },
        models: Object.fromEntries(
          candidates.map((candidate) => [
            `${candidate.provider}/${candidate.id}`,
            {},
          ]),
        ),
      },
    },
    models: hasProviders
      ? { ...cfg.models, providers: mergedProviders }
      : cfg.models,
  };
}
|
||||||
|
|
||||||
|
/**
 * Build a `minimax` provider entry that points at a different api/base URL.
 *
 * @param params.cfg     - Config whose `models.providers.minimax` is the template.
 * @param params.api     - Api identifier to set on the override.
 * @param params.baseUrl - Base URL to set on the override.
 * @returns A copy of the existing minimax provider with `api` and `baseUrl`
 *          replaced, or `null` when no minimax provider is configured or its
 *          `models` is not a non-empty array.
 */
function buildMinimaxProviderOverride(params: {
  cfg: ClawdbotConfig;
  api: "openai-completions" | "anthropic-messages";
  baseUrl: string;
}): ModelProviderConfig | null {
  const minimax = params.cfg.models?.providers?.minimax;
  if (!minimax) return null;

  const { models } = minimax;
  if (!Array.isArray(models) || models.length === 0) return null;

  return { ...minimax, api: params.api, baseUrl: params.baseUrl };
}
|
||||||
|
|
||||||
|
async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
||||||
|
const previous = {
|
||||||
|
configPath: process.env.CLAWDBOT_CONFIG_PATH,
|
||||||
|
token: process.env.CLAWDBOT_GATEWAY_TOKEN,
|
||||||
|
skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS,
|
||||||
|
skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
|
||||||
|
skipCron: process.env.CLAWDBOT_SKIP_CRON,
|
||||||
|
skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
|
||||||
|
};
|
||||||
|
|
||||||
|
process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
|
||||||
|
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
|
||||||
|
process.env.CLAWDBOT_SKIP_CRON = "1";
|
||||||
|
process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
|
||||||
|
|
||||||
|
const token = `test-${randomUUID()}`;
|
||||||
|
process.env.CLAWDBOT_GATEWAY_TOKEN = token;
|
||||||
|
|
||||||
|
const workspaceDir = resolveUserPath(
|
||||||
|
params.cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
|
||||||
|
);
|
||||||
|
await fs.mkdir(workspaceDir, { recursive: true });
|
||||||
|
const nonceA = randomUUID();
|
||||||
|
const nonceB = randomUUID();
|
||||||
|
const toolProbePath = path.join(
|
||||||
|
workspaceDir,
|
||||||
|
`.clawdbot-live-tool-probe.${nonceA}.txt`,
|
||||||
|
);
|
||||||
|
await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
|
||||||
|
|
||||||
|
const nextCfg = buildLiveGatewayConfig({
|
||||||
|
cfg: params.cfg,
|
||||||
|
candidates: params.candidates,
|
||||||
|
providerOverrides: params.providerOverrides,
|
||||||
|
});
|
||||||
|
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-live-"));
|
||||||
|
const tempConfigPath = path.join(tempDir, "clawdbot.json");
|
||||||
|
await fs.writeFile(tempConfigPath, `${JSON.stringify(nextCfg, null, 2)}\n`);
|
||||||
|
process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
|
||||||
|
|
||||||
|
await ensureClawdbotModelsJson(nextCfg);
|
||||||
|
|
||||||
|
const port = await getFreeGatewayPort();
|
||||||
|
const server = await startGatewayServer(port, {
|
||||||
|
bind: "loopback",
|
||||||
|
auth: { mode: "token", token },
|
||||||
|
controlUiEnabled: false,
|
||||||
|
});
|
||||||
|
|
||||||
|
const client = await connectClient({
|
||||||
|
url: `ws://127.0.0.1:${port}`,
|
||||||
|
token,
|
||||||
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
logProgress(
|
||||||
|
`[${params.label}] running ${params.candidates.length} models (thinking=${params.thinkingLevel})`,
|
||||||
|
);
|
||||||
|
const anthropicKeys = collectAnthropicApiKeys();
|
||||||
|
if (anthropicKeys.length > 0) {
|
||||||
|
process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
|
||||||
|
logProgress(`[${params.label}] anthropic keys loaded: ${anthropicKeys.length}`);
|
||||||
|
}
|
||||||
|
const sessionKey = `agent:dev:${params.label}`;
|
||||||
|
const failures: Array<{ model: string; error: string }> = [];
|
||||||
|
const total = params.candidates.length;
|
||||||
|
|
||||||
|
for (const [index, model] of params.candidates.entries()) {
|
||||||
|
const modelKey = `${model.provider}/${model.id}`;
|
||||||
|
const progressLabel = `[${params.label}] ${index + 1}/${total} ${modelKey}`;
|
||||||
|
|
||||||
|
const attemptMax =
|
||||||
|
model.provider === "anthropic" && anthropicKeys.length > 0
|
||||||
|
? anthropicKeys.length
|
||||||
|
: 1;
|
||||||
|
|
||||||
|
for (let attempt = 0; attempt < attemptMax; attempt += 1) {
|
||||||
|
if (model.provider === "anthropic" && anthropicKeys.length > 0) {
|
||||||
|
process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
// Ensure session exists + override model for this run.
|
||||||
|
await client.request<Record<string, unknown>>("sessions.patch", {
|
||||||
|
key: sessionKey,
|
||||||
|
model: modelKey,
|
||||||
|
});
|
||||||
|
// Reset between models: avoids cross-provider transcript incompatibilities
|
||||||
|
// (notably OpenAI Responses requiring reasoning replay for function_call items).
|
||||||
|
await client.request<Record<string, unknown>>("sessions.reset", {
|
||||||
|
key: sessionKey,
|
||||||
|
});
|
||||||
|
|
||||||
|
logProgress(`${progressLabel}: prompt`);
|
||||||
|
const runId = randomUUID();
|
||||||
|
const payload = await client.request<AgentFinalPayload>(
|
||||||
|
"agent",
|
||||||
|
{
|
||||||
|
sessionKey,
|
||||||
|
idempotencyKey: `idem-${runId}`,
|
||||||
|
message:
|
||||||
|
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
|
||||||
|
thinking: params.thinkingLevel,
|
||||||
|
deliver: false,
|
||||||
|
},
|
||||||
|
{ expectFinal: true },
|
||||||
|
);
|
||||||
|
|
||||||
|
if (payload?.status !== "ok") {
|
||||||
|
throw new Error(`agent status=${String(payload?.status)}`);
|
||||||
|
}
|
||||||
|
const text = extractPayloadText(payload?.result);
|
||||||
|
if (model.provider === "google" && isGoogleModelNotFoundText(text)) {
|
||||||
|
// Catalog drift: model IDs can disappear or become unavailable on the API.
|
||||||
|
// Treat as skip when scanning "all models" for Google.
|
||||||
|
logProgress(`${progressLabel}: skip (google model not found)`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
assertNoReasoningTags({
|
||||||
|
text,
|
||||||
|
model: modelKey,
|
||||||
|
phase: "prompt",
|
||||||
|
label: params.label,
|
||||||
|
});
|
||||||
|
if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
|
||||||
|
if (
|
||||||
|
!/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
|
||||||
|
!/\bmacro\s*-?\s*tasks?\b/i.test(text)
|
||||||
|
) {
|
||||||
|
throw new Error(`missing required keywords: ${text}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Real tool invocation: force the agent to Read a local file and echo a nonce.
|
||||||
|
logProgress(`${progressLabel}: tool-read`);
|
||||||
|
const runIdTool = randomUUID();
|
||||||
|
const toolProbe = await client.request<AgentFinalPayload>(
|
||||||
|
"agent",
|
||||||
|
{
|
||||||
|
sessionKey,
|
||||||
|
idempotencyKey: `idem-${runIdTool}-tool`,
|
||||||
|
message:
|
||||||
|
"Clawdbot live tool probe (local, safe): " +
|
||||||
|
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
|
||||||
|
"Then reply with the two nonce values you read (include both).",
|
||||||
|
thinking: params.thinkingLevel,
|
||||||
|
deliver: false,
|
||||||
|
},
|
||||||
|
{ expectFinal: true },
|
||||||
|
);
|
||||||
|
if (toolProbe?.status !== "ok") {
|
||||||
|
throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`);
|
||||||
|
}
|
||||||
|
const toolText = extractPayloadText(toolProbe?.result);
|
||||||
|
assertNoReasoningTags({
|
||||||
|
text: toolText,
|
||||||
|
model: modelKey,
|
||||||
|
phase: "tool-read",
|
||||||
|
label: params.label,
|
||||||
|
});
|
||||||
|
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
|
||||||
|
throw new Error(`tool probe missing nonce: ${toolText}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.extraToolProbes) {
|
||||||
|
logProgress(`${progressLabel}: tool-exec`);
|
||||||
|
const nonceC = randomUUID();
|
||||||
|
const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);
|
||||||
|
|
||||||
|
const execReadProbe = await client.request<AgentFinalPayload>(
|
||||||
|
"agent",
|
||||||
|
{
|
||||||
|
sessionKey,
|
||||||
|
idempotencyKey: `idem-${runIdTool}-exec-read`,
|
||||||
|
message:
|
||||||
|
"Clawdbot live tool probe (local, safe): " +
|
||||||
|
"use the tool named `exec` (or `Exec`) to run this command: " +
|
||||||
|
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
|
||||||
|
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
|
||||||
|
"Finally reply including the nonce text you read back.",
|
||||||
|
thinking: params.thinkingLevel,
|
||||||
|
deliver: false,
|
||||||
|
},
|
||||||
|
{ expectFinal: true },
|
||||||
|
);
|
||||||
|
if (execReadProbe?.status !== "ok") {
|
||||||
|
throw new Error(
|
||||||
|
`exec+read probe failed: status=${String(execReadProbe?.status)}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
const execReadText = extractPayloadText(execReadProbe?.result);
|
||||||
|
assertNoReasoningTags({
|
||||||
|
text: execReadText,
|
||||||
|
model: modelKey,
|
||||||
|
phase: "tool-exec",
|
||||||
|
label: params.label,
|
||||||
|
});
|
||||||
|
if (!execReadText.includes(nonceC)) {
|
||||||
|
throw new Error(`exec+read probe missing nonce: ${execReadText}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
await fs.rm(toolWritePath, { force: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.extraImageProbes && model.input?.includes("image")) {
|
||||||
|
logProgress(`${progressLabel}: image`);
|
||||||
|
const imageCode = randomImageProbeCode(10);
|
||||||
|
const imageBase64 = renderCatNoncePngBase64(imageCode);
|
||||||
|
const runIdImage = randomUUID();
|
||||||
|
|
||||||
|
const imageProbe = await client.request<AgentFinalPayload>(
|
||||||
|
"agent",
|
||||||
|
{
|
||||||
|
sessionKey,
|
||||||
|
idempotencyKey: `idem-${runIdImage}-image`,
|
||||||
|
message:
|
||||||
|
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
|
||||||
|
"(1) the animal shown or written in the image, lowercase; " +
|
||||||
|
"(2) the code printed in the image, uppercase. No extra text.",
|
||||||
|
attachments: [
|
||||||
|
{
|
||||||
|
mimeType: "image/png",
|
||||||
|
fileName: `probe-${runIdImage}.png`,
|
||||||
|
content: imageBase64,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
thinking: params.thinkingLevel,
|
||||||
|
deliver: false,
|
||||||
|
},
|
||||||
|
{ expectFinal: true },
|
||||||
|
);
|
||||||
|
if (imageProbe?.status !== "ok") {
|
||||||
|
throw new Error(
|
||||||
|
`image probe failed: status=${String(imageProbe?.status)}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
const imageText = extractPayloadText(imageProbe?.result);
|
||||||
|
assertNoReasoningTags({
|
||||||
|
text: imageText,
|
||||||
|
model: modelKey,
|
||||||
|
phase: "image",
|
||||||
|
label: params.label,
|
||||||
|
});
|
||||||
|
if (!/\bcat\b/i.test(imageText)) {
|
||||||
|
throw new Error(`image probe missing 'cat': ${imageText}`);
|
||||||
|
}
|
||||||
|
const candidates =
|
||||||
|
imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
|
||||||
|
const bestDistance = candidates.reduce((best, cand) => {
|
||||||
|
if (Math.abs(cand.length - imageCode.length) > 2) return best;
|
||||||
|
return Math.min(best, editDistance(cand, imageCode));
|
||||||
|
}, Number.POSITIVE_INFINITY);
|
||||||
|
if (!(bestDistance <= 2)) {
|
||||||
|
throw new Error(
|
||||||
|
`image probe missing code (${imageCode}): ${imageText}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
|
||||||
|
if (
|
||||||
|
(model.provider === "openai" && model.api === "openai-responses") ||
|
||||||
|
(model.provider === "openai-codex" &&
|
||||||
|
model.api === "openai-codex-responses")
|
||||||
|
) {
|
||||||
|
logProgress(`${progressLabel}: tool-only regression`);
|
||||||
|
const runId2 = randomUUID();
|
||||||
|
const first = await client.request<AgentFinalPayload>(
|
||||||
|
"agent",
|
||||||
|
{
|
||||||
|
sessionKey,
|
||||||
|
idempotencyKey: `idem-${runId2}-1`,
|
||||||
|
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
|
||||||
|
thinking: params.thinkingLevel,
|
||||||
|
deliver: false,
|
||||||
|
},
|
||||||
|
{ expectFinal: true },
|
||||||
|
);
|
||||||
|
if (first?.status !== "ok") {
|
||||||
|
throw new Error(`tool-only turn failed: status=${String(first?.status)}`);
|
||||||
|
}
|
||||||
|
const firstText = extractPayloadText(first?.result);
|
||||||
|
assertNoReasoningTags({
|
||||||
|
text: firstText,
|
||||||
|
model: modelKey,
|
||||||
|
phase: "tool-only",
|
||||||
|
label: params.label,
|
||||||
|
});
|
||||||
|
|
||||||
|
const second = await client.request<AgentFinalPayload>(
|
||||||
|
"agent",
|
||||||
|
{
|
||||||
|
sessionKey,
|
||||||
|
idempotencyKey: `idem-${runId2}-2`,
|
||||||
|
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
|
||||||
|
thinking: params.thinkingLevel,
|
||||||
|
deliver: false,
|
||||||
|
},
|
||||||
|
{ expectFinal: true },
|
||||||
|
);
|
||||||
|
if (second?.status !== "ok") {
|
||||||
|
throw new Error(
|
||||||
|
`post-tool message failed: status=${String(second?.status)}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
const reply = extractPayloadText(second?.result);
|
||||||
|
assertNoReasoningTags({
|
||||||
|
text: reply,
|
||||||
|
model: modelKey,
|
||||||
|
phase: "tool-only-followup",
|
||||||
|
label: params.label,
|
||||||
|
});
|
||||||
|
if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
|
||||||
|
throw new Error(`unexpected reply: ${reply}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
logProgress(`${progressLabel}: done`);
|
||||||
|
break;
|
||||||
|
} catch (err) {
|
||||||
|
const message = String(err);
|
||||||
|
if (
|
||||||
|
model.provider === "anthropic" &&
|
||||||
|
isAnthropicRateLimitError(message) &&
|
||||||
|
attempt + 1 < attemptMax
|
||||||
|
) {
|
||||||
|
logProgress(`${progressLabel}: rate limit, retrying with next key`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
|
||||||
|
if (
|
||||||
|
model.provider === "openai-codex" &&
|
||||||
|
isRefreshTokenReused(message)
|
||||||
|
) {
|
||||||
|
logProgress(`${progressLabel}: skip (codex refresh token reused)`);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
logProgress(`${progressLabel}: failed`);
|
||||||
|
failures.push({ model: modelKey, error: message });
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (failures.length > 0) {
|
||||||
|
const preview = failures
|
||||||
|
.slice(0, 20)
|
||||||
|
.map((f) => `- ${f.model}: ${f.error}`)
|
||||||
|
.join("\n");
|
||||||
|
throw new Error(
|
||||||
|
`gateway live model failures (${failures.length}):\n${preview}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
client.stop();
|
||||||
|
await server.close({ reason: "live test complete" });
|
||||||
|
await fs.rm(toolProbePath, { force: true });
|
||||||
|
await fs.rm(tempDir, { recursive: true, force: true });
|
||||||
|
|
||||||
|
process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
|
||||||
|
process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
|
||||||
|
process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders;
|
||||||
|
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
|
||||||
|
process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
|
||||||
|
process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
describeLive("gateway live (dev agent, profile keys)", () => {
|
describeLive("gateway live (dev agent, profile keys)", () => {
|
||||||
it(
|
it(
|
||||||
"runs meaningful prompts across models with available keys",
|
"runs meaningful prompts across models with available keys",
|
||||||
async () => {
|
async () => {
|
||||||
const previous = {
|
|
||||||
configPath: process.env.CLAWDBOT_CONFIG_PATH,
|
|
||||||
token: process.env.CLAWDBOT_GATEWAY_TOKEN,
|
|
||||||
skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS,
|
|
||||||
skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
|
|
||||||
skipCron: process.env.CLAWDBOT_SKIP_CRON,
|
|
||||||
skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
|
|
||||||
};
|
|
||||||
|
|
||||||
process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
|
|
||||||
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
|
|
||||||
process.env.CLAWDBOT_SKIP_CRON = "1";
|
|
||||||
process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
|
|
||||||
|
|
||||||
const token = `test-${randomUUID()}`;
|
|
||||||
process.env.CLAWDBOT_GATEWAY_TOKEN = token;
|
|
||||||
|
|
||||||
const cfg = loadConfig();
|
const cfg = loadConfig();
|
||||||
await ensureClawdbotModelsJson(cfg);
|
await ensureClawdbotModelsJson(cfg);
|
||||||
|
|
||||||
const workspaceDir = resolveUserPath(
|
|
||||||
cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
|
|
||||||
);
|
|
||||||
await fs.mkdir(workspaceDir, { recursive: true });
|
|
||||||
const nonceA = randomUUID();
|
|
||||||
const nonceB = randomUUID();
|
|
||||||
const toolProbePath = path.join(
|
|
||||||
workspaceDir,
|
|
||||||
`.clawdbot-live-tool-probe.${nonceA}.txt`,
|
|
||||||
);
|
|
||||||
await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
|
|
||||||
|
|
||||||
const agentDir = resolveClawdbotAgentDir();
|
const agentDir = resolveClawdbotAgentDir();
|
||||||
const authStorage = discoverAuthStorage(agentDir);
|
const authStorage = discoverAuthStorage(agentDir);
|
||||||
const modelRegistry = discoverModels(authStorage, agentDir);
|
const modelRegistry = discoverModels(authStorage, agentDir);
|
||||||
const all = modelRegistry.getAll() as Array<Model<Api>>;
|
const all = modelRegistry.getAll() as Array<Model<Api>>;
|
||||||
|
|
||||||
const filter = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_MODELS);
|
const rawModels = process.env.CLAWDBOT_LIVE_GATEWAY_MODELS?.trim();
|
||||||
|
const useModern =
|
||||||
// Default: honor user allowlist. Opt-in: scan all models with keys.
|
!rawModels || rawModels === "modern" || rawModels === "all";
|
||||||
const allowlistKeys = Object.keys(cfg.agents?.defaults?.models ?? {});
|
const useExplicit = Boolean(rawModels) && !useModern;
|
||||||
const wanted =
|
const filter = useExplicit ? parseFilter(rawModels) : null;
|
||||||
ALL_MODELS || allowlistKeys.length === 0
|
const wanted = filter
|
||||||
? all
|
? all.filter((m) => filter.has(`${m.provider}/${m.id}`))
|
||||||
: all.filter((m) => allowlistKeys.includes(`${m.provider}/${m.id}`));
|
: all.filter((m) =>
|
||||||
|
isModernModelRef({ provider: m.provider, id: m.id }),
|
||||||
|
);
|
||||||
|
|
||||||
const candidates: Array<Model<Api>> = [];
|
const candidates: Array<Model<Api>> = [];
|
||||||
for (const model of wanted) {
|
for (const model of wanted) {
|
||||||
const id = `${model.provider}/${model.id}`;
|
const id = `${model.provider}/${model.id}`;
|
||||||
if (PROVIDERS && !PROVIDERS.has(model.provider)) continue;
|
if (PROVIDERS && !PROVIDERS.has(model.provider)) continue;
|
||||||
if (filter && !filter.has(id)) continue;
|
|
||||||
try {
|
try {
|
||||||
// eslint-disable-next-line no-await-in-loop
|
// eslint-disable-next-line no-await-in-loop
|
||||||
await getApiKeyForModel({ model, cfg });
|
await getApiKeyForModel({ model, cfg });
|
||||||
|
|
@ -264,315 +697,72 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
expect(candidates.length).toBeGreaterThan(0);
|
if (candidates.length === 0) {
|
||||||
const imageCandidates = EXTRA_IMAGE_PROBES
|
logProgress("[all-models] no API keys found; skipping");
|
||||||
? candidates.filter((m) => m.input?.includes("image"))
|
return;
|
||||||
: [];
|
}
|
||||||
if (EXTRA_IMAGE_PROBES && imageCandidates.length === 0) {
|
logProgress(
|
||||||
throw new Error(
|
`[all-models] selection=${useExplicit ? "explicit" : "modern"}`,
|
||||||
"image probe enabled but no selected models advertise image support; set CLAWDBOT_LIVE_GATEWAY_MODELS to include an image-capable model",
|
);
|
||||||
|
const imageCandidates = candidates.filter((m) =>
|
||||||
|
m.input?.includes("image"),
|
||||||
|
);
|
||||||
|
if (imageCandidates.length === 0) {
|
||||||
|
logProgress(
|
||||||
|
"[all-models] no image-capable models selected; image probe will be skipped",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
await runGatewayModelSuite({
|
||||||
// Build a temp config that allows all selected models, so session overrides stick.
|
label: "all-models",
|
||||||
const lmstudioProvider = cfg.models?.providers?.lmstudio;
|
cfg,
|
||||||
const nextCfg = {
|
candidates,
|
||||||
...cfg,
|
extraToolProbes: true,
|
||||||
agents: {
|
extraImageProbes: true,
|
||||||
...cfg.agents,
|
thinkingLevel: THINKING_LEVEL,
|
||||||
list: (cfg.agents?.list ?? []).map((entry) => ({
|
|
||||||
...entry,
|
|
||||||
sandbox: { mode: "off" },
|
|
||||||
})),
|
|
||||||
defaults: {
|
|
||||||
...cfg.agents?.defaults,
|
|
||||||
// Live tests should avoid Docker sandboxing so tool probes can
|
|
||||||
// operate on the temporary probe files we create in the host workspace.
|
|
||||||
sandbox: { mode: "off" },
|
|
||||||
models: Object.fromEntries(
|
|
||||||
candidates.map((m) => [`${m.provider}/${m.id}`, {}]),
|
|
||||||
),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
models: {
|
|
||||||
...cfg.models,
|
|
||||||
providers: {
|
|
||||||
...cfg.models?.providers,
|
|
||||||
// LM Studio is most reliable via Chat Completions; its Responses API
|
|
||||||
// tool-calling behavior is inconsistent across releases.
|
|
||||||
...(lmstudioProvider
|
|
||||||
? {
|
|
||||||
lmstudio: {
|
|
||||||
...lmstudioProvider,
|
|
||||||
api: "openai-completions",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
: {}),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
};
|
|
||||||
const tempDir = await fs.mkdtemp(
|
|
||||||
path.join(os.tmpdir(), "clawdbot-live-"),
|
|
||||||
);
|
|
||||||
const tempConfigPath = path.join(tempDir, "clawdbot.json");
|
|
||||||
await fs.writeFile(
|
|
||||||
tempConfigPath,
|
|
||||||
`${JSON.stringify(nextCfg, null, 2)}\n`,
|
|
||||||
);
|
|
||||||
process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
|
|
||||||
|
|
||||||
const port = await getFreeGatewayPort();
|
|
||||||
const server = await startGatewayServer(port, {
|
|
||||||
bind: "loopback",
|
|
||||||
auth: { mode: "token", token },
|
|
||||||
controlUiEnabled: false,
|
|
||||||
});
|
});
|
||||||
|
|
||||||
const client = await connectClient({
|
const minimaxCandidates = candidates.filter((model) => model.provider === "minimax");
|
||||||
url: `ws://127.0.0.1:${port}`,
|
if (minimaxCandidates.length === 0) {
|
||||||
token,
|
logProgress("[minimax] no candidates with keys; skipping dual endpoint probes");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const minimaxOpenAi = buildMinimaxProviderOverride({
|
||||||
|
cfg,
|
||||||
|
api: "openai-completions",
|
||||||
|
baseUrl: "https://api.minimax.io/v1",
|
||||||
});
|
});
|
||||||
|
if (minimaxOpenAi) {
|
||||||
|
await runGatewayModelSuite({
|
||||||
|
label: "minimax-openai",
|
||||||
|
cfg,
|
||||||
|
candidates: minimaxCandidates,
|
||||||
|
extraToolProbes: true,
|
||||||
|
extraImageProbes: true,
|
||||||
|
thinkingLevel: THINKING_LEVEL,
|
||||||
|
providerOverrides: { minimax: minimaxOpenAi },
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
logProgress("[minimax-openai] missing minimax provider config; skipping");
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
const minimaxAnthropic = buildMinimaxProviderOverride({
|
||||||
const sessionKey = "agent:dev:live-gateway";
|
cfg,
|
||||||
|
api: "anthropic-messages",
|
||||||
const failures: Array<{ model: string; error: string }> = [];
|
baseUrl: "https://api.minimax.io/anthropic",
|
||||||
|
});
|
||||||
for (const model of candidates) {
|
if (minimaxAnthropic) {
|
||||||
const modelKey = `${model.provider}/${model.id}`;
|
await runGatewayModelSuite({
|
||||||
|
label: "minimax-anthropic",
|
||||||
try {
|
cfg,
|
||||||
// Ensure session exists + override model for this run.
|
candidates: minimaxCandidates,
|
||||||
await client.request<Record<string, unknown>>("sessions.patch", {
|
extraToolProbes: true,
|
||||||
key: sessionKey,
|
extraImageProbes: true,
|
||||||
model: modelKey,
|
thinkingLevel: THINKING_LEVEL,
|
||||||
});
|
providerOverrides: { minimax: minimaxAnthropic },
|
||||||
// Reset between models: avoids cross-provider transcript incompatibilities
|
});
|
||||||
// (notably OpenAI Responses requiring reasoning replay for function_call items).
|
} else {
|
||||||
await client.request<Record<string, unknown>>("sessions.reset", {
|
logProgress("[minimax-anthropic] missing minimax provider config; skipping");
|
||||||
key: sessionKey,
|
|
||||||
});
|
|
||||||
|
|
||||||
// “Meaningful” direct prompt (no tools).
|
|
||||||
const runId = randomUUID();
|
|
||||||
const payload = await client.request<AgentFinalPayload>(
|
|
||||||
"agent",
|
|
||||||
{
|
|
||||||
sessionKey,
|
|
||||||
idempotencyKey: `idem-${runId}`,
|
|
||||||
message:
|
|
||||||
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
|
|
||||||
deliver: false,
|
|
||||||
},
|
|
||||||
{ expectFinal: true },
|
|
||||||
);
|
|
||||||
|
|
||||||
if (payload?.status !== "ok") {
|
|
||||||
throw new Error(`agent status=${String(payload?.status)}`);
|
|
||||||
}
|
|
||||||
const text = extractPayloadText(payload?.result);
|
|
||||||
if (
|
|
||||||
model.provider === "google" &&
|
|
||||||
isGoogleModelNotFoundText(text)
|
|
||||||
) {
|
|
||||||
// Catalog drift: model IDs can disappear or become unavailable on the API.
|
|
||||||
// Treat as skip when scanning "all models" for Google.
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
|
|
||||||
if (
|
|
||||||
!/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
|
|
||||||
!/\bmacro\s*-?\s*tasks?\b/i.test(text)
|
|
||||||
) {
|
|
||||||
throw new Error(`missing required keywords: ${text}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Real tool invocation: force the agent to Read a local file and echo a nonce.
|
|
||||||
const runIdTool = randomUUID();
|
|
||||||
const toolProbe = await client.request<AgentFinalPayload>(
|
|
||||||
"agent",
|
|
||||||
{
|
|
||||||
sessionKey,
|
|
||||||
idempotencyKey: `idem-${runIdTool}-tool`,
|
|
||||||
message:
|
|
||||||
"Clawdbot live tool probe (local, safe): " +
|
|
||||||
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
|
|
||||||
"Then reply with the two nonce values you read (include both).",
|
|
||||||
deliver: false,
|
|
||||||
},
|
|
||||||
{ expectFinal: true },
|
|
||||||
);
|
|
||||||
if (toolProbe?.status !== "ok") {
|
|
||||||
throw new Error(
|
|
||||||
`tool probe failed: status=${String(toolProbe?.status)}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
const toolText = extractPayloadText(toolProbe?.result);
|
|
||||||
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
|
|
||||||
throw new Error(`tool probe missing nonce: ${toolText}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (EXTRA_TOOL_PROBES) {
|
|
||||||
const nonceC = randomUUID();
|
|
||||||
const toolWritePath = path.join(
|
|
||||||
tempDir,
|
|
||||||
`write-${runIdTool}.txt`,
|
|
||||||
);
|
|
||||||
|
|
||||||
const execReadProbe = await client.request<AgentFinalPayload>(
|
|
||||||
"agent",
|
|
||||||
{
|
|
||||||
sessionKey,
|
|
||||||
idempotencyKey: `idem-${runIdTool}-exec-read`,
|
|
||||||
message:
|
|
||||||
"Clawdbot live tool probe (local, safe): " +
|
|
||||||
"use the tool named `exec` (or `Exec`) to run this command: " +
|
|
||||||
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
|
|
||||||
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
|
|
||||||
"Finally reply including the nonce text you read back.",
|
|
||||||
deliver: false,
|
|
||||||
},
|
|
||||||
{ expectFinal: true },
|
|
||||||
);
|
|
||||||
if (execReadProbe?.status !== "ok") {
|
|
||||||
throw new Error(
|
|
||||||
`exec+read probe failed: status=${String(execReadProbe?.status)}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
const execReadText = extractPayloadText(execReadProbe?.result);
|
|
||||||
if (!execReadText.includes(nonceC)) {
|
|
||||||
throw new Error(
|
|
||||||
`exec+read probe missing nonce: ${execReadText}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
await fs.rm(toolWritePath, { force: true });
|
|
||||||
}
|
|
||||||
|
|
||||||
if (EXTRA_IMAGE_PROBES && model.input?.includes("image")) {
|
|
||||||
const imageCode = randomImageProbeCode(10);
|
|
||||||
const imageBase64 = renderCatNoncePngBase64(imageCode);
|
|
||||||
const runIdImage = randomUUID();
|
|
||||||
|
|
||||||
const imageProbe = await client.request<AgentFinalPayload>(
|
|
||||||
"agent",
|
|
||||||
{
|
|
||||||
sessionKey,
|
|
||||||
idempotencyKey: `idem-${runIdImage}-image`,
|
|
||||||
message:
|
|
||||||
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
|
|
||||||
"(1) the animal shown or written in the image, lowercase; " +
|
|
||||||
"(2) the code printed in the image, uppercase. No extra text.",
|
|
||||||
attachments: [
|
|
||||||
{
|
|
||||||
mimeType: "image/png",
|
|
||||||
fileName: `probe-${runIdImage}.png`,
|
|
||||||
content: imageBase64,
|
|
||||||
},
|
|
||||||
],
|
|
||||||
deliver: false,
|
|
||||||
},
|
|
||||||
{ expectFinal: true },
|
|
||||||
);
|
|
||||||
if (imageProbe?.status !== "ok") {
|
|
||||||
throw new Error(
|
|
||||||
`image probe failed: status=${String(imageProbe?.status)}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
const imageText = extractPayloadText(imageProbe?.result);
|
|
||||||
if (!/\bcat\b/i.test(imageText)) {
|
|
||||||
throw new Error(`image probe missing 'cat': ${imageText}`);
|
|
||||||
}
|
|
||||||
const candidates =
|
|
||||||
imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
|
|
||||||
const bestDistance = candidates.reduce((best, cand) => {
|
|
||||||
if (Math.abs(cand.length - imageCode.length) > 2) return best;
|
|
||||||
return Math.min(best, editDistance(cand, imageCode));
|
|
||||||
}, Number.POSITIVE_INFINITY);
|
|
||||||
if (!(bestDistance <= 2)) {
|
|
||||||
throw new Error(
|
|
||||||
`image probe missing code (${imageCode}): ${imageText}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
|
|
||||||
if (
|
|
||||||
(model.provider === "openai" &&
|
|
||||||
model.api === "openai-responses") ||
|
|
||||||
(model.provider === "openai-codex" &&
|
|
||||||
model.api === "openai-codex-responses")
|
|
||||||
) {
|
|
||||||
const runId2 = randomUUID();
|
|
||||||
const first = await client.request<AgentFinalPayload>(
|
|
||||||
"agent",
|
|
||||||
{
|
|
||||||
sessionKey,
|
|
||||||
idempotencyKey: `idem-${runId2}-1`,
|
|
||||||
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
|
|
||||||
deliver: false,
|
|
||||||
},
|
|
||||||
{ expectFinal: true },
|
|
||||||
);
|
|
||||||
if (first?.status !== "ok") {
|
|
||||||
throw new Error(
|
|
||||||
`tool-only turn failed: status=${String(first?.status)}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const second = await client.request<AgentFinalPayload>(
|
|
||||||
"agent",
|
|
||||||
{
|
|
||||||
sessionKey,
|
|
||||||
idempotencyKey: `idem-${runId2}-2`,
|
|
||||||
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
|
|
||||||
deliver: false,
|
|
||||||
},
|
|
||||||
{ expectFinal: true },
|
|
||||||
);
|
|
||||||
if (second?.status !== "ok") {
|
|
||||||
throw new Error(
|
|
||||||
`post-tool message failed: status=${String(second?.status)}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
const reply = extractPayloadText(second?.result);
|
|
||||||
if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
|
|
||||||
throw new Error(`unexpected reply: ${reply}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (err) {
|
|
||||||
const message = String(err);
|
|
||||||
// OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
|
|
||||||
if (
|
|
||||||
model.provider === "openai-codex" &&
|
|
||||||
isRefreshTokenReused(message)
|
|
||||||
) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
failures.push({ model: modelKey, error: message });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (failures.length > 0) {
|
|
||||||
const preview = failures
|
|
||||||
.slice(0, 20)
|
|
||||||
.map((f) => `- ${f.model}: ${f.error}`)
|
|
||||||
.join("\n");
|
|
||||||
throw new Error(
|
|
||||||
`gateway live model failures (${failures.length}):\n${preview}`,
|
|
||||||
);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
client.stop();
|
|
||||||
await server.close({ reason: "live test complete" });
|
|
||||||
await fs.rm(toolProbePath, { force: true });
|
|
||||||
await fs.rm(tempDir, { recursive: true, force: true });
|
|
||||||
|
|
||||||
process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
|
|
||||||
process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
|
|
||||||
process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders;
|
|
||||||
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
|
|
||||||
process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
|
|
||||||
process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
20 * 60 * 1000,
|
20 * 60 * 1000,
|
||||||
|
|
@ -661,6 +851,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||||
message:
|
message:
|
||||||
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
|
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
|
||||||
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
|
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
|
||||||
|
thinking: THINKING_LEVEL,
|
||||||
deliver: false,
|
deliver: false,
|
||||||
},
|
},
|
||||||
{ expectFinal: true },
|
{ expectFinal: true },
|
||||||
|
|
@ -671,6 +862,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
const toolText = extractPayloadText(toolProbe?.result);
|
const toolText = extractPayloadText(toolProbe?.result);
|
||||||
|
assertNoReasoningTags({
|
||||||
|
text: toolText,
|
||||||
|
model: "anthropic/claude-opus-4-5",
|
||||||
|
phase: "zai-fallback-tool",
|
||||||
|
label: "zai-fallback",
|
||||||
|
});
|
||||||
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
|
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
|
||||||
throw new Error(`anthropic tool probe missing nonce: ${toolText}`);
|
throw new Error(`anthropic tool probe missing nonce: ${toolText}`);
|
||||||
}
|
}
|
||||||
|
|
@ -689,6 +886,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||||
message:
|
message:
|
||||||
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
|
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
|
||||||
`Reply with exactly: ${nonceA} ${nonceB}.`,
|
`Reply with exactly: ${nonceA} ${nonceB}.`,
|
||||||
|
thinking: THINKING_LEVEL,
|
||||||
deliver: false,
|
deliver: false,
|
||||||
},
|
},
|
||||||
{ expectFinal: true },
|
{ expectFinal: true },
|
||||||
|
|
@ -699,6 +897,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
const followupText = extractPayloadText(followup?.result);
|
const followupText = extractPayloadText(followup?.result);
|
||||||
|
assertNoReasoningTags({
|
||||||
|
text: followupText,
|
||||||
|
model: "zai/glm-4.7",
|
||||||
|
phase: "zai-fallback-followup",
|
||||||
|
label: "zai-fallback",
|
||||||
|
});
|
||||||
if (!followupText.includes(nonceA) || !followupText.includes(nonceB)) {
|
if (!followupText.includes(nonceA) || !followupText.includes(nonceB)) {
|
||||||
throw new Error(`zai followup missing nonce: ${followupText}`);
|
throw new Error(`zai followup missing nonce: ${followupText}`);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
import { execFileSync } from "node:child_process";
|
||||||
import fs from "node:fs";
|
import fs from "node:fs";
|
||||||
import os from "node:os";
|
import os from "node:os";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
|
|
@ -11,6 +12,37 @@ function restoreEnv(entries: RestoreEntry[]): void {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Best-effort import of environment variables defined by `~/.profile`.
 *
 * Runs a bash login shell that sources the profile with `set -a` (so every
 * variable assigned while sourcing is exported) and prints the resulting
 * environment NUL-separated via `env -0`. Each variable that is currently
 * unset or empty in `process.env` is copied in; variables that already hold a
 * non-empty value are left untouched.
 *
 * Returns without doing anything when `~/.profile` does not exist. Any failure
 * while spawning the shell or reading its output is ignored.
 */
function loadProfileEnv(): void {
  const profilePath = path.join(os.homedir(), ".profile");
  if (!fs.existsSync(profilePath)) return;
  try {
    const output = execFileSync(
      "/bin/bash",
      [
        "-lc",
        // The profile path is passed as positional parameter "$1" instead of
        // being spliced into the script text, so quotes, `$`, backticks or
        // backslashes in the home directory cannot alter the command.
        'set -a; source "$1" >/dev/null 2>&1; env -0',
        "bash", // becomes $0 of the -c script
        profilePath, // becomes $1
      ],
      { encoding: "utf8" },
    );
    // `env -0` terminates every NAME=value record with NUL, which keeps
    // values containing newlines intact.
    const entries = output.split("\0");
    let applied = 0;
    for (const entry of entries) {
      if (!entry) continue;
      const idx = entry.indexOf("=");
      // idx === -1: no separator; idx === 0: empty name. Skip both.
      if (idx <= 0) continue;
      const key = entry.slice(0, idx);
      // Never override a value that is already set to something non-empty.
      if (!key || (process.env[key] ?? "") !== "") continue;
      process.env[key] = entry.slice(idx + 1);
      applied += 1;
    }
    if (applied > 0) {
      console.log(`[live] loaded ${applied} env vars from ~/.profile`);
    }
  } catch {
    // Deliberately best-effort: ignore profile load failures and continue
    // with the environment as it already is.
  }
}
|
||||||
|
|
||||||
export function installTestEnv(): { cleanup: () => void; tempHome: string } {
|
export function installTestEnv(): { cleanup: () => void; tempHome: string } {
|
||||||
const live =
|
const live =
|
||||||
process.env.LIVE === "1" ||
|
process.env.LIVE === "1" ||
|
||||||
|
|
@ -20,6 +52,7 @@ export function installTestEnv(): { cleanup: () => void; tempHome: string } {
|
||||||
// Live tests must use the real user environment (keys, profiles, config).
|
// Live tests must use the real user environment (keys, profiles, config).
|
||||||
// The default test env isolates HOME to avoid touching real state.
|
// The default test env isolates HOME to avoid touching real state.
|
||||||
if (live) {
|
if (live) {
|
||||||
|
loadProfileEnv();
|
||||||
return { cleanup: () => {}, tempHome: process.env.HOME ?? "" };
|
return { cleanup: () => {}, tempHome: process.env.HOME ?? "" };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue