{
  "version": 1,
  "description": "Managed model candidates for ThumbGate workload benchmarking. Catalog only: no provider-specific runtime dependency is assumed here.",
  "workloads": {
    "pretool-gating": {
      "label": "PreTool gating",
      "summary": "Fast, reliable gate judgments for tool-use and agentic coding decisions before commands run.",
      "desiredStrengths": ["agentic-coding", "tool-use", "reliability"],
      "targetContextWindow": 64000,
      "benchmarkCommands": [
        "npx thumbgate eval --from-feedback --json --min-score=0",
        "node scripts/gate-eval.js run",
        "npx thumbgate bench --json --min-score=90"
      ],
      "metrics": [
        "passRate",
        "falsePositiveRate",
        "falseNegativeRate",
        "medianLatencyMs",
        "costPer1kActionsUsd"
      ]
    },
    "long-trace-review": {
      "label": "Long trace review",
      "summary": "Review long agent traces, multi-step failures, and large-context coding sessions without dropping important detail.",
      "desiredStrengths": ["long-horizon-coding", "multi-agent", "reliability", "long-context"],
      "targetContextWindow": 128000,
      "benchmarkCommands": [
        "npx thumbgate eval --from-feedback --json --min-score=0",
        "node scripts/gate-eval.js run",
        "npx thumbgate bench --json --min-score=90",
        "npx thumbgate deepseek-v4-runtime-guardrails --context-tokens=900000 --hybrid-attention --speculative-decoding --json"
      ],
      "metrics": [
        "passRate",
        "longContextReliability",
        "traceCompressionLoss",
        "cacheCoherencePassRate",
        "speculativeAcceptLength",
        "medianLatencyMs",
        "costPerTraceUsd"
      ]
    },
    "cheap-fast-path": {
      "label": "Cheap fast path",
      "summary": "Low-cost first-pass model for cheap approval triage before escalating ambiguous work.",
      "desiredStrengths": ["agentic-coding", "tool-use"],
      "targetContextWindow": 32000,
      "benchmarkCommands": [
        "npx thumbgate eval --from-feedback --json --min-score=0",
        "node scripts/gate-eval.js run",
        "npx thumbgate bench --json --min-score=90"
      ],
      "metrics": [
        "passRate",
        "medianLatencyMs",
        "costPer1kActionsUsd",
        "escalationRate"
      ]
    },
    "dashboard-analysis": {
      "label": "Dashboard and dataset analysis",
      "summary": "Evaluate frontier models for dataset analysis, chart generation, dashboard planning, and proof-backed insight quality before routing expensive analytical work.",
      "desiredStrengths": ["data-analysis", "dashboard-creation", "charting", "long-context", "reliability"],
      "targetContextWindow": 200000,
      "benchmarkCommands": [
        "npx thumbgate eval --from-feedback --json --min-score=0",
        "node scripts/gate-eval.js run",
        "npx thumbgate bench --json --min-score=90"
      ],
      "metrics": [
        "insightAccuracy",
        "chartSpecValidity",
        "dashboardCompleteness",
        "longContextReliability",
        "medianLatencyMs",
        "costPerAnalysisUsd"
      ]
    }
  },
  "candidates": [
    {
      "id": "self-hosted/deepseek-v4-flash-sglang",
      "vendor": "DeepSeek",
      "family": "deepseek",
      "provider": "self-hosted",
      "gateway": "sglang",
      "model": "deepseek-v4-flash",
      "contextWindow": 1000000,
      "costClass": "medium",
      "strengths": ["long-context", "fast-inference", "reliability", "long-horizon-coding"],
      "notes": "Self-hosted long-context candidate for teams that can operate SGLang-class sparse-attention serving. Requires ThumbGate runtime guardrails before routing production traces."
    },
    {
      "id": "self-hosted/deepseek-v4-pro-sglang",
      "vendor": "DeepSeek",
      "family": "deepseek",
      "provider": "self-hosted",
      "gateway": "sglang",
      "model": "deepseek-v4-pro",
      "contextWindow": 1000000,
      "costClass": "high",
      "strengths": ["long-context", "reliability", "long-horizon-coding", "multi-agent"],
      "notes": "High-capacity self-hosted candidate for long-trace review and verified-RL experiments. Benchmark cache coherence, speculative decoding, KV offload, and train-inference drift before use."
    },
    {
      "id": "openai/gpt-5.5",
      "vendor": "OpenAI",
      "family": "gpt",
      "provider": "openai",
      "model": "gpt-5.5",
      "contextWindow": 1000000,
      "costClass": "high",
      "strengths": ["agentic-coding", "tool-use", "reliability", "long-context", "data-analysis", "dashboard-creation", "charting"],
      "notes": "Frontier candidate for complex reasoning, coding, dataset analysis, and dashboard workflows. Benchmark before routing high-volume or cost-sensitive work."
    },
    {
      "id": "anthropic/claude-haiku-4-5",
      "vendor": "Anthropic",
      "family": "claude",
      "provider": "anthropic",
      "model": "claude-haiku-4-5-20251001",
      "contextWindow": 200000,
      "costClass": "low",
      "strengths": ["tool-use", "reliability", "fast-inference"],
      "notes": "Fast control candidate for cheap approval triage."
    },
    {
      "id": "anthropic/claude-sonnet-4-6",
      "vendor": "Anthropic",
      "family": "claude",
      "provider": "anthropic",
      "model": "claude-sonnet-4-6",
      "contextWindow": 200000,
      "costClass": "medium",
      "strengths": ["agentic-coding", "tool-use", "reliability", "long-horizon-coding"],
      "notes": "Current stronger managed control candidate."
    },
    {
      "id": "tinker/kimi-k2.6-32k",
      "vendor": "Thinking Machines",
      "family": "kimi",
      "provider": "openai-compatible",
      "gateway": "tinker",
      "model": "kimi-k2.6-32k",
      "contextWindow": 32000,
      "costClass": "medium",
      "strengths": ["long-horizon-coding", "multi-agent", "reliability"],
      "notes": "Tinker April 23, 2026 release. Good candidate when long-horizon coding matters more than ultra-low latency."
    },
    {
      "id": "tinker/kimi-k2.6-128k",
      "vendor": "Thinking Machines",
      "family": "kimi",
      "provider": "openai-compatible",
      "gateway": "tinker",
      "model": "kimi-k2.6-128k",
      "contextWindow": 128000,
      "costClass": "medium",
      "strengths": ["long-horizon-coding", "multi-agent", "reliability", "long-context"],
      "notes": "Highest-ROI Kimi candidate for long traces and multi-step review."
    },
    {
      "id": "tinker/qwen3.6-35b-a3b",
      "vendor": "Thinking Machines",
      "family": "qwen",
      "provider": "openai-compatible",
      "gateway": "tinker",
      "model": "qwen3.6-35b-a3b",
      "contextWindow": 64000,
      "costClass": "low",
      "strengths": ["agentic-coding", "tool-use", "reliability", "fast-inference"],
      "notes": "Best first Tinker candidate for ThumbGate pre-action gating and tool-risk classification."
    },
    {
      "id": "tinker/qwen3.6-27b",
      "vendor": "Thinking Machines",
      "family": "qwen",
      "provider": "openai-compatible",
      "gateway": "tinker",
      "model": "qwen3.6-27b",
      "contextWindow": 64000,
      "costClass": "low",
      "strengths": ["agentic-coding", "tool-use", "fast-inference"],
      "notes": "Cheapest Tinker candidate for the fast gate path; use when latency/cost matter most."
    }
  ]
}
