diff --git a/apps/web-roo-code/public/llms-full.txt b/apps/web-roo-code/public/llms-full.txt
new file mode 100644
index 00000000000..31a0cc3e29d
--- /dev/null
+++ b/apps/web-roo-code/public/llms-full.txt
@@ -0,0 +1,354 @@
+# Roo Code - Complete Product Information
+
+> Roo Code is an AI-powered software development platform with two core products: a free, open-source VS Code extension for interactive AI-assisted coding, and Roo Code Cloud for autonomous AI agents that work in the background. Built by Roo Code, Inc.
+
+## Table of Contents
+
+- What is Roo Code
+- Roo Code VS Code Extension
+- Roo Code Cloud
+- Cloud Agents
+- Roo Code for Slack
+- Roo Code for Linear
+- PR Reviewer Agent
+- PR Fixer Agent
+- Roo Code Enterprise
+- Roo Code Router
+- Pricing
+- How Roo Code Compares to Alternatives
+- Frequently Asked Questions
+- Links
+
+---
+
+## What is Roo Code
+
+Roo Code is an AI-powered software development platform that puts an entire AI dev team at your disposal. It goes beyond simple code autocompletion by reading and writing across multiple files, executing commands, running tests, and adapting to your workflow.
+
+Roo Code has two form factors:
+
+1. **Roo Code VS Code Extension** -- for individual, interactive work. Run Roo directly in VS Code (or any fork, including Cursor), stay close to the code, and control everything. Ideal for real-time debugging, quick iteration, and hands-on development.
+
+2. **Roo Code Cloud** -- for team work with autonomous agents. Create your agent team in the cloud, give them access to GitHub, and start delegating tasks from the web, Slack, Linear, and more. Ideal for parallelizing execution, kicking off projects, and looping in the rest of your team.
+
+The VS Code extension is completely free and open source. Roo Code Cloud offers free and paid tiers.
+
+### Key Principles
+
+- **Model-agnostic by design**: "The best model in the world" changes every other week. Roo Code works with dozens of models from frontier to open weight. Use the curated Roo Code Router selection at cost, or bring your own API key from any provider.
+- **Custom modes for focused work**: Specialized modes -- Architect, Code, Ask, Debug, Test, Orchestrator -- stay on task and deliver. They know when to hand off work to other modes. You can create your own modes or download from the marketplace.
+- **Permission-based control**: You approve every file change and command execution. Configure granular auto-approval rules to make Roo as autonomous as you want as you build confidence.
+- **Open source and auditable**: Community-driven with no throttling or surprises about what is happening behind the scenes. SOC 2 Type II compliant.
+
+---
+
+## Roo Code VS Code Extension
+
+The Roo Code VS Code extension is the #1 most-installed open-source AI coding extension. It is free, open source, and available on the VS Code Marketplace.
+
+**Website**: https://roocode.com/extension
+**Install**: https://marketplace.visualstudio.com/items?itemName=RooVeterinaryInc.roo-cline
+**Source code**: https://github.com/RooCodeInc/Roo-Code
+
+### Key Features
+
+- **Specialized modes**: Architect (plans complex changes without making changes), Code (implements, refactors, and optimizes), Ask (explains functionality and program behavior), Debug (diagnoses issues, traces failures, and proposes targeted fixes), Test (creates and improves tests without changing actual functionality), Orchestrator (coordinates large tasks across other agents). Users can also create custom modes.
+- **Model-agnostic**: Supports OpenAI (GPT-4o, GPT-4, o1), Anthropic Claude (Claude 3.5 Sonnet and later), Google Gemini, Grok, DeepSeek, Mistral, Qwen, Kimi, Moonshot, local LLMs via Ollama, and any provider via OpenRouter. Dozens of providers supported.
+- **Multi-file editing**: Reads, refactors, and updates multiple files at once for holistic code changes.
+- **Agentic command execution**: Runs terminal commands like npm install, executes test suites, and can open a web browser for integration testing -- all with your approval.
+- **Granular auto-approval**: Control each action individually. Make Roo as autonomous as you want as you build confidence.
+- **Large task coordination**: Orchestrator mode handles large tasks by coordinating subtasks for other agents, running for hours and delivering.
+- **Performant with large codebases**: Configurable integrated semantic search for quicker retrieval in large codebases.
+- **Highly customizable**: Fine-tune settings for inference context, model properties, slash commands, and more. Most settings can be global or serialized in your repository.
+- **Open source**: Community-driven and fully auditable.
+- **Secure and private by design**: Client-only architecture means no code leaves your machine unless you say so. SOC 2 Type II compliant. Use .rooignore to exclude sensitive files. Run with offline/local models for full privacy.
+
+### Supported Languages
+
+Roo Code supports a wide range of programming languages including Python, Java, C#, JavaScript, TypeScript, Go, Rust, and many more. Since it leverages AI model capabilities, new or lesser-known languages may also work depending on model support.
+
+### Model Context Protocol (MCP)
+
+Roo Code supports the Model Context Protocol, allowing it to connect to external tools and data sources via MCP servers. This provides maximum flexibility in extending Roo Code's capabilities beyond built-in features.
+
+---
+
+## Roo Code Cloud
+
+Roo Code Cloud lets you create an AI agent team that runs autonomously in isolated cloud containers. Agents can be triggered from the web UI, Slack, Linear, or GitHub.
+
+**Website**: https://roocode.com/cloud
+**Sign up**: https://app.roocode.com/sign-up
+
+### How It Works
+
+1. **Connect your GitHub account**: Pick which repos the agents can work with in their isolated containers and choose what model you want to power each of them.
+2. **Set up your agent team**: Choose the roles you want filled -- Explainer, Planner, Coder, PR Reviewer, PR Fixer. They know how to act in each situation and stay on-task.
+3. **Start giving them tasks**: Describe what you want from the web UI, get the Reviewer automatically reviewing PRs, and much more.
+
+### Cloud Features
+
+- **Autonomous Cloud Agents**: Delegate work to specialized agents that run 24/7.
+- **Model Agnostic**: Bring your own keys or use the Roo Code Router with access to all top models with no markup.
+- **GitHub PR Reviews**: Agents can automatically review pull requests, provide feedback, and push fixes directly to your repository.
+- **Slack Integration**: Start tasks, get updates, and collaborate with agents directly from Slack channels.
+- **Linear Integration**: Assign issues to Roo Code directly from Linear. Get PRs back without switching tools.
+- **Team Collaboration**: Manage your team and their access to tasks and resources, with centralized billing and configuration.
+- **Usage Analytics**: Detailed token analytics to help optimize costs and usage across your team.
+- **Task History**: Access all tasks from anywhere, from the cloud and the extension.
+- **Task Sharing**: Share tasks with friends and coworkers and let them follow your work in real-time.
+
+---
+
+## Cloud Agents
+
+Roo Code Cloud provides several specialized agent types:
+
+### Planner Agent
+Plans complex changes and creates detailed implementation specs. Can be invoked from Slack or the web UI.
+
+### Coder Agent
+Implements features, refactors code, and creates PRs. Works in isolated containers with full repository access.
+
+### Explainer Agent
+Explains code, architecture, and program behavior. Useful for onboarding, code reviews, and knowledge sharing.
+
+### PR Reviewer Agent
+Provides comprehensive AI-powered code reviews. See the dedicated PR Reviewer section below.
+
+### PR Fixer Agent
+Automatically applies fixes based on review comments. See the dedicated PR Fixer section below.
+
+---
+
+## Roo Code for Slack
+
+**Website**: https://roocode.com/slack
+
+Mention @Roomote in any Slack channel to explain code, plan features, or ship a PR -- all without leaving the conversation.
+
+### Key Capabilities
+
+- **Discussion to PR**: Your team discusses a feature in Slack. @Roomote turns the discussion into a plan, then builds it.
+- **Thread-aware**: @Roomote reads the full thread before responding. It understands context from the conversation.
+- **Chain agents**: Start with a Planner to spec it out, then call the Coder to build it. Multi-step workflows in one Slack thread.
+- **Open to all**: Anyone on your team can ask @Roomote to fix bugs, build features, or investigate issues.
+- **Safe by design**: Agents never touch main/master directly. They produce branches and PRs. You approve.
+
+### Slack Workflow
+
+1. Turn the discussion into a plan -- your team discusses a feature, then summons the Planner agent.
+2. Refine the plan in the thread -- the team reviews, suggests changes, asks questions. Mention @Roomote to refine.
+3. Build the plan -- hand it off to the Coder agent to implement.
+4. Review and ship -- the Coder creates a branch and opens a PR. The team reviews and ships.
+
+### Setup
+
+Slack integration requires a Team plan. Connect via Roo Code Cloud settings, authorize the app, and add @Roomote to channels.
+
+---
+
+## Roo Code for Linear
+
+**Website**: https://roocode.com/linear
+
+Assign development work to @Roo Code directly from Linear. Get PRs back without switching tools.
+
+### Key Capabilities
+
+- **Work where you already work**: Assign development work directly from Linear. No new tools to learn.
+- **Progress is visible**: Watch progress in real-time. Roo Code posts updates as comments.
+- **Mention for refinement**: Comment "@Roo Code also add dark mode support" and the agent picks up where it left off.
+- **Full traceability**: Every PR links back to the originating issue. Your audit trail stays clean.
+- **Organization-level setup**: Connect once, use everywhere.
+- **Safe by design**: Agents produce branches and PRs. You review and approve before merge.
+
+### Setup
+
+Linear integration requires a Team plan. Connect GitHub, authorize Linear via OAuth, map your Linear project to a repo, then assign or mention @Roo Code.
+
+---
+
+## PR Reviewer Agent
+
+**Website**: https://roocode.com/reviewer
+
+AI-powered code reviews that catch what other AI tools and most humans miss.
+
+### How It Differs
+
+- **Bring your own key, get uncompromised reviews**: Most AI review tools use fixed pricing, which means they skimp on tokens to protect margins. With Roo, you bring your own API key. Reviews focus on real problems like business logic, security vulnerabilities, and architectural issues.
+- **Advanced reasoning**: Leverages state-of-the-art reasoning models with sophisticated workflows: diff analysis, context gathering, impact mapping, and contract validation. Catches subtle bugs that surface-level tools miss.
+- **Repository-aware, not snippet-aware**: Analyzes your entire codebase context -- dependency graphs, code ownership, team conventions, and historical patterns. Understands how changes interact with existing systems.
+
+### How It Works
+
+1. Connect your GitHub repository and configure which branches and pull requests should be reviewed.
+2. Provide your AI provider API key and set review preferences, custom rules, and quality standards.
+3. Every pull request gets detailed GitHub comments in minutes highlighting issues and suggesting improvements.
+
+---
+
+## PR Fixer Agent
+
+**Website**: https://roocode.com/pr-fixer
+
+Automatically apply high-quality fixes to pull requests based on review comments.
+
+### How It Differs
+
+- **Comment-history aware**: Understands the entire conversation on the PR -- previous reviews, replies, follow-ups -- and uses that context to produce accurate fixes.
+- **Bring your own key**: Use preferred models at full strength. Prompts are optimized for depth, not cost-cutting.
+- **Repository- and diff-aware**: Analyzes the full repo context and latest diff to ensure fixes align with project conventions and pass checks.
+
+### How It Works
+
+1. Connect your GitHub repositories.
+2. Invoke from a PR comment (e.g., "@roomote: fix these review comments"). The agent reads the entire comment history and latest diffs.
+3. The agent proposes targeted changes and pushes concise commits you can review and merge quickly.
+
+---
+
+## Roo Code Enterprise
+
+**Website**: https://roocode.com/enterprise
+
+The control plane for AI-powered software development. Gain visibility, governance, and control over your AI coding initiatives.
+
+### Enterprise Features
+
+- **Centralized AI Management Hub**: Manage Roo Code deployments enterprise-wide. Centralized token management, multi-model support, extensible architecture.
+- **Real-Time Usage Visibility**: Track usage across teams with detailed analytics. Token consumption tracking, cost attribution by team, AI adoption insights.
+- **Enterprise-Grade Governance**: Implement security policies aligned with your governance framework. Model allow-lists, data residency controls, audit trail compliance.
+- **5-Minute Control-Plane Setup**: Deploy instantly with a SaaS solution. SAML/SCIM integration, REST API access. No infrastructure required.
+- **Manage AI Development Costs**: Unified cost visibility, department chargebacks, usage optimization.
+- **Zero Friction for Developers**: Seamless access with automatic token refresh, local sidecar architecture, no workflow disruption.
+
+---
+
+## Roo Code Router
+
+**Website**: https://roocode.com/provider
+
+The Roo Code Router is a model router optimized to work seamlessly with Roo Code products. It provides curated access to top AI models with no markup on inference costs. You do not have to use it -- you can bring your own provider key.
+
+### Key Facts
+
+- Pricing is based on token usage for input and output, measured per million tokens.
+- The Router does not keep any of your data; the service only aims to make it easier to use Roo Code.
+- Available models include the latest from Anthropic, OpenAI, Google, and other top providers.
+
+---
+
+## Pricing
+
+**Website**: https://roocode.com/pricing
+
+### VS Code Extension
+- **Price**: Free forever
+- **Features**: Unlimited local use, bring your own model, powerful extensible modes, community support
+
+### Cloud Free
+- **Price**: $0/month + credits for usage
+- **Features**: Access to Cloud Agents (fully autonomous development from GitHub and web), access to the Roo Code Router, task history and sharing, token usage analytics, professional support
+- **Credit cost**: Cloud Agents at $5/hour; inference at Roo Code Router pricing or bring your own model (BYOM)
+
+### Cloud Team
+- **Price**: $99/month + credits for usage
+- **Trial**: Free for 14 days
+- **Features**: Everything in Free plus unlimited users (no per-seat cost), shared configuration and policies, centralized billing, Slack and Linear integrations
+- **Credit cost**: Cloud Agents at $5/hour; inference at Roo Code Router pricing or bring your own model (BYOM)
+
+### Enterprise
+- Custom pricing. Contact sales for a demo.
+
+### Credits
+Credits are pre-paid in dollars and deducted with usage for inference and Cloud Agent runs. You are always in control of your spend with no surprises.
+
+---
+
+## How Roo Code Compares to Alternatives
+
+### vs GitHub Copilot
+- Roo Code is open source; Copilot is proprietary
+- Roo Code is model-agnostic; Copilot is locked to OpenAI/GitHub models
+- Roo Code supports multi-file agentic editing and command execution; Copilot focuses on inline completions and chat
+- Roo Code has custom modes for different tasks; Copilot has a single assistant
+- Roo Code offers autonomous cloud agents; Copilot is IDE-only
+
+### vs Cursor
+- Roo Code is a VS Code extension (works in any VS Code fork including Cursor); Cursor is a standalone IDE
+- Roo Code is open source; Cursor is proprietary
+- Roo Code is model-agnostic with no lock-in; Cursor bundles its own model access
+- Roo Code has permission-based control for every action; Cursor applies changes more automatically
+- Roo Code Cloud extends to autonomous agents; Cursor is IDE-only
+
+### vs Windsurf
+- Roo Code is open source; Windsurf is proprietary
+- Roo Code is a VS Code extension; Windsurf is a standalone IDE
+- Roo Code supports any AI model; Windsurf bundles specific models
+- Roo Code has specialized modes for different tasks; Windsurf has a single AI assistant
+- Roo Code Cloud provides autonomous agents and team features; Windsurf is individual-only
+
+### vs Cline
+- Roo Code was forked from Cline and has since diverged significantly
+- Roo Code adds custom modes, Orchestrator mode, MCP marketplace, cloud agents, Slack/Linear integrations, and enterprise features
+- Roo Code is backed by Roo Code, Inc. with a dedicated development team and enterprise support
+- Roo Code Cloud provides autonomous agents that run 24/7; Cline is extension-only
+
+---
+
+## Frequently Asked Questions
+
+### What exactly is Roo Code?
+Roo Code is an open-source, AI-powered coding assistant that runs in VS Code. It goes beyond simple autocompletion by reading and writing across multiple files, executing commands, and adapting to your workflow -- like having a whole dev team right inside your editor.
+
+### How does Roo Code differ from Copilot, Cursor, or Windsurf?
+Roo Code is open-source and fully customizable, letting you integrate any AI model you choose (e.g., OpenAI, Anthropic, local LLMs). It is built for multi-file edits, so it can read, refactor, and update multiple files at once. Its agentic abilities go beyond typical AI autocomplete, enabling it to run tests, open a browser, and handle deeper tasks. You are always in control: Roo Code is permission-based, meaning you control and approve any file changes or command executions.
+
+### Is Roo Code really free?
+Yes! The Roo Code VS Code extension is completely free and open-source. You only pay for AI model usage if you use a paid API (like OpenAI). If you choose free or self-hosted models, there is no cost at all. Roo Code Cloud has free and paid tiers.
+
+### Will my code stay private?
+Yes. The Roo Code extension runs locally in VS Code, so your code never leaves your machine unless you connect to an external AI API. Even then, you control exactly what is sent. You can use .rooignore to exclude sensitive files, and you can run with offline/local models for full privacy.
+
+### Which AI models does Roo Code support?
+Roo Code is fully model-agnostic. It supports OpenAI models (GPT-4o, GPT-4, o1), Anthropic Claude (including Claude 3.5 Sonnet), Google Gemini models, local LLMs via Ollama, and any model accessible through OpenRouter or compatible APIs.
+
+### Does Roo Code support my programming language?
+Likely yes. Roo Code supports Python, Java, C#, JavaScript, TypeScript, Go, Rust, and many more. Since it leverages AI model understanding, new or lesser-known languages may also work depending on model support.
+
+### How do I install and get started?
+Install Roo Code from the VS Code Marketplace or GitHub. Add your AI keys (OpenAI, Anthropic, or other) in the extension settings. Open the Roo panel in VS Code and start typing commands in plain English. Tutorial videos are available at https://docs.roocode.com/tutorial-videos.
+
+### Can it handle large, enterprise-scale projects?
+Yes. Roo Code uses efficient strategies like partial-file analysis, summarization, and user-specified context to handle large codebases. Enterprises can use on-premises or self-hosted models for compliance and security needs.
+
+### Is it safe for enterprise use?
+Yes. Roo Code was built for enterprise environments. You can self-host AI models or use your own trusted provider. All file changes and commands go through permission gating. Because Roo Code is fully open-source, it is auditable. Roo Code is also SOC 2 Type II compliant.
+
+### Can Roo Code run commands and tests automatically?
+Yes. Roo Code can execute terminal commands, run test suites, and open a web browser for integration testing -- always optional and fully permission-based.
+
+### Can I contribute to Roo Code?
+Yes! Roo Code is open-source on GitHub at https://github.com/RooCodeInc/Roo-Code. Submit issues, suggest features, or open a pull request. There is an active community on Discord (https://discord.gg/roocode) and Reddit (https://reddit.com/r/RooCode).
+
+### Where can I learn more or get help?
+Check the official documentation at https://docs.roocode.com for quick-start guides and advanced documentation. Community support is available on Discord, Reddit, YouTube (https://www.youtube.com/@RooCodeYT), and the blog (https://blog.roocode.com).
+
+---
+
+## Links
+
+- Website: https://roocode.com
+- Documentation: https://docs.roocode.com
+- GitHub: https://github.com/RooCodeInc/Roo-Code
+- VS Code Marketplace: https://marketplace.visualstudio.com/items?itemName=RooVeterinaryInc.roo-cline
+- Cloud App: https://app.roocode.com
+- Discord: https://discord.gg/roocode
+- Reddit: https://reddit.com/r/RooCode
+- X/Twitter: https://x.com/roocode
+- LinkedIn: https://www.linkedin.com/company/roo-code
+- YouTube: https://www.youtube.com/@RooCodeYT
+- Blog: https://blog.roocode.com
+- Trust Center: https://trust.roocode.com
+- Careers: https://careers.roocode.com
diff --git a/apps/web-roo-code/public/llms.txt b/apps/web-roo-code/public/llms.txt
new file mode 100644
index 00000000000..94336605011
--- /dev/null
+++ b/apps/web-roo-code/public/llms.txt
@@ -0,0 +1,41 @@
+# Roo Code
+
+> Roo Code is an AI-powered software development platform with two core products: a free, open-source VS Code extension for interactive AI-assisted coding, and Roo Code Cloud for autonomous AI agents that work in the background.
+
+Roo Code is built by Roo Code, Inc. The VS Code extension is the #1 most-installed open-source AI coding extension on the VS Code Marketplace. Roo Code Cloud extends this with autonomous agents that can be triggered from the web, Slack, Linear, or GitHub.
+
+## Core Products
+
+- [Roo Code VS Code Extension](https://roocode.com/extension): Free, open-source AI coding assistant. Model-agnostic, supports multi-file editing, custom modes, agentic command execution, and permission-based control. Works with OpenAI, Anthropic, Google Gemini, local LLMs, and dozens more.
+- [Roo Code Cloud](https://roocode.com/cloud): Autonomous AI agents -- Planner, Coder, Explainer, PR Reviewer, PR Fixer -- that run 24/7 in isolated cloud containers, triggered from the web UI, Slack, Linear, or GitHub.
+- [Roo Code Enterprise](https://roocode.com/enterprise): Enterprise control plane with centralized management, SAML/SCIM, usage analytics, cost controls, model allow-lists, and audit trails.
+
+## Integrations
+
+- [Roo Code for Slack](https://roocode.com/slack): Mention @Roomote in any Slack channel to plan, explain, or build features without leaving the conversation.
+- [Roo Code for Linear](https://roocode.com/linear): Assign issues to @Roo Code directly from Linear and get PRs back.
+- [PR Reviewer](https://roocode.com/reviewer): AI-powered code reviews using advanced reasoning and full repository context. Bring your own API key.
+- [PR Fixer](https://roocode.com/pr-fixer): Automatically apply fixes to PRs based on review comments. Comment-history aware.
+
+## Key Differentiators
+
+- Open source and fully auditable
+- Model-agnostic: works with any LLM provider, no lock-in
+- Custom modes: Architect, Code, Ask, Debug, Test, Orchestrator, and user-created modes
+- Permission-based: you approve every file change and command execution
+- SOC 2 Type II compliant
+- Bring your own API key: no markup on inference costs
+
+## Resources
+
+- [Documentation](https://docs.roocode.com)
+- [Pricing](https://roocode.com/pricing)
+- [Evals and Benchmarks](https://roocode.com/evals)
+- [GitHub Repository](https://github.com/RooCodeInc/Roo-Code)
+- [Blog](https://blog.roocode.com)
+- [Discord Community](https://discord.gg/roocode)
+- [Trust Center](https://trust.roocode.com)
+
+## Optional
+
+- [llms-full.txt](https://roocode.com/llms-full.txt): Comprehensive product information for detailed context
diff --git a/apps/web-roo-code/src/app/evals/layout.tsx b/apps/web-roo-code/src/app/evals/layout.tsx
new file mode 100644
index 00000000000..f6ebd164a4a
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/layout.tsx
@@ -0,0 +1,8 @@
+import { Fraunces, IBM_Plex_Sans } from "next/font/google"
+
+// Google fonts loaded through next/font. Each is configured with a `variable`
+// name (`--font-display` for Fraunces, `--font-body` for IBM Plex Sans).
+const display = Fraunces({ subsets: ["latin"], variable: "--font-display" })
+const body = IBM_Plex_Sans({ subsets: ["latin"], weight: ["400", "500", "600"], variable: "--font-body" })
+
+// Default-exported layout component in `app/evals/layout.tsx`; it receives and renders `children`.
+// NOTE(review): the wrapper markup is not visible in this view; presumably it
+// applies the `display`/`body` font variables to the wrapping element — confirm.
+export default function EvalsLayout({ children }: { children: React.ReactNode }) {
+ return
{children}
+}
diff --git a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx
new file mode 100644
index 00000000000..6bca818da15
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx
@@ -0,0 +1,522 @@
+"use client"
+
+import { motion } from "framer-motion"
+import { AlertTriangle, CheckCircle2, Scale, Timer, Zap } from "lucide-react"
+import Link from "next/link"
+
+// Parent variants: fade from `hidden` to `visible` while staggering child
+// animations (staggerChildren 0.12, delayChildren 0.08).
+const containerVariants = {
+ hidden: { opacity: 0 },
+ visible: {
+ opacity: 1,
+ transition: {
+ staggerChildren: 0.12,
+ delayChildren: 0.08,
+ },
+ },
+}
+
+// Child variants: fade in while moving from y = 18 to y = 0 over a 0.55 duration,
+// using a custom cubic-bezier easing curve.
+const fadeUpVariants = {
+ hidden: { opacity: 0, y: 18 },
+ visible: {
+ opacity: 1,
+ y: 0,
+ transition: { duration: 0.55, ease: [0.21, 0.45, 0.27, 0.9] as const },
+ },
+}
+
+// Opacity-only fade with a 1.1 duration and "easeOut" easing.
+const backgroundVariants = {
+ hidden: { opacity: 0 },
+ visible: { opacity: 1, transition: { duration: 1.1, ease: "easeOut" as const } },
+}
+
+// Renders a right-arrow glyph (→).
+// NOTE(review): the wrapping element and its classes are not visible in this view.
+function InlineArrow() {
+ return (
+
+ →
+
+ )
+}
+
+// Takes an icon component (accepting an optional `className`) and a text `label`, and renders the label.
+// NOTE(review): the surrounding markup, including how `Icon` is rendered, is not visible in this view.
+function Chip({ icon: Icon, label }: { icon: React.ComponentType<{ className?: string }>; label: string }) {
+ return (
+
+
+ {label}
+
+ )
+}
+
+// Callout taking an icon component, a title, body text, and an optional `tone`
+// (defaults to "neutral").
+// NOTE(review): the returned markup is not visible in this view.
+function Callout({
+ icon: Icon,
+ title,
+ body,
+ tone = "neutral",
+}: {
+ icon: React.ComponentType<{ className?: string }>
+ title: string
+ body: string
+ tone?: "neutral" | "warning" | "success"
+}) {
+ // Border/background classes per tone: amber for "warning", emerald for "success",
+ // and the default border/card classes for anything else.
+ const toneClasses =
+ tone === "warning"
+ ? "border-amber-500/20 bg-amber-500/5"
+ : tone === "success"
+ ? "border-emerald-500/20 bg-emerald-500/5"
+ : "border-border/50 bg-card/40"
+
+ return (
+
+ )
+}
+
+// Takes a step number (as a string), a title, and body text.
+// NOTE(review): the returned markup is not visible in this view.
+function Step({ num, title, body }: { num: string; title: string; body: string }) {
+ return (
+
+ )
+}
+
+// Takes an `href` and a `label`, and renders the label.
+// NOTE(review): the link element itself is not visible in this view; presumably
+// `href` is passed to it — confirm.
+function SmallLink({ href, label }: { href: string; label: string }) {
+ return (
+
+ {label}
+
+ )
+}
+
+// Client component with the evals methodology page content. The visible copy is
+// organised into a hero (breadcrumb, headline, "at a glance" summary) and a body
+// with a left rail (reading guidance, quick definitions) and a main column
+// (methodology, scoring and signals, tradeoffs, limitations, links).
+// NOTE(review): the JSX elements are not visible in this view; only text nodes remain.
+export function MethodologyContent() {
+ return (
+ <>
+ {/* Hero */}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Evals
+
+ /
+
+ Recommendations
+
+ /
+ Methodology
+
+
+
+ Read this before you compare models
+
+
+
+ How we run evals
+
+
+
+ We keep tasks, environment, and scoring constant across models. Use our results to
+ pick a default for a specific objective, then validate in your repo.
+
+
+
+
+
+
+
+
+
+
+ View recommendations
+
+
+ Raw eval data
+
+
+
+
+ Jump to:
+
+ What we hold constant
+
+
+ Scoring
+
+
+ Limitations
+
+
+
+
+
+
+
+
+
+
+ Methodology at a glance
+
+
+ Comparable results, not universal truth
+
+
+
+
+
+ We hold constant
+
+
+ Same exercises, same tools, same time limit, same scoring.
+
+
+
+
+ We measure
+
+
+ Pass rate, latency, and cost signals across multiple languages.
+
+
+
+
+ We recommend
+
+
+ A default model and agent lineup for an objective. It’s a
+ baseline, not a guarantee.
+
+
+
+
+
+
+ Objective-first
+
+
+ Optimized for: Quality / Speed / Cost
+
+
+ Validate in your repo
+
+
+
+
+
+
+
+
+
+
+ {/* Body */}
+
+
+
+ {/* Left rail */}
+
+
+
+ How to read results responsibly
+
+
+ Evals help you pick a better default. They don’t predict how a model behaves
+ in your repo, with your tests, tooling, and constraints.
+
+
+
+
+
+
+
+
+
+ On this page
+
+
+
+
+
+
+ Quick definitions
+
+
+
+
+
+ Objective :
+ the workflow you want to ship (for example,{" "}
+ Issue → PR ).
+
+
+
+
+
+
+ Optimized for
+
+ : the tradeoff you care about most (
+ Quality ,{" "}
+ Speed ,{" "}
+ Cost ).
+
+
+
+
+
+ Pass rate :
+ percent of exercises a model completes within the limit.
+
+
+
+
+
+
+
+ {/* Main column */}
+
+
+
+ Methodology
+
+
+ How to use these evals
+
+
+ The recommendations page is organized around what you’re trying to ship. You
+ pick the objective and tradeoff. We show the best default setup based on the signal
+ we have.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Scoring and signals
+
+
+ What we measure
+
+
+
+
+
+
+
+
+
+
+
+
+ Tradeoffs
+
+
+ Quality, speed, cost: pick one to optimize
+
+
+
+ Choosing an optimization mode is how you tell the system what matters most for
+ your objective. If you care about merge confidence, optimize for Quality. If you
+ care about throughput, Speed and Cost matter.
+
+
+ When two models are close on pass rate, the most practical tie-breakers are
+ latency and $/task.
+
+
+
+
+
+
+ Limitations
+
+
+ What these evals don’t tell you
+
+
+
+ A model can score well on this suite and still struggle in your repo because
+ your stack, tests, dependencies, and CI constraints are different.
+
+
+ The right move is to treat our results as a starting point, then run your
+ objective end-to-end in Roo Code Cloud and inspect the PR output.
+
+
+
+
+
+
+ Links
+
+
+
+
+
+
+
+ If something feels off in the recommendations, that’s a signal too. The
+ fastest path is to run your objective in Roo Code Cloud and compare the PR
+ output.
+
+
+
+
+
+
+
+ >
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/methodology/page.tsx b/apps/web-roo-code/src/app/evals/methodology/page.tsx
new file mode 100644
index 00000000000..e723cabd674
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/methodology/page.tsx
@@ -0,0 +1,59 @@
+import type { Metadata } from "next"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+
+import { MethodologyContent } from "./methodology-content"
+
+// ── SEO Metadata ────────────────────────────────────────────────────────────
+
+// Page title, reused for the Open Graph/Twitter titles and the OG image alt text.
+const TITLE = "Methodology | Roo Code Cloud Evals"
+// Full description used for the page, Open Graph, and Twitter descriptions.
+const DESCRIPTION =
+ "How we run Roo Code Cloud evals and how to interpret outcomes-first recommendations. Same tasks, same limits, clear tradeoffs."
+// Shorter description passed only to `ogImageUrl` when building the image URL.
+const OG_DESCRIPTION = "How we run Roo Code Cloud evals"
+// Route path appended to `SEO.url` for the canonical and Open Graph URLs.
+const PATH = "/evals/methodology"
+
+// Next.js metadata export for this page.
+export const metadata: Metadata = {
+ title: TITLE,
+ description: DESCRIPTION,
+ alternates: {
+ canonical: `${SEO.url}${PATH}`,
+ },
+ openGraph: {
+ title: TITLE,
+ description: DESCRIPTION,
+ url: `${SEO.url}${PATH}`,
+ siteName: SEO.name,
+ images: [
+ {
+ url: ogImageUrl(TITLE, OG_DESCRIPTION),
+ width: 1200,
+ height: 630,
+ alt: TITLE,
+ },
+ ],
+ locale: SEO.locale,
+ type: "website",
+ },
+ twitter: {
+ card: SEO.twitterCard,
+ title: TITLE,
+ description: DESCRIPTION,
+ images: [ogImageUrl(TITLE, OG_DESCRIPTION)],
+ },
+ // Site-wide keywords from `SEO.keywords`, followed by page-specific ones.
+ keywords: [
+ ...SEO.keywords,
+ "AI evaluation",
+ "model benchmarking",
+ "coding evals",
+ "methodology",
+ "evaluation process",
+ "transparent evaluation",
+ ],
+}
+
+// ── Page Component ──────────────────────────────────────────────────────────
+
+// Default-exported page component for this route file.
+// NOTE(review): the returned element is not visible in this view; presumably it
+// renders the imported `MethodologyContent` — confirm.
+export default function MethodologyPage() {
+ return
+}
diff --git a/apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/objective-content.tsx b/apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/objective-content.tsx
new file mode 100644
index 00000000000..90c5c8c7fd4
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/objective-content.tsx
@@ -0,0 +1,662 @@
+"use client"
+
+import { useCallback, useMemo, useState } from "react"
+import Link from "next/link"
+import { useSearchParams } from "next/navigation"
+import { motion } from "framer-motion"
+import { ArrowRight, BarChart3, Check, Copy, ExternalLink, SlidersHorizontal, Sparkles, Workflow } from "lucide-react"
+import { Cell, ResponsiveContainer, Scatter, ScatterChart, Tooltip, XAxis, YAxis, ZAxis } from "recharts"
+
+import type { RoleRecommendation, ModelCandidate } from "@/lib/mock-recommendations"
+import { getCloudSetupUrl } from "@/lib/mock-recommendations"
+
+/** Tradeoff the picks are optimized for: top-ranked ("best"), speed ("fastest") or budget ("cost"). */
+type EvalOptimizationMode = "best" | "fastest" | "cost"
+
+/** Objective data rendered by this page. */
+type ObjectiveDeepDive = {
+ id: string
+ // Written into the `objective` query parameter of outgoing links.
+ slug: string
+ name: string
+ description: string
+ // Bullet points; only the first four are rendered.
+ whyItWorks: string[]
+ // Role ids in display order; ids without a matching entry in `recs` are dropped from the lineup.
+ recommendedRoleIds: string[]
+ builderProfile?: {
+ title: string
+ description: string
+ // Trimmed before use; a missing or empty prompt hides the example-prompt section.
+ examplePrompt?: string
+ // Ordered steps; only the first four are rendered.
+ howItWorks: string[]
+ }
+}
+
+/** A role id paired with the recommendation computed for it. */
+type ObjectiveRoleRec = {
+ roleId: string
+ recommendation: RoleRecommendation
+}
+
+/** Props of `ObjectiveContent`. */
+type Props = {
+ objective: ObjectiveDeepDive
+ // Used only when the URL carries no valid `mode` query parameter.
+ initialMode: EvalOptimizationMode
+ recs: ObjectiveRoleRec[]
+}
+
+// Parent animation states: fades in and staggers its children by 0.08 after a 0.08 delay.
+// NOTE(review): presumably passed as `variants` to a framer-motion element; the markup using it is not visible here.
+const containerVariants = {
+ hidden: { opacity: 0 },
+ visible: {
+ opacity: 1,
+ transition: { staggerChildren: 0.08, delayChildren: 0.08 },
+ },
+}
+
+// Child animation states: starts transparent and offset by 14 on the y axis, then settles at full opacity.
+const fadeUp = {
+ hidden: { opacity: 0, y: 14 },
+ // `as const` keeps the easing array a fixed four-number tuple rather than number[].
+ visible: { opacity: 1, y: 0, transition: { duration: 0.55, ease: [0.21, 0.45, 0.27, 0.9] as const } },
+}
+
+/**
+ * Type guard that narrows an arbitrary string to a supported optimization mode.
+ *
+ * @param value - Raw string, such as the `mode` query parameter.
+ * @returns `true` only for "best", "fastest" or "cost" (case-sensitive).
+ */
+function isEvalOptimizationMode(value: string): value is EvalOptimizationMode {
+    switch (value) {
+        case "best":
+        case "fastest":
+        case "cost":
+            return true
+        default:
+            return false
+    }
+}
+
+/**
+ * Chooses the model candidate to show for a role under the given optimization mode.
+ *
+ * @param rec - The role's recommendation, or `undefined` when none exists.
+ * @param mode - "fastest" prefers `speedHire`, "cost" prefers `budgetHire`; "best" uses the top-ranked entry.
+ * @returns The preferred candidate, falling back to the first entry of `rec.best`, or `null` when nothing is available.
+ */
+function pickCandidate(rec: RoleRecommendation | undefined, mode: EvalOptimizationMode): ModelCandidate | null {
+    if (!rec) return null
+
+    // Evaluated lazily so `rec.best` is only read when the fallback is actually needed.
+    const topRanked = (): ModelCandidate | null => rec.best[0] ?? null
+
+    switch (mode) {
+        case "fastest":
+            return rec.speedHire ?? topRanked()
+        case "cost":
+            return rec.budgetHire ?? topRanked()
+        default:
+            return topRanked()
+    }
+}
+
+// Display names for the provider ids this page knows how to prettify.
+const PROVIDER_DISPLAY_NAMES: ReadonlyMap<string, string> = new Map([
+    ["openai", "OpenAI"],
+    ["anthropic", "Anthropic"],
+    ["google", "Google"],
+    ["xai", "xAI"],
+    ["deepseek", "DeepSeek"],
+    ["moonshot", "Moonshot"],
+])
+
+/**
+ * Maps a provider id to its display name.
+ *
+ * @param provider - Lower-case provider id (the match is case-sensitive).
+ * @returns The display name, or `provider` unchanged when the id is not recognised.
+ */
+function shortProvider(provider: string): string {
+    return PROVIDER_DISPLAY_NAMES.get(provider) ?? provider
+}
+
+/**
+ * Formats an amount as whole dollars.
+ *
+ * @param value - Amount in dollars.
+ * @returns `$` followed by the value rounded to the nearest integer, or "—" when the value is not finite.
+ */
+function formatDollars(value: number): string {
+    return Number.isFinite(value) ? "$" + String(Math.round(value)) : "—"
+}
+
+/**
+ * Formats a duration in seconds with one decimal place.
+ *
+ * @param value - Duration in seconds.
+ * @returns The value with one decimal and an `s` suffix, or "—" when the value is not finite.
+ */
+function formatSeconds(value: number): string {
+    if (Number.isFinite(value)) {
+        return value.toFixed(1) + "s"
+    }
+    return "—"
+}
+
+/**
+ * Builds the query string appended to links leaving the objective page.
+ *
+ * @param searchParams - Current query parameters; only `view` is read from them.
+ * @param objectiveSlug - Written as the `objective` parameter.
+ * @param mode - Written as the `mode` parameter.
+ * @returns A string starting with `?` containing `objective`, `mode`, and `view` when a non-empty one is present.
+ */
+function buildObjectiveQueryString(
+    searchParams: { get(name: string): string | null },
+    objectiveSlug: string,
+    mode: EvalOptimizationMode,
+): string {
+    const pairs: [string, string][] = [
+        ["objective", objectiveSlug],
+        ["mode", mode],
+    ]
+
+    // `view` is the only existing parameter carried over; everything else is dropped.
+    const currentView = searchParams.get("view")
+    if (currentView) pairs.push(["view", currentView])
+
+    return "?" + new URLSearchParams(pairs).toString()
+}
+
+/** One candidate of the primary role, flattened into the shape the scatter preview plots. */
+type PrimaryScatterPoint = {
+ // Candidate display name.
+ name: string
+ provider: string
+ // Candidate composite score.
+ score: number
+ successRate: number
+ // Estimated daily cost, already rounded to a whole number.
+ dailyCost: number
+ // Derived from the success rate: round(40 + successRate / 100 * 260).
+ dotSize: number
+ // True for the candidate currently selected for the primary role.
+ isSelected: boolean
+}
+
+/** Props of `ObjectiveDataPreview`. */
+type ObjectiveDataPreviewProps = {
+ points: PrimaryScatterPoint[]
+ compareHref: string
+}
+
+function ObjectiveDataPreview({ points, compareHref }: ObjectiveDataPreviewProps): JSX.Element {
+ return (
+
+
Data preview
+
+ Composite score vs estimated daily cost for the primary agent, with dot size mapped to success rate.
+
+
+
+
+
+
+
+
+ } />
+
+ {points.map((point, i) => {
+ const pointColor = point.isSelected ? "#22c55e" : "#3b82f6"
+ const strokeColor = point.isSelected ? "rgba(34,197,94,0.65)" : "rgba(59,130,246,0.35)"
+ const strokeWidth = point.isSelected ? 2 : 1
+
+ return |
+ })}
+
+
+
+
+
+
+
+ Compare the full set
+
+
+
+
+ )
+}
+
+type ObjectiveTooltipPayloadEntry = { payload?: unknown }
+
+function ObjectiveTooltip({
+ active,
+ payload,
+}: {
+ active?: boolean
+ payload?: ObjectiveTooltipPayloadEntry[]
+}): JSX.Element | null {
+ if (!active || !payload?.length) return null
+ const entryPayload = payload[0]?.payload
+ const p = entryPayload as
+ | {
+ name?: string
+ provider?: string
+ score?: number
+ dailyCost?: number
+ successRate?: number
+ }
+ | undefined
+ if (!p) return null
+
+ return (
+
+
{p.name}
+
+ {shortProvider(p.provider ?? "")} · {p.score} score ·{" "}
+ {p.successRate}% success
+
+
+ Est daily cost {formatDollars(p.dailyCost ?? NaN)}
+
+
+ )
+}
+
+export function ObjectiveContent({ objective, initialMode, recs }: Props) {
+ const searchParams = useSearchParams()
+
+ const recByRole = useMemo(() => new Map(recs.map((r) => [r.roleId, r.recommendation])), [recs])
+ const lineup = useMemo(() => objective.recommendedRoleIds.filter((id) => recByRole.has(id)), [objective, recByRole])
+
+ // Seed the mode once from the `mode` query parameter, falling back to the server-provided
+ // `initialMode`. The initializer function form means this only runs for the initial state.
+ const [mode, setMode] = useState<EvalOptimizationMode>(() => {
+     const requested = searchParams.get("mode")
+     const isValid = requested !== null && isEvalOptimizationMode(requested)
+     return isValid ? requested : initialMode
+ })
+
+ const [overrides, setOverrides] = useState>({})
+
+ // Currently selected candidate for each role in the lineup. A manual override wins when one
+ // is set (resolving to null if that model id is not among the role's candidates); otherwise
+ // the pick follows the active optimization mode.
+ const selectedByRole = useMemo(() => {
+     const picks = new Map<string, ModelCandidate | null>()
+     lineup.forEach((roleId) => {
+         const rec = recByRole.get(roleId)
+         const pinnedModelId = overrides[roleId]
+         const pick =
+             pinnedModelId && rec
+                 ? (rec.allCandidates.find((candidate) => candidate.modelId === pinnedModelId) ?? null)
+                 : pickCandidate(rec, mode)
+         picks.set(roleId, pick)
+     })
+     return picks
+ }, [lineup, mode, overrides, recByRole])
+
+ const primaryRoleId = lineup[0] ?? null
+ const primaryCandidate = primaryRoleId ? (selectedByRole.get(primaryRoleId) ?? null) : null
+
+ const modePill = useMemo(
+ () => [
+ { id: "best" as const, label: "Quality" },
+ { id: "fastest" as const, label: "Speed" },
+ { id: "cost" as const, label: "Cost" },
+ ],
+ [],
+ )
+
+ const cloudHref = useMemo(() => {
+ if (!primaryCandidate) return "https://app.roocode.com"
+ return getCloudSetupUrl(primaryCandidate)
+ }, [primaryCandidate])
+
+ const examplePrompt = objective.builderProfile?.examplePrompt?.trim() ?? ""
+ const [promptCopied, setPromptCopied] = useState(false)
+ // Copies the example prompt to the clipboard and flags it as copied for two seconds.
+ // Does nothing when there is no prompt.
+ const copyPrompt = useCallback(async () => {
+     if (!examplePrompt) return
+     try {
+         await navigator.clipboard.writeText(examplePrompt)
+     } catch {
+         // Clipboard access can be denied, or `navigator.clipboard` can be missing in an
+         // insecure context. Leave the "copied" flag unset instead of letting the rejection
+         // escape as an unhandled promise rejection.
+         return
+     }
+     setPromptCopied(true)
+     setTimeout(() => setPromptCopied(false), 2000)
+ }, [examplePrompt])
+
+ const onSelectModel = useCallback((roleId: string, modelId: string) => {
+ setOverrides((prev) => ({ ...prev, [roleId]: modelId }))
+ }, [])
+
+ const onSetMode = useCallback((next: EvalOptimizationMode) => {
+ setMode(next)
+ }, [])
+
+ const roleQuery = useMemo(
+ () => buildObjectiveQueryString(searchParams, objective.slug, mode),
+ [mode, objective.slug, searchParams],
+ )
+
+ // Scatter points for every candidate of the primary role; empty when there is no primary
+ // role or no recommendation for it. Dot size is round(40 + successRate / 100 * 260).
+ const primaryScatter = useMemo((): PrimaryScatterPoint[] => {
+     const rec = primaryRoleId ? recByRole.get(primaryRoleId) : undefined
+     if (!rec) return []
+
+     return rec.allCandidates.map((candidate) => {
+         const { displayName, provider, compositeScore, successRate, estimatedDailyCost, modelId } = candidate
+         return {
+             name: displayName,
+             provider,
+             score: compositeScore,
+             successRate,
+             dailyCost: Math.round(estimatedDailyCost),
+             dotSize: Math.round(40 + (successRate / 100) * 260),
+             isSelected: primaryCandidate?.modelId === modelId,
+         }
+     })
+ }, [primaryCandidate?.modelId, primaryRoleId, recByRole])
+
+ const comparePrimaryHref = primaryRoleId
+ ? `/evals/recommendations/roles/${primaryRoleId}/compare${roleQuery}`
+ : "/evals/recommendations"
+
+ return (
+
+ {/* Atmosphere */}
+
+
+
+
+
+
+
+ Evals
+
+ /
+
+ Build with Roo Code Cloud
+
+ /
+ {objective.name}
+
+
+
+
+
+
+ Objective Deep Dive
+
+
+ {objective.name}
+
+
+ {objective.description} Explore why this setup is recommended, compare options, and
+ tune it to your constraints before you start in Roo Code Cloud.
+
+
+
+
+
+ See model data
+
+
+
+ Compare candidates
+
+
+
+
+
+
+
+
+
+
+ Configuration
+
+
+
+ Tunable
+
+
+
+
+
+ Optimized for
+
+
+ {modePill.map((m) => {
+ const selected = m.id === mode
+ return (
+ onSetMode(m.id)}
+ className={[
+ "rounded-full px-3 py-1.5 text-xs font-semibold transition-colors",
+ selected
+ ? "bg-foreground/10 text-foreground"
+ : "text-muted-foreground hover:text-foreground",
+ ].join(" ")}>
+ {m.label}
+
+ )
+ })}
+
+
+
+
+
Primary model
+
+ {primaryCandidate ? (
+ <>
+
+ {primaryCandidate.displayName}
+ {" "}
+
+ ({shortProvider(primaryCandidate.provider)})
+
+ >
+ ) : (
+
+ Pick an objective to see recommendations.
+
+ )}
+
+ {primaryCandidate ? (
+
+
+
+ Score
+
+
+ {primaryCandidate.compositeScore}
+
+
+
+
+ Success
+
+
+ {primaryCandidate.successRate}%
+
+
+
+
+ Daily
+
+
+ {formatDollars(primaryCandidate.estimatedDailyCost)}
+
+
+
+ ) : null}
+
+
+
+
+
+ This is a starting point. Adjust the lineup and model picks to match your
+ repo and delivery constraints.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Recommended lineup
+
+
+ For {objective.name}, this lineup balances momentum and safety. Swap models per agent if
+ you have different constraints.
+
+
+
+ {lineup.map((roleId) => {
+ const rec = recByRole.get(roleId)
+ if (!rec) return null
+ const selected = selectedByRole.get(roleId) ?? null
+
+ return (
+
+
+
+
+ For
+
+
+ {rec.role.name}
+
+
+ {rec.role.salaryRange}
+
+
+
+
+ Details
+
+
+
+ Compare
+
+
+
+
+
+
+
+
+ Model pick
+
+
+ onSelectModel(roleId, e.target.value)}
+ className="w-full rounded-xl border border-border/60 bg-background/20 px-3 py-2.5 text-sm font-semibold text-foreground shadow-sm outline-none transition-colors focus:border-foreground/30">
+ {rec.allCandidates.slice(0, 12).map((c) => (
+
+ {c.displayName} · {shortProvider(c.provider)} ·{" "}
+ {c.compositeScore}
+
+ ))}
+
+
+
+
+
+
+
+ Success
+
+
+ {selected ? `${selected.successRate}%` : "—"}
+
+
+
+
+ Daily
+
+
+ {selected
+ ? formatDollars(selected.estimatedDailyCost)
+ : "—"}
+
+
+
+
+ Time
+
+
+ {selected
+ ? formatSeconds(selected.avgTimePerTask)
+ : "—"}
+
+
+
+
+
+
+ )
+ })}
+
+
+
+
+
+
+
+
+ Why this works
+
+
+ {(objective.whyItWorks ?? []).slice(0, 4).map((line) => (
+ - {line}
+ ))}
+
+
+ {objective.builderProfile?.howItWorks?.length ? (
+ <>
+
+ How it runs
+
+
+ {objective.builderProfile.howItWorks.slice(0, 4).map((step, idx) => (
+
+
+ {idx + 1}
+
+ {step}
+
+ ))}
+
+ >
+ ) : null}
+
+ {examplePrompt ? (
+
+
+ Example prompt
+
+
+ {examplePrompt}
+
+
+ ) : null}
+
+
+
+
+
+
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/page.tsx b/apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/page.tsx
new file mode 100644
index 00000000000..dce9f883da9
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/page.tsx
@@ -0,0 +1,102 @@
+import type { Metadata } from "next"
+import { notFound } from "next/navigation"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getRoleRecommendation } from "@/lib/mock-recommendations"
+import { getEvalOutcomeBySlug } from "@/lib/eval-outcomes"
+
+import { ObjectiveContent } from "./objective-content"
+
+/**
+ * Props this route receives. Both values are promises and are awaited before use.
+ * Search-param values are a single string, a string array, or undefined.
+ */
+type PageProps = {
+    params: Promise<{ objectiveSlug: string }>
+    searchParams?: Promise<Record<string, string | string[] | undefined>>
+}
+
+/**
+ * Type guard for the `mode` query parameter.
+ *
+ * @param value - Raw query-string value.
+ * @returns `true` only for "best", "fastest" or "cost" (case-sensitive).
+ */
+function isMode(value: string): value is "best" | "fastest" | "cost" {
+    return ["best", "fastest", "cost"].includes(value)
+}
+
+/**
+ * Builds the metadata for one objective deep-dive page.
+ *
+ * @param props - Route props; only `params` is read, to obtain `objectiveSlug`.
+ * @returns Title, description, canonical URL, Open Graph/Twitter card data and keywords for the
+ *   objective, or a minimal "not found" title and description when the slug matches no objective.
+ */
+export async function generateMetadata({ params }: PageProps): Promise<Metadata> {
+    const { objectiveSlug } = await params
+    const objective = getEvalOutcomeBySlug(objectiveSlug)
+    if (!objective) {
+        return {
+            title: "Objective Not Found | Roo Code Evals",
+            description: "The requested objective was not found.",
+        }
+    }
+
+    const title = `${objective.name} — Build Profile | Roo Code Evals`
+    const description = `Investigate the recommended lineup for ${objective.name}. Compare options, explore data, and start in Roo Code Cloud.`
+    // Shorter text handed to the OG image builder instead of the full description.
+    const ogDescription = `${objective.name} — Build Profile`
+    // The canonical path uses the objective's own slug rather than the raw route parameter.
+    const path = `/evals/recommendations/${objective.slug}`
+
+    return {
+        title,
+        description,
+        alternates: { canonical: `${SEO.url}${path}` },
+        openGraph: {
+            title,
+            description,
+            url: `${SEO.url}${path}`,
+            siteName: SEO.name,
+            images: [
+                {
+                    url: ogImageUrl(title, ogDescription),
+                    width: 1200,
+                    height: 630,
+                    alt: title,
+                },
+            ],
+            locale: SEO.locale,
+            type: "website",
+        },
+        twitter: {
+            card: SEO.twitterCard,
+            title,
+            description,
+            images: [ogImageUrl(title, ogDescription)],
+        },
+        keywords: [...SEO.keywords, "AI coding", "model recommendations", "coding evals", "roo code cloud"],
+    }
+}
+
+export default async function ObjectiveDeepDivePage({ params, searchParams }: PageProps) {
+ const { objectiveSlug } = await params
+ const objective = getEvalOutcomeBySlug(objectiveSlug)
+ if (!objective) notFound()
+
+ const sp = (await searchParams) ?? {}
+ const modeRaw = typeof sp.mode === "string" ? sp.mode : undefined
+ const initialMode = modeRaw && isMode(modeRaw) ? modeRaw : "best"
+
+ const recs: Array<{ roleId: string; recommendation: NonNullable> }> = []
+ for (const roleId of objective.recommendedRoleIds) {
+ const recommendation = getRoleRecommendation(roleId)
+ if (recommendation) recs.push({ roleId, recommendation })
+ }
+
+ return (
+
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/recommendations/layout.tsx b/apps/web-roo-code/src/app/evals/recommendations/layout.tsx
new file mode 100644
index 00000000000..7b14622976d
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/recommendations/layout.tsx
@@ -0,0 +1,5 @@
+import type { ReactNode } from "react"
+
+/**
+ * Pass-through layout for the recommendations routes: renders its children unchanged
+ * inside a fragment, adding no markup of its own.
+ */
+export default function RecommendationsLayout({ children }: { children: ReactNode }) {
+    return <>{children}</>
+}
diff --git a/apps/web-roo-code/src/app/evals/recommendations/page.tsx b/apps/web-roo-code/src/app/evals/recommendations/page.tsx
new file mode 100644
index 00000000000..035b44a417c
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/recommendations/page.tsx
@@ -0,0 +1,112 @@
+import { Suspense } from "react"
+import type { Metadata } from "next"
+import { redirect } from "next/navigation"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getEngineerRoles, getAllRecommendations } from "@/lib/mock-recommendations"
+import { EVAL_OUTCOMES, isEvalOutcomeId } from "@/lib/eval-outcomes"
+
+import { WorkersContent } from "../workers/workers-content"
+
+// ── SEO Metadata ────────────────────────────────────────────────────────────
+
+// Route-level constants reused across the metadata fields below.
+const PATH = "/evals/recommendations"
+const TITLE = "Build with Roo Code Cloud | Roo Code Evals"
+const DESCRIPTION =
+    "Outcome-first, eval-backed recommendations for shipping production code. Start from your objective and pick a tradeoff."
+// Shorter blurb handed to the OG image builder instead of the full description.
+const OG_DESCRIPTION = "Outcome-first recommendations for shipping production code"
+
+// Page-specific keywords, appended after the site-wide `SEO.keywords`.
+const PAGE_KEYWORDS = [
+    "AI coding",
+    "coding agents",
+    "roo code cloud",
+    "model recommendations",
+    "coding evals",
+    "model comparison",
+    "shipping code",
+    "prototype",
+]
+
+/**
+ * Static metadata for the recommendations landing page: title, description, canonical URL,
+ * Open Graph and Twitter card data, and keywords.
+ */
+export const metadata: Metadata = {
+    title: TITLE,
+    description: DESCRIPTION,
+    alternates: { canonical: `${SEO.url}${PATH}` },
+    openGraph: {
+        title: TITLE,
+        description: DESCRIPTION,
+        url: `${SEO.url}${PATH}`,
+        siteName: SEO.name,
+        images: [{ url: ogImageUrl(TITLE, OG_DESCRIPTION), width: 1200, height: 630, alt: TITLE }],
+        locale: SEO.locale,
+        type: "website",
+    },
+    twitter: {
+        card: SEO.twitterCard,
+        title: TITLE,
+        description: DESCRIPTION,
+        images: [ogImageUrl(TITLE, OG_DESCRIPTION)],
+    },
+    keywords: [...SEO.keywords, ...PAGE_KEYWORDS],
+}
+
+// ── Page Component ──────────────────────────────────────────────────────────
+
+/**
+ * Props this route receives. `searchParams` is a promise and is awaited before use;
+ * each value is a single string, a string array, or undefined.
+ */
+type PageProps = {
+    searchParams?: Promise<Record<string, string | string[] | undefined>>
+}
+
+export default async function RecommendationsPage({ searchParams }: PageProps) {
+ const sp = (await searchParams) ?? {}
+ const view = typeof sp.view === "string" ? sp.view : undefined
+ const outcome = typeof sp.outcome === "string" ? sp.outcome : undefined
+ const mode = typeof sp.mode === "string" ? sp.mode : undefined
+
+ // Legacy deep link: move profile investigations into the dedicated objective pages.
+ if (view === "profile" && outcome && isEvalOutcomeId(outcome)) {
+ const objective = EVAL_OUTCOMES.find((o) => o.id === outcome)
+ if (objective) {
+ const qs = mode ? `?mode=${encodeURIComponent(mode)}` : ""
+ redirect(`/evals/recommendations/${objective.slug}${qs}`)
+ }
+ }
+
+ const roles = getEngineerRoles()
+ const recommendations = getAllRecommendations()
+
+ // Aggregate totals
+ const totalEvalRuns = recommendations[0]?.totalEvalRuns ?? 0
+ const totalExercises = recommendations[0]?.totalExercises ?? 0
+ const uniqueModels = new Set(
+ recommendations.flatMap((recommendation) => recommendation.allCandidates.map((candidate) => candidate.modelId)),
+ )
+ const totalModels = uniqueModels.size
+
+ const lastUpdated = recommendations
+ .map((r) => r.lastUpdated)
+ .sort()
+ .pop()
+
+ return (
+
+
+
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/recommendations/roles/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/recommendations/roles/[roleId]/compare/page.tsx
new file mode 100644
index 00000000000..2565807e14c
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/recommendations/roles/[roleId]/compare/page.tsx
@@ -0,0 +1,85 @@
+import { notFound } from "next/navigation"
+import type { Metadata } from "next"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getEngineerRole, getRoleRecommendation } from "@/lib/mock-recommendations"
+
+import { ComparisonChart } from "../../../../workers/[roleId]/compare/comparison-chart"
+
+type PageProps = { params: Promise<{ roleId: string }> }
+
+/**
+ * Builds the metadata for a role's model-comparison page.
+ *
+ * @param props - Route props; `params` is awaited to obtain `roleId`.
+ * @returns Title, description, canonical URL, Open Graph/Twitter card data and keywords for the
+ *   role, or a minimal "not found" title and description when the role id is unknown.
+ */
+export async function generateMetadata({ params }: PageProps): Promise<Metadata> {
+    const { roleId } = await params
+    const role = getEngineerRole(roleId)
+
+    if (!role) {
+        return {
+            title: "Role Not Found | Roo Code Evals",
+            description: "The requested engineer role was not found.",
+        }
+    }
+
+    const title = `Compare Models — ${role.name} | Roo Code Evals`
+    const description = `Interactive comparison of AI models for the ${role.name} setup. Compare composite score, success rate, cost efficiency, and speed.`
+    // Shorter text handed to the OG image builder instead of the full description.
+    const ogDescription = `Compare Models — ${role.name}`
+    const path = `/evals/recommendations/roles/${roleId}/compare`
+
+    return {
+        title,
+        description,
+        alternates: {
+            canonical: `${SEO.url}${path}`,
+        },
+        openGraph: {
+            title,
+            description,
+            url: `${SEO.url}${path}`,
+            siteName: SEO.name,
+            images: [
+                {
+                    url: ogImageUrl(title, ogDescription),
+                    width: 1200,
+                    height: 630,
+                    alt: title,
+                },
+            ],
+            locale: SEO.locale,
+            type: "website",
+        },
+        twitter: {
+            card: SEO.twitterCard,
+            title,
+            description,
+            images: [ogImageUrl(title, ogDescription)],
+        },
+        // "model comparison" is listed once; the original list repeated it.
+        keywords: [
+            ...SEO.keywords,
+            "AI coding",
+            "model comparison",
+            "coding evals",
+            role.name.toLowerCase(),
+            "bar chart",
+        ],
+    }
+}
+
+export default async function RecommendationComparePage({ params }: PageProps) {
+ const { roleId } = await params
+ const recommendation = getRoleRecommendation(roleId)
+
+ if (!recommendation) {
+ notFound()
+ }
+
+ return (
+
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/recommendations/roles/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/recommendations/roles/[roleId]/page.tsx
new file mode 100644
index 00000000000..06ddc26155b
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/recommendations/roles/[roleId]/page.tsx
@@ -0,0 +1,101 @@
+import { notFound } from "next/navigation"
+import type { Metadata } from "next"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getRoleRecommendation, getCloudSetupUrl } from "@/lib/mock-recommendations"
+
+import { CandidatesContent } from "../../../workers/[roleId]/candidates-content"
+
+type PageProps = { params: Promise<{ roleId: string }> }
+
+/**
+ * Builds the metadata for a role's recommended-models page.
+ *
+ * @param props - Route props; `params` is awaited to obtain `roleId`.
+ * @returns Title, description, canonical URL, Open Graph/Twitter card data and keywords for the
+ *   role, or a minimal "not found" title and description when no recommendation exists for the id.
+ */
+export async function generateMetadata({ params }: PageProps): Promise<Metadata> {
+    const { roleId } = await params
+    const recommendation = getRoleRecommendation(roleId)
+
+    if (!recommendation) {
+        return {
+            title: "Role Not Found | Roo Code Evals",
+            description: "The requested engineer role was not found.",
+        }
+    }
+
+    const { role } = recommendation
+    const title = `${role.name} — Recommended Models | Roo Code Evals`
+    const description = `Eval-backed recommendations for ${role.name}. Compare models by success rate, cost, and speed across 5 languages.`
+    // Shorter text handed to the OG image builder instead of the full description.
+    const ogDescription = `${role.name} — Recommended Models`
+    const path = `/evals/recommendations/roles/${roleId}`
+
+    return {
+        title,
+        description,
+        alternates: {
+            canonical: `${SEO.url}${path}`,
+        },
+        openGraph: {
+            title,
+            description,
+            url: `${SEO.url}${path}`,
+            siteName: SEO.name,
+            images: [
+                {
+                    url: ogImageUrl(title, ogDescription),
+                    width: 1200,
+                    height: 630,
+                    alt: title,
+                },
+            ],
+            locale: SEO.locale,
+            type: "website",
+        },
+        twitter: {
+            card: SEO.twitterCard,
+            title,
+            description,
+            images: [ogImageUrl(title, ogDescription)],
+        },
+        keywords: [
+            ...SEO.keywords,
+            "AI coding",
+            "coding agents",
+            "model recommendations",
+            "coding evals",
+            role.name.toLowerCase(),
+            "model comparison",
+        ],
+    }
+}
+
+export default async function RecommendationRolePage({ params }: PageProps) {
+ const { roleId } = await params
+ const recommendation = getRoleRecommendation(roleId)
+
+ if (!recommendation) {
+ notFound()
+ }
+
+ const { role, best, budgetHire, speedHire, allCandidates, totalEvalRuns, totalExercises, lastUpdated } =
+ recommendation
+
+ const cloudUrls: Record = {}
+ for (const candidate of allCandidates) {
+ cloudUrls[candidate.modelId] = getCloudSetupUrl(candidate)
+ }
+
+ return (
+
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx
new file mode 100644
index 00000000000..f6e566c1f1c
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx
@@ -0,0 +1,14 @@
+import { permanentRedirect } from "next/navigation"
+
+import { buildQueryString, type RedirectSearchParams } from "../../_redirect-utils"
+
+/** Props this route receives; both values are promises and are awaited before use. */
+type PageProps = {
+    params: Promise<{ roleId: string }>
+    searchParams?: Promise<RedirectSearchParams>
+}
+
+/**
+ * Legacy `workers-v2/[roleId]/compare` route: permanently redirects to
+ * `/evals/workers/[roleId]/compare`, carrying the incoming query string across.
+ */
+export default async function WorkersV2ComparePage({ params, searchParams }: PageProps) {
+    const resolvedParams = await params
+    const resolvedSearchParams = await searchParams
+    const query = buildQueryString(resolvedSearchParams ?? {})
+    const target = `/evals/workers/${resolvedParams.roleId}/compare${query}`
+    permanentRedirect(target)
+}
diff --git a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx
new file mode 100644
index 00000000000..5b921157f8a
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx
@@ -0,0 +1,14 @@
+import { permanentRedirect } from "next/navigation"
+
+import { buildQueryString, type RedirectSearchParams } from "../_redirect-utils"
+
+/** Props this route receives; both values are promises and are awaited before use. */
+type PageProps = {
+    params: Promise<{ roleId: string }>
+    searchParams?: Promise<RedirectSearchParams>
+}
+
+/**
+ * Legacy `workers-v2/[roleId]` route: permanently redirects to `/evals/workers/[roleId]`,
+ * carrying the incoming query string across.
+ */
+export default async function WorkersV2RolePage({ params, searchParams }: PageProps) {
+    const resolvedParams = await params
+    const resolvedSearchParams = await searchParams
+    const query = buildQueryString(resolvedSearchParams ?? {})
+    const target = `/evals/workers/${resolvedParams.roleId}${query}`
+    permanentRedirect(target)
+}
diff --git a/apps/web-roo-code/src/app/evals/workers-v2/_redirect-utils.ts b/apps/web-roo-code/src/app/evals/workers-v2/_redirect-utils.ts
new file mode 100644
index 00000000000..a8884e591a6
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers-v2/_redirect-utils.ts
@@ -0,0 +1,11 @@
+/**
+ * Resolved search params as the redirect pages receive them: each value is a single
+ * string, a list of strings (repeated key), or undefined.
+ */
+export type RedirectSearchParams = Record<string, string | string[] | undefined>
+
+/**
+ * Serialises search params into a query string for a redirect target.
+ *
+ * String values are set once, array values are appended once per element, and any
+ * other value (such as undefined) is skipped.
+ *
+ * @param searchParams - Search params to carry over.
+ * @returns The encoded query string prefixed with `?`, or an empty string when nothing was serialised.
+ */
+export function buildQueryString(searchParams: RedirectSearchParams): string {
+    const query = new URLSearchParams()
+    Object.keys(searchParams).forEach((key) => {
+        const value = searchParams[key]
+        if (Array.isArray(value)) {
+            for (const item of value) query.append(key, item)
+        } else if (typeof value === "string") {
+            query.set(key, value)
+        }
+    })
+    const serialized = query.toString()
+    return serialized.length > 0 ? `?${serialized}` : ""
+}
diff --git a/apps/web-roo-code/src/app/evals/workers-v2/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/page.tsx
new file mode 100644
index 00000000000..4f067eb2769
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers-v2/page.tsx
@@ -0,0 +1,12 @@
+import { permanentRedirect } from "next/navigation"
+
+import { buildQueryString, type RedirectSearchParams } from "./_redirect-utils"
+
+/** Props this route receives; `searchParams` is a promise and is awaited before use. */
+type PageProps = {
+    searchParams?: Promise<RedirectSearchParams>
+}
+
+/**
+ * Legacy `workers-v2` index route: permanently redirects to `/evals/recommendations`,
+ * carrying the incoming query string across.
+ */
+export default async function WorkersV2Page({ searchParams }: PageProps) {
+    const resolvedSearchParams = await searchParams
+    const query = buildQueryString(resolvedSearchParams ?? {})
+    permanentRedirect(`/evals/recommendations${query}`)
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx
new file mode 100644
index 00000000000..63d4cc27329
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx
@@ -0,0 +1,1037 @@
+"use client"
+
+import { motion } from "framer-motion"
+import {
+ Code,
+ GitBranch,
+ Building2,
+ Search,
+ Bot,
+ ArrowRight,
+ ArrowLeft,
+ Trophy,
+ DollarSign,
+ Zap,
+ ExternalLink,
+ CheckCircle2,
+ AlertTriangle,
+ FlaskConical,
+ BarChart3,
+ Beaker,
+} from "lucide-react"
+import type { LucideIcon } from "lucide-react"
+import Link from "next/link"
+import { useSearchParams } from "next/navigation"
+
+import type { ModelCandidate, LanguageScores, EngineerRole } from "@/lib/mock-recommendations"
+
+import { CopySettingsButton } from "./copy-settings-button"
+
+// ── Icon Mapping ────────────────────────────────────────────────────────────
+
+/**
+ * Lookup from an icon name to its lucide-react component.
+ * `CandidatesContent` indexes this with `role.icon` and falls back to `Code`
+ * when the name is not listed here.
+ */
+const ICON_MAP: Record = {
+ Code,
+ GitBranch,
+ Building2,
+ Search,
+ Bot,
+}
+
+// ── Role Color Themes ───────────────────────────────────────────────────────
+
+/**
+ * Bundle of Tailwind class strings that colors the page for one role.
+ * Every field holds one or more utility classes (see `ROLE_THEMES` for the
+ * concrete values), except `accent`, which holds the bare color name.
+ */
+type RoleTheme = {
+ accent: string
+ accentLight: string
+ accentDark: string
+ iconBg: string
+ iconText: string
+ badgeBg: string
+ badgeText: string
+ borderHover: string
+ shadowHover: string
+ buttonBg: string
+ buttonHover: string
+ glowColor: string
+ gradientFrom: string
+ gradientVia: string
+ ringColor: string
+ scoreText: string
+ scoreBg: string
+ blurBg1: string
+ blurBg2: string
+ methodologyBorder: string
+}
+
+/**
+ * Theme table keyed by role id (`junior`, `senior`, `staff`, `reviewer`,
+ * `autonomous`). The five entries are structurally identical and differ only
+ * in the color name (emerald, blue, amber, violet, cyan).
+ * `CandidatesContent` indexes this with `roleId`.
+ */
+const ROLE_THEMES: Record = {
+ junior: {
+ accent: "emerald",
+ accentLight: "text-emerald-600",
+ accentDark: "dark:text-emerald-400",
+ iconBg: "bg-emerald-100 dark:bg-emerald-900/30",
+ iconText: "text-emerald-700 dark:text-emerald-300",
+ badgeBg: "bg-emerald-100 dark:bg-emerald-900/30",
+ badgeText: "text-emerald-700 dark:text-emerald-300",
+ borderHover: "hover:border-emerald-500/40 dark:hover:border-emerald-400/30",
+ shadowHover: "hover:shadow-emerald-500/10 dark:hover:shadow-emerald-400/10",
+ buttonBg: "bg-emerald-600 dark:bg-emerald-600",
+ buttonHover: "hover:bg-emerald-700 dark:hover:bg-emerald-500",
+ glowColor: "bg-emerald-500/8 dark:bg-emerald-600/15",
+ gradientFrom: "from-emerald-500",
+ gradientVia: "via-emerald-400",
+ ringColor: "ring-emerald-500/30",
+ scoreText: "text-emerald-400",
+ scoreBg: "bg-emerald-500/10 border-emerald-500/20",
+ blurBg1: "bg-emerald-500/10 dark:bg-emerald-600/20",
+ blurBg2: "bg-emerald-400/5 dark:bg-emerald-500/10",
+ methodologyBorder: "border-emerald-500/30 hover:border-emerald-500/50",
+ },
+ senior: {
+ accent: "blue",
+ accentLight: "text-blue-600",
+ accentDark: "dark:text-blue-400",
+ iconBg: "bg-blue-100 dark:bg-blue-900/30",
+ iconText: "text-blue-700 dark:text-blue-300",
+ badgeBg: "bg-blue-100 dark:bg-blue-900/30",
+ badgeText: "text-blue-700 dark:text-blue-300",
+ borderHover: "hover:border-blue-500/40 dark:hover:border-blue-400/30",
+ shadowHover: "hover:shadow-blue-500/10 dark:hover:shadow-blue-400/10",
+ buttonBg: "bg-blue-600 dark:bg-blue-600",
+ buttonHover: "hover:bg-blue-700 dark:hover:bg-blue-500",
+ glowColor: "bg-blue-500/8 dark:bg-blue-600/15",
+ gradientFrom: "from-blue-500",
+ gradientVia: "via-blue-400",
+ ringColor: "ring-blue-500/30",
+ scoreText: "text-blue-400",
+ scoreBg: "bg-blue-500/10 border-blue-500/20",
+ blurBg1: "bg-blue-500/10 dark:bg-blue-600/20",
+ blurBg2: "bg-blue-400/5 dark:bg-blue-500/10",
+ methodologyBorder: "border-blue-500/30 hover:border-blue-500/50",
+ },
+ staff: {
+ accent: "amber",
+ accentLight: "text-amber-600",
+ accentDark: "dark:text-amber-400",
+ iconBg: "bg-amber-100 dark:bg-amber-900/30",
+ iconText: "text-amber-700 dark:text-amber-300",
+ badgeBg: "bg-amber-100 dark:bg-amber-900/30",
+ badgeText: "text-amber-700 dark:text-amber-300",
+ borderHover: "hover:border-amber-500/40 dark:hover:border-amber-400/30",
+ shadowHover: "hover:shadow-amber-500/10 dark:hover:shadow-amber-400/10",
+ buttonBg: "bg-amber-600 dark:bg-amber-600",
+ buttonHover: "hover:bg-amber-700 dark:hover:bg-amber-500",
+ glowColor: "bg-amber-500/8 dark:bg-amber-600/15",
+ gradientFrom: "from-amber-500",
+ gradientVia: "via-amber-400",
+ ringColor: "ring-amber-500/30",
+ scoreText: "text-amber-400",
+ scoreBg: "bg-amber-500/10 border-amber-500/20",
+ blurBg1: "bg-amber-500/10 dark:bg-amber-600/20",
+ blurBg2: "bg-amber-400/5 dark:bg-amber-500/10",
+ methodologyBorder: "border-amber-500/30 hover:border-amber-500/50",
+ },
+ reviewer: {
+ accent: "violet",
+ accentLight: "text-violet-600",
+ accentDark: "dark:text-violet-400",
+ iconBg: "bg-violet-100 dark:bg-violet-900/30",
+ iconText: "text-violet-700 dark:text-violet-300",
+ badgeBg: "bg-violet-100 dark:bg-violet-900/30",
+ badgeText: "text-violet-700 dark:text-violet-300",
+ borderHover: "hover:border-violet-500/40 dark:hover:border-violet-400/30",
+ shadowHover: "hover:shadow-violet-500/10 dark:hover:shadow-violet-400/10",
+ buttonBg: "bg-violet-600 dark:bg-violet-600",
+ buttonHover: "hover:bg-violet-700 dark:hover:bg-violet-500",
+ glowColor: "bg-violet-500/8 dark:bg-violet-600/15",
+ gradientFrom: "from-violet-500",
+ gradientVia: "via-violet-400",
+ ringColor: "ring-violet-500/30",
+ scoreText: "text-violet-400",
+ scoreBg: "bg-violet-500/10 border-violet-500/20",
+ blurBg1: "bg-violet-500/10 dark:bg-violet-600/20",
+ blurBg2: "bg-violet-400/5 dark:bg-violet-500/10",
+ methodologyBorder: "border-violet-500/30 hover:border-violet-500/50",
+ },
+ autonomous: {
+ accent: "cyan",
+ accentLight: "text-cyan-600",
+ accentDark: "dark:text-cyan-400",
+ iconBg: "bg-cyan-100 dark:bg-cyan-900/30",
+ iconText: "text-cyan-700 dark:text-cyan-300",
+ badgeBg: "bg-cyan-100 dark:bg-cyan-900/30",
+ badgeText: "text-cyan-700 dark:text-cyan-300",
+ borderHover: "hover:border-cyan-500/40 dark:hover:border-cyan-400/30",
+ shadowHover: "hover:shadow-cyan-500/10 dark:hover:shadow-cyan-400/10",
+ buttonBg: "bg-cyan-600 dark:bg-cyan-600",
+ buttonHover: "hover:bg-cyan-700 dark:hover:bg-cyan-500",
+ glowColor: "bg-cyan-500/8 dark:bg-cyan-600/15",
+ gradientFrom: "from-cyan-500",
+ gradientVia: "via-cyan-400",
+ ringColor: "ring-cyan-500/30",
+ scoreText: "text-cyan-400",
+ scoreBg: "bg-cyan-500/10 border-cyan-500/20",
+ blurBg1: "bg-cyan-500/10 dark:bg-cyan-600/20",
+ blurBg2: "bg-cyan-400/5 dark:bg-cyan-500/10",
+ methodologyBorder: "border-cyan-500/30 hover:border-cyan-500/50",
+ },
+}
+
+// Theme used when `roleId` has no entry in ROLE_THEMES: the blue "senior" palette.
+const DEFAULT_THEME = ROLE_THEMES.senior!
+
+// ── Framer Motion Variants ──────────────────────────────────────────────────
+
+// Each variants object below defines a `hidden` and a `visible` state.
+
+// Parent container: fades in and staggers its children by 0.12 after a 0.1 delay.
+const containerVariants = {
+ hidden: { opacity: 0 },
+ visible: {
+ opacity: 1,
+ transition: {
+ staggerChildren: 0.12,
+ delayChildren: 0.1,
+ },
+ },
+}
+
+// Card: fades in while moving up from a y offset of 30 over 0.6.
+const cardVariants = {
+ hidden: { opacity: 0, y: 30 },
+ visible: {
+ opacity: 1,
+ y: 0,
+ transition: {
+ duration: 0.6,
+ ease: [0.21, 0.45, 0.27, 0.9] as const,
+ },
+ },
+}
+
+// Same easing and duration as cardVariants, with a y offset of 20 instead of 30.
+const fadeUpVariants = {
+ hidden: { opacity: 0, y: 20 },
+ visible: {
+ opacity: 1,
+ y: 0,
+ transition: {
+ duration: 0.6,
+ ease: [0.21, 0.45, 0.27, 0.9] as const,
+ },
+ },
+}
+
+// Background: opacity-only fade over 1.2 with "easeOut".
+const backgroundVariants = {
+ hidden: { opacity: 0 },
+ visible: {
+ opacity: 1,
+ transition: {
+ duration: 1.2,
+ ease: "easeOut" as const,
+ },
+ },
+}
+
+// Table row: fades in while moving horizontally from an x offset of -10 over 0.4.
+const tableRowVariants = {
+ hidden: { opacity: 0, x: -10 },
+ visible: {
+ opacity: 1,
+ x: 0,
+ transition: {
+ duration: 0.4,
+ ease: [0.21, 0.45, 0.27, 0.9] as const,
+ },
+ },
+}
+
+// ── Helpers ─────────────────────────────────────────────────────────────────
+
+/**
+ * Picks the Tailwind badge classes for a numeric score.
+ *
+ * Bands, checked from the top: 85 and above is green, 70 and above is blue,
+ * 50 and above is yellow, and everything else is red.
+ *
+ * @param score - Score to classify.
+ * @returns The background, text and border classes for the matching band.
+ */
+function scoreBadgeColor(score: number): string {
+	// Ordered from the highest floor down so the first match wins.
+	const bands: ReadonlyArray<readonly [number, string]> = [
+		[85, "bg-green-500/10 text-green-400 border border-green-500/20"],
+		[70, "bg-blue-500/10 text-blue-400 border border-blue-500/20"],
+		[50, "bg-yellow-500/10 text-yellow-400 border border-yellow-500/20"],
+	]
+
+	for (const [floor, classes] of bands) {
+		if (score >= floor) return classes
+	}
+	return "bg-red-500/10 text-red-400 border border-red-500/20"
+}
+
+/**
+ * Maps a candidate tier to the pill text and Tailwind classes used to render it.
+ *
+ * @param tier - The candidate's tier: `best`, `recommended`, `situational` or `not-recommended`.
+ * @returns The display label and the color classes for that tier.
+ */
+function tierBadge(tier: ModelCandidate["tier"]): { label: string; className: string } {
+	const pill = (label: string, className: string) => ({ label, className })
+
+	switch (tier) {
+		case "best":
+			return pill("Best", "bg-green-500/10 text-green-400 border border-green-500/20")
+		case "recommended":
+			return pill("Recommended", "bg-blue-500/10 text-blue-400 border border-blue-500/20")
+		case "situational":
+			return pill("Situational", "bg-yellow-500/10 text-yellow-400 border border-yellow-500/20")
+		case "not-recommended":
+			return pill("Not Recommended", "bg-red-500/10 text-red-400 border border-red-500/20")
+	}
+}
+
+// Medal emoji indexed by zero-based rank; `CandidateCard` only reads it when `rank < 3`.
+const RANK_BADGES = ["🥇", "🥈", "🥉"]
+
+// Per-language display config iterated by `LanguageBars`: the `LanguageScores` key,
+// a short label, and a solid (`color`) plus translucent (`bgColor`) background class.
+const LANGUAGE_CONFIG: { key: keyof LanguageScores; label: string; color: string; bgColor: string }[] = [
+ { key: "python", label: "Python", color: "bg-green-500", bgColor: "bg-green-500/20" },
+ { key: "javascript", label: "JS", color: "bg-yellow-500", bgColor: "bg-yellow-500/20" },
+ { key: "java", label: "Java", color: "bg-orange-500", bgColor: "bg-orange-500/20" },
+ { key: "go", label: "Go", color: "bg-cyan-500", bgColor: "bg-cyan-500/20" },
+ { key: "rust", label: "Rust", color: "bg-red-500", bgColor: "bg-red-500/20" },
+]
+
+/**
+ * Builds the one-line summary of a candidate's recommended settings.
+ *
+ * @param candidate - Candidate whose `settings` are summarized.
+ * @returns `temp=<temperature>`, followed by `, reasoning=<effort>` when a
+ * truthy `reasoningEffort` is set.
+ */
+function settingsLabel(candidate: ModelCandidate): string {
+	const { temperature, reasoningEffort } = candidate.settings
+	const temperaturePart = `temp=${temperature}`
+	return reasoningEffort ? `${temperaturePart}, reasoning=${reasoningEffort}` : temperaturePart
+}
+
+// ── Language Score Bars ─────────────────────────────────────────────────────
+
+function LanguageBars({ scores }: { scores: LanguageScores }) {
+ return (
+
+ {LANGUAGE_CONFIG.map(({ key, label, color, bgColor }) => {
+ const value = scores[key]
+ return (
+
+ )
+ })}
+
+ )
+}
+
+// ── Composite Score Ring ────────────────────────────────────────────────────
+
+function ScoreRing({ score, theme }: { score: number; theme: RoleTheme }) {
+ const circumference = 2 * Math.PI * 40
+ const strokeDashoffset = circumference - (score / 100) * circumference
+
+ return (
+
+ )
+}
+
+// ── Candidate Card ──────────────────────────────────────────────────────────
+
+function CandidateCard({
+ candidate,
+ rank,
+ theme,
+ cloudUrl,
+ highlight,
+}: {
+ candidate: ModelCandidate
+ rank?: number
+ theme: RoleTheme
+ cloudUrl: string
+ highlight?: "cost" | "speed"
+}) {
+ const tier = tierBadge(candidate.tier)
+ const copySettings = {
+ provider: candidate.provider,
+ model: candidate.modelId,
+ temperature: candidate.settings.temperature,
+ ...(candidate.settings.reasoningEffort ? { reasoningEffort: candidate.settings.reasoningEffort } : {}),
+ }
+
+ return (
+
+ {/* Subtle glow on hover */}
+
+
+
+ {/* Rank badge */}
+ {rank !== undefined && rank < 3 && (
+
+ {RANK_BADGES[rank]}
+
+ )}
+
+ {/* Provider label */}
+
+ {candidate.provider}
+
+
+ {/* Model name */}
+
{candidate.displayName}
+
+ {/* Tier pill */}
+
+
+ {tier.label}
+
+
+
+ {/* Score ring */}
+
+
+
+
+ {/* Key metrics grid */}
+
+
+
Success
+
{candidate.successRate}%
+
+
+
+ Daily Cost
+
+
+ ${Math.round(candidate.estimatedDailyCost)}/day
+
+
+ (${candidate.avgCostPerTask.toFixed(3)}/task)
+
+
+
+
Avg Time
+
+ {candidate.avgTimePerTask.toFixed(1)}s
+
+
+
+
+ {/* Per-language breakdown */}
+
+
+ Language Scores
+
+
+
+
+ {/* Recommended settings */}
+
+ {settingsLabel(candidate)}
+
+
+ {/* Caveats */}
+ {candidate.caveats && candidate.caveats.length > 0 && (
+
+ {candidate.caveats.map((caveat) => (
+
+
+ {caveat}
+
+ ))}
+
+ )}
+
+ {/* CTAs */}
+
+
+
+ )
+}
+
+// ── Compact Card (Budget / Speed) ───────────────────────────────────────────
+
+function CompactCard({
+ candidate,
+ label,
+ icon: IconComp,
+ highlight,
+ theme,
+ cloudUrl,
+}: {
+ candidate: ModelCandidate
+ label: string
+ icon: LucideIcon
+ highlight: "cost" | "speed"
+ theme: RoleTheme
+ cloudUrl: string
+}) {
+ const tier = tierBadge(candidate.tier)
+ const copySettings = {
+ provider: candidate.provider,
+ model: candidate.modelId,
+ temperature: candidate.settings.temperature,
+ ...(candidate.settings.reasoningEffort ? { reasoningEffort: candidate.settings.reasoningEffort } : {}),
+ }
+
+ return (
+
+ {/* Subtle glow on hover */}
+
+
+
+ {/* Label header */}
+
+
+
+ {/* Left: model info + score */}
+
+
+ {candidate.provider}
+
+
{candidate.displayName}
+
+ {/* Score + tier */}
+
+ {candidate.compositeScore}
+
+ {tier.label}
+
+
+
+
+ {/* Right: highlighted metric */}
+
+
+ {highlight === "cost" ? "Daily Cost" : "Avg Time"}
+
+
+ {highlight === "cost"
+ ? `$${Math.round(candidate.estimatedDailyCost)}/day`
+ : `${candidate.avgTimePerTask.toFixed(1)}s`}
+
+ {highlight === "cost" && (
+
+ (${candidate.avgCostPerTask.toFixed(3)}/task)
+
+ )}
+
+
+
+ {/* Metrics row */}
+
+
+
Success
+
{candidate.successRate}%
+
+
+
+ Daily Cost
+
+
+ ${Math.round(candidate.estimatedDailyCost)}
+
+
+ (${candidate.avgCostPerTask.toFixed(3)}/task)
+
+
+
+
Time
+
{candidate.avgTimePerTask.toFixed(1)}s
+
+
+
+ {/* Language bars */}
+
+
+
+
+ {/* Settings */}
+
+ {settingsLabel(candidate)}
+
+
+ {/* CTAs */}
+
+
+
+ )
+}
+
+// ── Props ───────────────────────────────────────────────────────────────────
+
+/** Props accepted by `CandidatesContent`. */
+export type CandidatesContentProps = {
+ // Key into ROLE_THEMES; ids without an entry fall back to DEFAULT_THEME.
+ roleId: string
+ // Role being shown; `role.icon` is looked up in ICON_MAP.
+ role: EngineerRole
+ // Candidates listed in the "Top Models" section, mapped in array order.
+ best: ModelCandidate[]
+ // Optional highlights; the section that hosts them is only evaluated when at least one is non-null.
+ budgetHire: ModelCandidate | null
+ speedHire: ModelCandidate | null
+ // Rows of the "All Models" table, numbered from 1 in array order.
+ allCandidates: ModelCandidate[]
+ // Counters formatted with `toLocaleString()`.
+ totalEvalRuns: number
+ totalExercises: number
+ // Passed to `new Date(...)` and formatted as an en-US short date in UTC.
+ lastUpdated: string
+ // NOTE(review): how this map is keyed and consumed is not visible in this chunk — confirm against the caller.
+ cloudUrls: Record
+ // Root of the section used to build the "home" link; defaults to `/evals/recommendations`.
+ workersRootPath?: string
+ /**
+ * Base path for role detail routes, without the role id.
+ * Examples: `/evals/workers`, `/evals/recommendations/roles`.
+ */
+ roleBasePath?: string
+}
+
+// ── Main Content Component ──────────────────────────────────────────────────
+
+export function CandidatesContent({
+ roleId,
+ role,
+ best,
+ budgetHire,
+ speedHire,
+ allCandidates,
+ totalEvalRuns,
+ totalExercises,
+ lastUpdated,
+ cloudUrls,
+ workersRootPath = "/evals/recommendations",
+ roleBasePath = workersRootPath,
+}: CandidatesContentProps) {
+ const searchParams = useSearchParams()
+ const theme = ROLE_THEMES[roleId] ?? DEFAULT_THEME
+ const IconComponent = ICON_MAP[role.icon] ?? Code
+ const outcome = searchParams.get("outcome")
+ const objectiveSlug = searchParams.get("objective")
+ const mode = searchParams.get("mode")
+ const setupQuery = (() => {
+ if (!outcome && !objectiveSlug) return ""
+ const params = new URLSearchParams()
+ if (outcome) params.set("outcome", outcome)
+ if (objectiveSlug) params.set("objective", objectiveSlug)
+ if (mode) params.set("mode", mode)
+ return `?${params.toString()}`
+ })()
+ let homeHref = `${workersRootPath}${setupQuery}`
+ if (objectiveSlug && workersRootPath === "/evals/recommendations") {
+ homeHref = `${workersRootPath}/${objectiveSlug}`
+ if (mode) homeHref += `?mode=${encodeURIComponent(mode)}`
+ }
+
+ return (
+ <>
+ {/* ── Role Header ────────────────────────────────────────────── */}
+
+ {/* Atmospheric blur gradient background */}
+
+
+
+
+
+
+ {/* Breadcrumb */}
+
+
+ Evals
+
+ /
+
+ Build with Roo Code Cloud
+
+ /
+ {role.name}
+
+
+ {/* Icon + Title row */}
+
+
+
+
+
+
{role.name}
+
+ {role.salaryRange}
+
+
+ {role.description}
+
+
+
+
+ {/* Stats bar */}
+
+
+
+
+ {totalEvalRuns.toLocaleString()}
+
+ eval runs
+
+
+
+
+
+ {totalExercises.toLocaleString()}
+
+ exercises
+
+
+
+ Updated{" "}
+
+ {new Date(lastUpdated).toLocaleDateString("en-US", {
+ month: "short",
+ day: "numeric",
+ year: "numeric",
+ timeZone: "UTC",
+ })}
+
+
+
+
+
+ Methodology
+
+
+
+
+ {/* Strengths + Trade-offs grid */}
+
+
+
+ Strengths
+
+ {role.strengths.map((s) => (
+
+
+ {s}
+
+ ))}
+
+
+
+ Trade-offs
+
+ {role.weaknesses.map((w) => (
+
+ ))}
+
+
+
+
+
+
+ {/* ── Top Models: Best Overall ────────────────────────────────── */}
+
+
+
+
+
+ Top Models
+
+
+
+ {best.map((candidate, i) => (
+
+ ))}
+
+
+
+
+
+ {/* ── Budget & Speed Hire ─────────────────────────────────────── */}
+ {(budgetHire || speedHire) && (
+
+
+
+ {budgetHire && (
+
+ )}
+ {speedHire && (
+
+ )}
+
+
+
+ )}
+
+ {/* ── All Models Table ────────────────────────────────────────── */}
+
+ {/* Subtle background */}
+
+
+
+
+
+
+
+ All Models
+
+
+
+
+
+
+
+ #
+
+
+ Model
+
+
+ Provider
+
+
+ Score
+
+
+ Tier
+
+
+ Success
+
+
+ Daily Cost
+
+
+ Time
+
+
+
+
+ {allCandidates.map((candidate, i) => {
+ const tier = tierBadge(candidate.tier)
+ return (
+
+
+ {i + 1}
+
+ {candidate.displayName}
+
+ {candidate.provider}
+
+
+
+ {candidate.compositeScore}
+
+
+
+
+ {tier.label}
+
+
+
+ {candidate.successRate}%
+
+
+ ${Math.round(candidate.estimatedDailyCost)}
+
+ (${candidate.avgCostPerTask.toFixed(3)})
+
+
+
+ {candidate.avgTimePerTask.toFixed(1)}s
+
+
+ )
+ })}
+
+
+
+
+ {/* Compare link */}
+
+
+ 📊 Compare all candidates
+
+
+
+
+
+
+
+ {/* ── Bottom Navigation ───────────────────────────────────────── */}
+
+
+
+
+
+
+ Back to all roles
+
+
+
+
+ 📊 Compare candidates
+
+
+ 📋 Raw eval data
+
+
+
+
+
+
+ >
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx
new file mode 100644
index 00000000000..38e6dec6812
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx
@@ -0,0 +1,1054 @@
+"use client"
+
+import { useState, useMemo, useCallback } from "react"
+import Link from "next/link"
+import { useSearchParams } from "next/navigation"
+import { motion } from "framer-motion"
+import {
+ ArrowLeft,
+ ArrowRight,
+ Copy,
+ Check,
+ FileJson,
+ FileSpreadsheet,
+ BarChart3,
+ SlidersHorizontal,
+ Download,
+ FlaskConical,
+} from "lucide-react"
+import {
+ BarChart,
+ Bar,
+ XAxis,
+ YAxis,
+ Tooltip,
+ ResponsiveContainer,
+ Legend,
+ ScatterChart,
+ Scatter,
+ ZAxis,
+ Cell,
+ ReferenceArea,
+} from "recharts"
+
+import type { ModelCandidate, LanguageScores, EngineerRole, RoleRecommendation } from "@/lib/mock-recommendations"
+import { TASKS_PER_DAY } from "@/lib/mock-recommendations"
+
+// ── Role Color Themes (matching candidates-content.tsx) ─────────────────────
+
+/**
+ * Bundle of Tailwind class strings that colors the comparison page for one
+ * role. Every field holds one or more utility classes (see `ROLE_THEMES` for
+ * the concrete values), except `accent`, which holds the bare color name.
+ */
+type RoleTheme = {
+ accent: string
+ accentLight: string
+ accentDark: string
+ iconBg: string
+ iconText: string
+ buttonBg: string
+ buttonHover: string
+ glowColor: string
+ blurBg1: string
+ blurBg2: string
+ borderHover: string
+ shadowHover: string
+ methodologyBorder: string
+ scoreText: string
+ pillActive: string
+ pillActiveBg: string
+ checkboxAccent: string
+ sliderAccent: string
+}
+
+/**
+ * Theme table keyed by role id (`junior`, `senior`, `staff`, `reviewer`,
+ * `autonomous`). The five entries are structurally identical and differ only
+ * in the color name (emerald, blue, amber, violet, cyan).
+ * `ComparisonChart` indexes this with `roleId`.
+ */
+const ROLE_THEMES: Record = {
+ junior: {
+ accent: "emerald",
+ accentLight: "text-emerald-600",
+ accentDark: "dark:text-emerald-400",
+ iconBg: "bg-emerald-100 dark:bg-emerald-900/30",
+ iconText: "text-emerald-700 dark:text-emerald-300",
+ buttonBg: "bg-emerald-600 dark:bg-emerald-600",
+ buttonHover: "hover:bg-emerald-700 dark:hover:bg-emerald-500",
+ glowColor: "bg-emerald-500/8 dark:bg-emerald-600/15",
+ blurBg1: "bg-emerald-500/10 dark:bg-emerald-600/20",
+ blurBg2: "bg-emerald-400/5 dark:bg-emerald-500/10",
+ borderHover: "hover:border-emerald-500/40 dark:hover:border-emerald-400/30",
+ shadowHover: "hover:shadow-emerald-500/10 dark:hover:shadow-emerald-400/10",
+ methodologyBorder: "border-emerald-500/30 hover:border-emerald-500/50",
+ scoreText: "text-emerald-400",
+ pillActive: "bg-emerald-600 text-white shadow-lg shadow-emerald-600/25",
+ pillActiveBg: "bg-emerald-600",
+ checkboxAccent: "accent-emerald-600",
+ sliderAccent: "accent-emerald-600",
+ },
+ senior: {
+ accent: "blue",
+ accentLight: "text-blue-600",
+ accentDark: "dark:text-blue-400",
+ iconBg: "bg-blue-100 dark:bg-blue-900/30",
+ iconText: "text-blue-700 dark:text-blue-300",
+ buttonBg: "bg-blue-600 dark:bg-blue-600",
+ buttonHover: "hover:bg-blue-700 dark:hover:bg-blue-500",
+ glowColor: "bg-blue-500/8 dark:bg-blue-600/15",
+ blurBg1: "bg-blue-500/10 dark:bg-blue-600/20",
+ blurBg2: "bg-blue-400/5 dark:bg-blue-500/10",
+ borderHover: "hover:border-blue-500/40 dark:hover:border-blue-400/30",
+ shadowHover: "hover:shadow-blue-500/10 dark:hover:shadow-blue-400/10",
+ methodologyBorder: "border-blue-500/30 hover:border-blue-500/50",
+ scoreText: "text-blue-400",
+ pillActive: "bg-blue-600 text-white shadow-lg shadow-blue-600/25",
+ pillActiveBg: "bg-blue-600",
+ checkboxAccent: "accent-blue-600",
+ sliderAccent: "accent-blue-600",
+ },
+ staff: {
+ accent: "amber",
+ accentLight: "text-amber-600",
+ accentDark: "dark:text-amber-400",
+ iconBg: "bg-amber-100 dark:bg-amber-900/30",
+ iconText: "text-amber-700 dark:text-amber-300",
+ buttonBg: "bg-amber-600 dark:bg-amber-600",
+ buttonHover: "hover:bg-amber-700 dark:hover:bg-amber-500",
+ glowColor: "bg-amber-500/8 dark:bg-amber-600/15",
+ blurBg1: "bg-amber-500/10 dark:bg-amber-600/20",
+ blurBg2: "bg-amber-400/5 dark:bg-amber-500/10",
+ borderHover: "hover:border-amber-500/40 dark:hover:border-amber-400/30",
+ shadowHover: "hover:shadow-amber-500/10 dark:hover:shadow-amber-400/10",
+ methodologyBorder: "border-amber-500/30 hover:border-amber-500/50",
+ scoreText: "text-amber-400",
+ pillActive: "bg-amber-600 text-white shadow-lg shadow-amber-600/25",
+ pillActiveBg: "bg-amber-600",
+ checkboxAccent: "accent-amber-600",
+ sliderAccent: "accent-amber-600",
+ },
+ reviewer: {
+ accent: "violet",
+ accentLight: "text-violet-600",
+ accentDark: "dark:text-violet-400",
+ iconBg: "bg-violet-100 dark:bg-violet-900/30",
+ iconText: "text-violet-700 dark:text-violet-300",
+ buttonBg: "bg-violet-600 dark:bg-violet-600",
+ buttonHover: "hover:bg-violet-700 dark:hover:bg-violet-500",
+ glowColor: "bg-violet-500/8 dark:bg-violet-600/15",
+ blurBg1: "bg-violet-500/10 dark:bg-violet-600/20",
+ blurBg2: "bg-violet-400/5 dark:bg-violet-500/10",
+ borderHover: "hover:border-violet-500/40 dark:hover:border-violet-400/30",
+ shadowHover: "hover:shadow-violet-500/10 dark:hover:shadow-violet-400/10",
+ methodologyBorder: "border-violet-500/30 hover:border-violet-500/50",
+ scoreText: "text-violet-400",
+ pillActive: "bg-violet-600 text-white shadow-lg shadow-violet-600/25",
+ pillActiveBg: "bg-violet-600",
+ checkboxAccent: "accent-violet-600",
+ sliderAccent: "accent-violet-600",
+ },
+ autonomous: {
+ accent: "cyan",
+ accentLight: "text-cyan-600",
+ accentDark: "dark:text-cyan-400",
+ iconBg: "bg-cyan-100 dark:bg-cyan-900/30",
+ iconText: "text-cyan-700 dark:text-cyan-300",
+ buttonBg: "bg-cyan-600 dark:bg-cyan-600",
+ buttonHover: "hover:bg-cyan-700 dark:hover:bg-cyan-500",
+ glowColor: "bg-cyan-500/8 dark:bg-cyan-600/15",
+ blurBg1: "bg-cyan-500/10 dark:bg-cyan-600/20",
+ blurBg2: "bg-cyan-400/5 dark:bg-cyan-500/10",
+ borderHover: "hover:border-cyan-500/40 dark:hover:border-cyan-400/30",
+ shadowHover: "hover:shadow-cyan-500/10 dark:hover:shadow-cyan-400/10",
+ methodologyBorder: "border-cyan-500/30 hover:border-cyan-500/50",
+ scoreText: "text-cyan-400",
+ pillActive: "bg-cyan-600 text-white shadow-lg shadow-cyan-600/25",
+ pillActiveBg: "bg-cyan-600",
+ checkboxAccent: "accent-cyan-600",
+ sliderAccent: "accent-cyan-600",
+ },
+}
+
+// Theme used when `roleId` has no entry in ROLE_THEMES: the blue "senior" palette.
+const DEFAULT_THEME = ROLE_THEMES.senior!
+
+// ── Framer Motion Variants ──────────────────────────────────────────────────
+
+// Each variants object below defines a `hidden` and a `visible` state.
+
+// Parent container: fades in and staggers its children by 0.12 after a 0.1 delay.
+const containerVariants = {
+ hidden: { opacity: 0 },
+ visible: {
+ opacity: 1,
+ transition: {
+ staggerChildren: 0.12,
+ delayChildren: 0.1,
+ },
+ },
+}
+
+// Fades in while moving up from a y offset of 20 over 0.6.
+const fadeUpVariants = {
+ hidden: { opacity: 0, y: 20 },
+ visible: {
+ opacity: 1,
+ y: 0,
+ transition: {
+ duration: 0.6,
+ ease: [0.21, 0.45, 0.27, 0.9] as const,
+ },
+ },
+}
+
+// Background: opacity-only fade over 1.2 with "easeOut".
+const backgroundVariants = {
+ hidden: { opacity: 0 },
+ visible: {
+ opacity: 1,
+ transition: {
+ duration: 1.2,
+ ease: "easeOut" as const,
+ },
+ },
+}
+
+// ── Constants ───────────────────────────────────────────────────────────────
+
+// Languages offered in the score view, as `LanguageScores` key plus display label.
+const LANGUAGES: { key: keyof LanguageScores; label: string }[] = [
+ { key: "go", label: "Go" },
+ { key: "java", label: "Java" },
+ { key: "javascript", label: "JavaScript" },
+ { key: "python", label: "Python" },
+ { key: "rust", label: "Rust" },
+]
+
+// Known provider ids. `ComparisonChart` seeds its enabled-provider set with all of
+// them and derives the providers present in the data by filtering this list.
+const PROVIDERS = [
+ "anthropic",
+ "openai",
+ "google",
+ "deepseek",
+ "groq",
+ "alibaba",
+ "mistral",
+ "xai",
+ "moonshot",
+] as const
+
+// Display name for each provider id in PROVIDERS.
+const PROVIDER_LABELS: Record = {
+ anthropic: "Anthropic",
+ openai: "OpenAI",
+ google: "Google",
+ deepseek: "DeepSeek",
+ groq: "Meta/Groq",
+ alibaba: "Alibaba",
+ mistral: "Mistral",
+ xai: "xAI",
+ moonshot: "Moonshot",
+}
+
+// Hex colors, one per charted dimension.
+const DIMENSION_COLORS = {
+ composite: "#3b82f6", // blue
+ success: "#22c55e", // green
+ cost: "#f59e0b", // amber
+ speed: "#a855f7", // purple
+}
+
+// Hex color per candidate tier.
+const TIER_COLORS: Record = {
+ best: "#22c55e", // green
+ recommended: "#3b82f6", // blue
+ situational: "#eab308", // yellow
+ "not-recommended": "#ef4444", // red
+}
+
+// Display label per candidate tier; `ScatterTooltip` looks up "situational" when a point has no tier.
+const TIER_LABELS: Record = {
+ best: "Best",
+ recommended: "Recommended",
+ situational: "Situational",
+ "not-recommended": "Not Recommended",
+}
+
+// ── Helpers ─────────────────────────────────────────────────────────────────
+
+/**
+ * Normalize cost: lower cost → higher bar (0–100).
+ *
+ * @param cost - Cost of the candidate being plotted.
+ * @param maxCost - Cost that maps to 0; the caller passes the largest cost in the set.
+ * @returns An integer from 0 to 100: 100 when `cost` is 0, 0 when `cost` equals `maxCost`.
+ */
+function normalizeCost(cost: number, maxCost: number): number {
+	// A zero ceiling would divide by zero; treat every candidate as maximally efficient.
+	if (maxCost === 0) return 100
+	const efficiency = Math.round((1 - cost / maxCost) * 100)
+	// Keep out-of-range inputs (cost above the ceiling, or negative) inside the 0–100 scale.
+	return Math.min(100, Math.max(0, efficiency))
+}
+
+/**
+ * Normalize speed: lower time → higher bar (0–100).
+ *
+ * @param time - Time taken by the candidate being plotted.
+ * @param maxTime - Time that maps to 0; the caller passes the largest time in the set.
+ * @returns An integer from 0 to 100: 100 when `time` is 0, 0 when `time` equals `maxTime`.
+ */
+function normalizeSpeed(time: number, maxTime: number): number {
+	// A zero ceiling would divide by zero; treat every candidate as maximally fast.
+	if (maxTime === 0) return 100
+	const speed = Math.round((1 - time / maxTime) * 100)
+	// Keep out-of-range inputs (time above the ceiling, or negative) inside the 0–100 scale.
+	return Math.min(100, Math.max(0, speed))
+}
+
+/**
+ * Converts candidates into the row objects consumed by the charts.
+ *
+ * @param candidates - Candidates to plot, one row each, in the given order.
+ * @param language - `"all"` to chart the composite score, otherwise the language whose score is charted.
+ * @param maxCost - Ceiling handed to `normalizeCost`.
+ * @param maxTime - Ceiling handed to `normalizeSpeed`.
+ * @returns One row per candidate with the normalized metrics, the raw cost
+ * figures shown in tooltips, and the untouched candidate under `_raw` for export.
+ */
+function buildChartData(
+	candidates: ModelCandidate[],
+	language: keyof LanguageScores | "all",
+	maxCost: number,
+	maxTime: number,
+) {
+	return candidates.map((candidate) => {
+		const { displayName, successRate, avgCostPerTask, avgTimePerTask, estimatedDailyCost } = candidate
+		const score = language === "all" ? candidate.compositeScore : candidate.languageScores[language]
+
+		return {
+			name: displayName,
+			composite: score,
+			success: successRate,
+			costEfficiency: normalizeCost(avgCostPerTask, maxCost),
+			speed: normalizeSpeed(avgTimePerTask, maxTime),
+			// raw daily cost for tooltip display
+			dailyCost: Math.round(estimatedDailyCost),
+			costPerTask: avgCostPerTask,
+			// raw data for export
+			_raw: candidate,
+		}
+	})
+}
+
+/**
+ * Serializes one candidate as a CSV row, in the column order of the header
+ * built by the CSV export handler.
+ *
+ * Text fields (provider, model id, display name, tier) are quoted per
+ * RFC 4180 when they contain a comma, double quote or line break, so such
+ * values can no longer shift the remaining columns. Temperature and
+ * reasoning effort are always quoted, as before; a missing reasoning effort
+ * becomes `""`.
+ *
+ * @param c - Candidate to serialize.
+ * @returns The comma-joined row, without a trailing newline.
+ */
+function candidateToCsvRow(c: ModelCandidate): string {
+	// Doubles embedded quotes, the RFC 4180 escape inside a quoted field.
+	const escapeQuotes = (value: unknown): string => String(value ?? "").replace(/"/g, '""')
+	// Quotes a field only when leaving it bare would break the row.
+	const text = (value: unknown): string => {
+		const raw = String(value ?? "")
+		return /[",\r\n]/.test(raw) ? `"${escapeQuotes(raw)}"` : raw
+	}
+	const alwaysQuoted = (value: unknown): string => `"${escapeQuotes(value)}"`
+
+	return [
+		text(c.provider),
+		text(c.modelId),
+		text(c.displayName),
+		c.compositeScore,
+		c.successRate,
+		c.avgCostPerTask,
+		Math.round(c.estimatedDailyCost),
+		c.avgTimePerTask,
+		c.languageScores.go,
+		c.languageScores.java,
+		c.languageScores.javascript,
+		c.languageScores.python,
+		c.languageScores.rust,
+		text(c.tier),
+		alwaysQuoted(c.settings.temperature),
+		alwaysQuoted(c.settings.reasoningEffort),
+	].join(",")
+}
+
+/**
+ * Triggers a browser download of in-memory text.
+ *
+ * Wraps the content in a Blob, points a temporary anchor element at an
+ * object URL for it, clicks the anchor, then removes the anchor and revokes
+ * the URL.
+ *
+ * @param content - Text to save.
+ * @param filename - Name set as the anchor's `download` attribute.
+ * @param mimeType - MIME type given to the Blob.
+ */
+function downloadBlob(content: string, filename: string, mimeType: string) {
+	const objectUrl = URL.createObjectURL(new Blob([content], { type: mimeType }))
+
+	const anchor = document.createElement("a")
+	anchor.href = objectUrl
+	anchor.download = filename
+
+	// The anchor is attached to the document for the duration of the click.
+	const host = document.body
+	host.appendChild(anchor)
+	anchor.click()
+	host.removeChild(anchor)
+
+	URL.revokeObjectURL(objectUrl)
+}
+
+// ── Custom Tooltip ──────────────────────────────────────────────────────────
+
+function CustomTooltip({
+ active,
+ payload,
+ label,
+}: {
+ active?: boolean
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ payload?: any[]
+ label?: string
+}) {
+ if (!active || !payload || !payload.length) return null
+
+ // Extract raw daily cost from first payload entry's data
+ const rawData = payload[0]?.payload as { dailyCost?: number; costPerTask?: number } | undefined
+ const dailyCost = rawData?.dailyCost
+ const costPerTask = rawData?.costPerTask
+
+ return (
+
+
{label}
+
+ {payload.map(
+ (
+ entry: {
+ name: string
+ value: number
+ color: string
+ dataKey: string
+ },
+ index: number,
+ ) => (
+
+
+ {entry.name}:
+
+ {entry.dataKey === "costEfficiency" && dailyCost !== undefined
+ ? `${entry.value} ($${dailyCost}/day · $${costPerTask?.toFixed(3)}/task)`
+ : entry.value}
+
+
+ ),
+ )}
+
+
+ )
+}
+
+// ── Scatter Tooltip ─────────────────────────────────────────────────────────
+
+function ScatterTooltip({
+ active,
+ payload,
+}: {
+ active?: boolean
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ payload?: any[]
+}) {
+ if (!active || !payload || !payload.length) return null
+
+ const data = payload[0]?.payload as
+ | {
+ name?: string
+ dailyCost?: number
+ score?: number
+ successRate?: number
+ tier?: string
+ }
+ | undefined
+
+ if (!data) return null
+
+ return (
+
+
{data.name}
+
+
+
+ Tier:
+ {TIER_LABELS[data.tier ?? "situational"]}
+
+
+
+ Daily Spend:
+ ${data.dailyCost}/day
+
+
+
+ Eval Score:
+ {data.score}
+
+
+
+ Success Rate:
+ {data.successRate}%
+
+
+
+ )
+}
+
+// ── Main Component ──────────────────────────────────────────────────────────
+
+/** Props accepted by `ComparisonChart`. */
+interface ComparisonChartProps {
+ // Source of the candidates; the visible part of the component reads `allCandidates` from it.
+ recommendation: RoleRecommendation
+ // Role whose name is shown in the header.
+ role: EngineerRole
+ // Key into ROLE_THEMES (falls back to DEFAULT_THEME) and prefix of the exported file names.
+ roleId: string
+ // Root of the section used to build the "home" link; defaults to `/evals/recommendations`.
+ workersRootPath?: string
+ /**
+ * Base path for role detail routes, without the role id.
+ * Examples: `/evals/workers`, `/evals/recommendations/roles`.
+ */
+ roleBasePath?: string
+}
+
+export function ComparisonChart({
+ recommendation,
+ role,
+ roleId,
+ workersRootPath = "/evals/recommendations",
+ roleBasePath = workersRootPath,
+}: ComparisonChartProps) {
+ const searchParams = useSearchParams()
+ const { allCandidates } = recommendation
+ const theme = ROLE_THEMES[roleId] ?? DEFAULT_THEME
+ const outcome = searchParams.get("outcome")
+ const objectiveSlug = searchParams.get("objective")
+ const mode = searchParams.get("mode")
+ const setupQuery = (() => {
+ if (!outcome && !objectiveSlug) return ""
+ const params = new URLSearchParams()
+ if (outcome) params.set("outcome", outcome)
+ if (objectiveSlug) params.set("objective", objectiveSlug)
+ if (mode) params.set("mode", mode)
+ return `?${params.toString()}`
+ })()
+ let homeHref = `${workersRootPath}${setupQuery}`
+ if (objectiveSlug && workersRootPath === "/evals/recommendations") {
+ homeHref = `${workersRootPath}/${objectiveSlug}`
+ if (mode) homeHref += `?mode=${encodeURIComponent(mode)}`
+ }
+
+ // ── State ───────────────────────────────────────────────────────────────
+ const [selectedLanguage, setSelectedLanguage] = useState("all")
+ const [enabledProviders, setEnabledProviders] = useState>(() => new Set(PROVIDERS))
+ const [minSuccessRate, setMinSuccessRate] = useState(0)
+ const [copiedSettings, setCopiedSettings] = useState(false)
+
+ // ── Derived ─────────────────────────────────────────────────────────────
+
+ const filteredCandidates = useMemo(
+ () => allCandidates.filter((c) => enabledProviders.has(c.provider) && c.successRate >= minSuccessRate),
+ [allCandidates, enabledProviders, minSuccessRate],
+ )
+
+ const maxCost = useMemo(() => Math.max(...allCandidates.map((c) => c.avgCostPerTask), 0.001), [allCandidates])
+
+ const maxTime = useMemo(() => Math.max(...allCandidates.map((c) => c.avgTimePerTask), 0.1), [allCandidates])
+
+ const chartData = useMemo(
+ () => buildChartData(filteredCandidates, selectedLanguage, maxCost, maxTime),
+ [filteredCandidates, selectedLanguage, maxCost, maxTime],
+ )
+
+ const chartHeight = Math.max(400, chartData.length * 100)
+
+ // Scatter plot data: value map of daily cost vs composite score
+ const scatterData = useMemo(
+ () =>
+ filteredCandidates.map((c) => ({
+ name: c.displayName,
+ dailyCost: Math.round(c.estimatedDailyCost),
+ score: c.compositeScore,
+ successRate: c.successRate,
+ tier: c.tier,
+ // ZAxis size: map success rate to dot size (60–400 range)
+ dotSize: Math.round(60 + (c.successRate / 100) * 340),
+ })),
+ [filteredCandidates],
+ )
+
+ // Determine axis domains for scatter plot
+ const scatterMaxCost = useMemo(() => Math.max(...scatterData.map((d) => d.dailyCost), 10), [scatterData])
+ const scatterMinScore = useMemo(() => Math.min(...scatterData.map((d) => d.score), 50), [scatterData])
+
+ // Providers that actually appear in data
+ const activeProviders = useMemo(() => {
+ const providers = new Set(allCandidates.map((c) => c.provider))
+ return PROVIDERS.filter((p) => providers.has(p))
+ }, [allCandidates])
+
+ // ── Handlers ────────────────────────────────────────────────────────────
+
+ const toggleProvider = useCallback((provider: string) => {
+ setEnabledProviders((prev) => {
+ const next = new Set(prev)
+ if (next.has(provider)) {
+ next.delete(provider)
+ } else {
+ next.add(provider)
+ }
+ return next
+ })
+ }, [])
+
+ const handleCopySettings = useCallback(async () => {
+ const settings = filteredCandidates.map((c) => ({
+ provider: c.provider,
+ model: c.modelId,
+ displayName: c.displayName,
+ temperature: c.settings.temperature,
+ ...(c.settings.reasoningEffort ? { reasoningEffort: c.settings.reasoningEffort } : {}),
+ }))
+ await navigator.clipboard.writeText(JSON.stringify(settings, null, 2))
+ setCopiedSettings(true)
+ setTimeout(() => setCopiedSettings(false), 2000)
+ }, [filteredCandidates])
+
+ const handleExportCsv = useCallback(() => {
+ const header =
+ "Provider,Model ID,Display Name,Composite Score,Success Rate,Avg Cost/Task,Est. Daily Cost,Avg Time/Task,Go,Java,JavaScript,Python,Rust,Tier,Temperature,Reasoning Effort"
+ const rows = filteredCandidates.map(candidateToCsvRow)
+ const csv = [header, ...rows].join("\n")
+ downloadBlob(csv, `${roleId}-comparison.csv`, "text/csv")
+ }, [filteredCandidates, roleId])
+
+ const handleExportJson = useCallback(() => {
+ const json = JSON.stringify(filteredCandidates, null, 2)
+ downloadBlob(json, `${roleId}-comparison.json`, "application/json")
+ }, [filteredCandidates, roleId])
+
+ // ── Render ──────────────────────────────────────────────────────────────
+
+ return (
+ <>
+ {/* ── Atmospheric Header ────────────────────────────────────── */}
+
+ {/* Blur gradient background in role color */}
+
+
+
+
+
+
+ {/* Breadcrumb */}
+
+
+ Evals
+
+ /
+
+ Build with Roo Code Cloud
+
+ /
+
+ {role.name}
+
+ /
+ Compare Models
+
+
+ {/* Title row */}
+
+
+
+
+
+
Compare Models
+
+ {role.name}
+
+
+ Interactive comparison across composite score, success rate, cost efficiency, and
+ speed. Filter by provider, language, and minimum success rate.
+
+
+
+
+ {/* Stats bar */}
+
+
+
+
+ {filteredCandidates.length}
+
+ of {allCandidates.length} models shown
+
+
+
+ Viewing{" "}
+
+ {selectedLanguage === "all"
+ ? "All Languages"
+ : LANGUAGES.find((l) => l.key === selectedLanguage)?.label}
+
+
+ {minSuccessRate > 0 && (
+ <>
+
+
+ Min success{" "}
+
+ {minSuccessRate}%
+
+
+ >
+ )}
+
+
+
+
+
+ {/* ── Main Content ──────────────────────────────────────────── */}
+
+
+ {/* ── Filters Section ────────────────────────────────────── */}
+
+
+
+
+ {/* Language toggle pills */}
+
+
+ Score View
+
+
+ setSelectedLanguage("all")}
+ className={`rounded-full px-4 py-1.5 text-sm font-medium transition-all duration-200 ${
+ selectedLanguage === "all"
+ ? theme.pillActive
+ : "bg-muted/50 text-muted-foreground hover:bg-muted hover:text-foreground"
+ }`}>
+ All Languages
+
+ {LANGUAGES.map(({ key, label }) => (
+ setSelectedLanguage(key)}
+ className={`rounded-full px-4 py-1.5 text-sm font-medium transition-all duration-200 ${
+ selectedLanguage === key
+ ? theme.pillActive
+ : "bg-muted/50 text-muted-foreground hover:bg-muted hover:text-foreground"
+ }`}>
+ {label}
+
+ ))}
+
+
+
+ {/* Min success rate slider */}
+
+
+ Min Success Rate
+
+
+ setMinSuccessRate(Number(e.target.value))}
+ className={`h-2 flex-1 cursor-pointer appearance-none rounded-full bg-muted/50 ${theme.sliderAccent}`}
+ />
+
+ {minSuccessRate}%
+
+
+
+
+
+ {/* Provider checkboxes */}
+
+
+ Providers
+
+
+ {activeProviders.map((p) => (
+
+ toggleProvider(p)}
+ className={`size-4 rounded border-border ${theme.checkboxAccent}`}
+ />
+
+ {PROVIDER_LABELS[p] ?? p}
+
+
+ ))}
+
+
+
+
+ {/* ── Value Map Scatter Chart ────────────────────────────── */}
+
+
+
Value Map: Spend vs Eval Score
+
+
+ Upper-left = higher score at lower spend. Each dot is a model. Size reflects success rate.
+
+
+ {/* Tier legend */}
+
+ {Object.entries(TIER_COLORS).map(([tier, color]) => (
+
+
+ {TIER_LABELS[tier]}
+
+ ))}
+
+
+ {scatterData.length === 0 ? (
+
+
+
No models match the current filters.
+
+ Try adjusting the provider or success rate filters.
+
+
+ ) : (
+
+
+
+ `$${v}`}
+ stroke="hsl(var(--muted-foreground))"
+ strokeOpacity={0.3}
+ tick={{ fontSize: 11, fill: "hsl(var(--muted-foreground))" }}
+ axisLine={false}
+ label={{
+ value: "Daily Spend ($)",
+ position: "insideBottom",
+ offset: -10,
+ style: { fontSize: 11, fill: "hsl(var(--muted-foreground))" },
+ }}
+ />
+
+
+ {/* Sweet spot reference zone: upper-left quadrant */}
+
+ }
+ cursor={{
+ strokeDasharray: "3 3",
+ stroke: "hsl(var(--muted-foreground))",
+ strokeOpacity: 0.3,
+ }}
+ />
+
+ {scatterData.map((entry, index) => (
+ |
+ ))}
+
+
+
+
+ )}
+
+
+ {/* ── Chart Section ──────────────────────────────────────── */}
+
+
+
+ {selectedLanguage === "all"
+ ? "Composite Score"
+ : `${LANGUAGES.find((l) => l.key === selectedLanguage)?.label} Score`}{" "}
+ Comparison
+
+
+
+ Cost Efficiency and Speed are inverted — higher bars mean cheaper / faster. Daily costs
+ assume ~{TASKS_PER_DAY} tasks per agent per day (~6 productive hours).
+
+
+ {chartData.length === 0 ? (
+
+
+
No models match the current filters.
+
+ Try adjusting the provider or success rate filters.
+
+
+ ) : (
+
+
+
+ `${v}`}
+ stroke="hsl(var(--muted-foreground))"
+ strokeOpacity={0.3}
+ tick={{ fontSize: 11, fill: "hsl(var(--muted-foreground))" }}
+ axisLine={false}
+ />
+
+ }
+ cursor={{ fill: "hsl(var(--muted))", fillOpacity: 0.15 }}
+ />
+
+ l.key === selectedLanguage)?.label ?? "Language"} Score`
+ }
+ fill={DIMENSION_COLORS.composite}
+ radius={[0, 4, 4, 0]}
+ barSize={10}
+ />
+
+
+
+
+
+
+ )}
+
+
+ {/* ── Export Section ──────────────────────────────────────── */}
+
+
+
+
+
+
+ Export Data
+
+
+
+
+
+ {copiedSettings ? (
+ <>
+
+ Copied!
+ >
+ ) : (
+ <>
+
+ Copy Settings JSON
+ >
+ )}
+
+
+
+ Export CSV
+
+
+
+ Export JSON
+
+
+
+
+ {/* ── Bottom Navigation ───────────────────────────────────── */}
+
+
+
+
+
+ Back to {role.name} models
+
+
+
+
+
+ All roles
+
+
+ 📋 Raw eval data
+
+
+
+
+
+
+
+ >
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx
new file mode 100644
index 00000000000..9d03aa9cca7
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx
@@ -0,0 +1,88 @@
+import { notFound } from "next/navigation"
+import type { Metadata } from "next"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getEngineerRole, getRoleRecommendation } from "@/lib/mock-recommendations"
+
+import { ComparisonChart } from "./comparison-chart"
+
+// ── SEO Metadata ────────────────────────────────────────────────────────────
+
+type PageProps = { params: Promise<{ roleId: string }> }
+
+/**
+ * Builds the SEO metadata for the "Compare Models" page of a single role.
+ *
+ * @param params - Promise of the route params; `roleId` selects the role.
+ * @returns Title, description, canonical URL, Open Graph / Twitter cards and
+ * keywords for a known role, or a short "Role Not Found" title and
+ * description when `getEngineerRole` returns nothing for the id.
+ */
+export async function generateMetadata({ params }: PageProps): Promise<Metadata> {
+	const { roleId } = await params
+	const role = getEngineerRole(roleId)
+
+	if (!role) {
+		return {
+			title: "Role Not Found | Roo Code Evals",
+			description: "The requested engineer role was not found.",
+		}
+	}
+
+	const title = `Compare Models — ${role.name} | Roo Code Evals`
+	const description = `Interactive comparison of AI models for the ${role.name} setup. Compare composite score, success rate, cost efficiency, and speed.`
+	const ogDescription = `Compare Models — ${role.name}`
+	const path = `/evals/workers/${roleId}/compare`
+	// The same absolute URL serves as canonical link and Open Graph URL.
+	const pageUrl = `${SEO.url}${path}`
+
+	return {
+		title,
+		description,
+		alternates: {
+			canonical: pageUrl,
+		},
+		openGraph: {
+			title,
+			description,
+			url: pageUrl,
+			siteName: SEO.name,
+			images: [
+				{
+					url: ogImageUrl(title, ogDescription),
+					width: 1200,
+					height: 630,
+					alt: title,
+				},
+			],
+			locale: SEO.locale,
+			type: "website",
+		},
+		twitter: {
+			card: SEO.twitterCard,
+			title,
+			description,
+			images: [ogImageUrl(title, ogDescription)],
+		},
+		// Each page-specific keyword is listed once.
+		keywords: [
+			...SEO.keywords,
+			"AI coding",
+			"model comparison",
+			"coding evals",
+			role.name.toLowerCase(),
+			"bar chart",
+		],
+	}
+}
+
+// ── Page Component ──────────────────────────────────────────────────────────
+
+/**
+ * Server page for the model comparison view of one role.
+ * Resolves `roleId` from the route params and looks up its recommendation.
+ * NOTE(review): the returned markup is not visible in this view — confirm
+ * which props are forwarded to the chart component.
+ */
+export default async function CompareModelsPage({ params }: PageProps) {
+ const { roleId } = await params
+ const recommendation = getRoleRecommendation(roleId)
+
+ // No recommendation for this role id: defer to Next.js not-found handling.
+ if (!recommendation) {
+ notFound()
+ }
+
+ return (
+
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx
new file mode 100644
index 00000000000..b330ba76536
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx
@@ -0,0 +1,42 @@
+"use client"
+
+import { useState } from "react"
+import { Copy, Check } from "lucide-react"
+
+/** Props for CopySettingsButton. */
+interface CopySettingsButtonProps {
+ // Model configuration that is serialized to JSON and written to the clipboard.
+ settings: {
+ provider: string
+ model: string
+ temperature: number
+ reasoningEffort?: string
+ }
+}
+
+/**
+ * Button that copies `settings` to the clipboard as pretty-printed JSON and
+ * shows a "Copied!" confirmation for two seconds after a successful copy.
+ */
+export function CopySettingsButton({ settings }: CopySettingsButtonProps) {
+	const [copied, setCopied] = useState(false)
+
+	const handleCopy = async () => {
+		const json = JSON.stringify(settings, null, 2)
+		try {
+			await navigator.clipboard.writeText(json)
+		} catch (error) {
+			// The Clipboard API can be missing (insecure context) or reject
+			// (permission denied). Report it instead of leaving an unhandled
+			// rejection, and do not claim the copy succeeded.
+			console.error("Failed to copy settings to clipboard:", error)
+			return
+		}
+		setCopied(true)
+		// Revert the confirmation label after two seconds.
+		setTimeout(() => setCopied(false), 2000)
+	}
+
+ return (
+
+ {copied ? (
+ <>
+
+ Copied!
+ >
+ ) : (
+ <>
+
+ Copy Roo Code Cloud Config
+ >
+ )}
+
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx
new file mode 100644
index 00000000000..54ca16d0d26
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx
@@ -0,0 +1,106 @@
+import { notFound } from "next/navigation"
+import type { Metadata } from "next"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getRoleRecommendation, getCloudSetupUrl } from "@/lib/mock-recommendations"
+
+import { CandidatesContent } from "./candidates-content"
+
+// ── SEO Metadata ────────────────────────────────────────────────────────────
+
+type PageProps = { params: Promise<{ roleId: string }> }
+
+/**
+ * Builds the SEO metadata for a role's recommended-models page.
+ *
+ * @param params - Promise of the route params; `roleId` selects the role.
+ * @returns Title, description, canonical URL, Open Graph / Twitter cards and
+ * keywords when a recommendation exists for the role, or a short
+ * "Role Not Found" title and description otherwise.
+ */
+export async function generateMetadata({ params }: PageProps): Promise<Metadata> {
+	const { roleId } = await params
+	const recommendation = getRoleRecommendation(roleId)
+
+	if (!recommendation) {
+		return {
+			title: "Role Not Found | Roo Code Evals",
+			description: "The requested engineer role was not found.",
+		}
+	}
+
+	const { role } = recommendation
+	const title = `${role.name} — Recommended Models | Roo Code Evals`
+	const description = `Eval-backed recommendations for ${role.name}. Compare models by success rate, cost, and speed across 5 languages.`
+	const ogDescription = `${role.name} — Recommended Models`
+	const path = `/evals/workers/${roleId}`
+	// The same absolute URL serves as canonical link and Open Graph URL.
+	const pageUrl = `${SEO.url}${path}`
+
+	return {
+		title,
+		description,
+		alternates: {
+			canonical: pageUrl,
+		},
+		openGraph: {
+			title,
+			description,
+			url: pageUrl,
+			siteName: SEO.name,
+			images: [
+				{
+					url: ogImageUrl(title, ogDescription),
+					width: 1200,
+					height: 630,
+					alt: title,
+				},
+			],
+			locale: SEO.locale,
+			type: "website",
+		},
+		twitter: {
+			card: SEO.twitterCard,
+			title,
+			description,
+			images: [ogImageUrl(title, ogDescription)],
+		},
+		keywords: [
+			...SEO.keywords,
+			"AI coding",
+			"coding agents",
+			"model recommendations",
+			"coding evals",
+			role.name.toLowerCase(),
+			"model comparison",
+		],
+	}
+}
+
+// ── Page Component ──────────────────────────────────────────────────────────
+
+/**
+ * Server page for one role's recommended models.
+ * Looks up the recommendation for `roleId`, falls back to Next.js not-found
+ * handling when there is none, and pre-computes a cloud setup URL per
+ * candidate keyed by `modelId`.
+ * NOTE(review): the returned markup is not visible in this view — confirm
+ * which of the destructured values are forwarded to the client component.
+ */
+export default async function RoleCandidatesPage({ params }: PageProps) {
+	const { roleId } = await params
+	const recommendation = getRoleRecommendation(roleId)
+
+	if (!recommendation) {
+		notFound()
+	}
+
+	const { role, best, budgetHire, speedHire, allCandidates, totalEvalRuns, totalExercises, lastUpdated } =
+		recommendation
+
+	// Pre-compute cloud URLs on the server so the client component receives
+	// only serializable data (no functions).
+	// NOTE(review): value type assumes getCloudSetupUrl returns a string — confirm.
+	const cloudUrls: Record<string, string> = {}
+	for (const candidate of allCandidates) {
+		cloudUrls[candidate.modelId] = getCloudSetupUrl(candidate)
+	}
+
+ return (
+
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/page.tsx b/apps/web-roo-code/src/app/evals/workers/page.tsx
new file mode 100644
index 00000000000..e71a5b0d18f
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/page.tsx
@@ -0,0 +1,91 @@
+import { Suspense } from "react"
+import type { Metadata } from "next"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getEngineerRoles, getAllRecommendations } from "@/lib/mock-recommendations"
+
+import { WorkersContent } from "./workers-content"
+
+// ── SEO Metadata ────────────────────────────────────────────────────────────
+
+// Page-level copy shared by the <title>, Open Graph and Twitter metadata below.
+const TITLE = "Build with Roo Code Cloud | Roo Code Evals"
+const DESCRIPTION =
+ "Outcome-first, eval-backed recommendations for shipping production code. Start from your objective and pick a tradeoff."
+// Shorter description passed to the OG image URL builder.
+const OG_DESCRIPTION = "Outcome-first recommendations for shipping production code"
+// Route path of this page, appended to SEO.url for canonical / Open Graph URLs.
+const PATH = "/evals/workers"
+
+/** Static Next.js metadata for the workers landing page. */
+export const metadata: Metadata = {
+ title: TITLE,
+ description: DESCRIPTION,
+ alternates: {
+ canonical: `${SEO.url}${PATH}`,
+ },
+ openGraph: {
+ title: TITLE,
+ description: DESCRIPTION,
+ url: `${SEO.url}${PATH}`,
+ siteName: SEO.name,
+ images: [
+ {
+ url: ogImageUrl(TITLE, OG_DESCRIPTION),
+ width: 1200,
+ height: 630,
+ alt: TITLE,
+ },
+ ],
+ locale: SEO.locale,
+ type: "website",
+ },
+ twitter: {
+ card: SEO.twitterCard,
+ title: TITLE,
+ description: DESCRIPTION,
+ images: [ogImageUrl(TITLE, OG_DESCRIPTION)],
+ },
+ // Site-wide keywords first, then page-specific ones.
+ keywords: [
+ ...SEO.keywords,
+ "AI coding",
+ "coding agents",
+ "roo code cloud",
+ "model recommendations",
+ "coding evals",
+ "model comparison",
+ "shipping code",
+ "prototype",
+ ],
+}
+
+// ── Page Component ──────────────────────────────────────────────────────────
+
+/**
+ * Server page for the workers landing route.
+ * Loads all roles and recommendations and derives the summary figures
+ * (eval runs, exercises, distinct models, latest update) computed below.
+ * NOTE(review): the returned markup is not visible in this view — confirm
+ * how these values are passed on.
+ */
+export default function WorkersPage() {
+ const roles = getEngineerRoles()
+ const recommendations = getAllRecommendations()
+
+ // Aggregate totals
+ // Both totals are read from the first recommendation only (0 when the list is empty).
+ // NOTE(review): presumably every recommendation carries the same suite-wide totals — confirm.
+ const totalEvalRuns = recommendations[0]?.totalEvalRuns ?? 0
+ const totalExercises = recommendations[0]?.totalExercises ?? 0
+ // Count distinct model ids across all roles' candidate lists.
+ const uniqueModels = new Set(
+ recommendations.flatMap((recommendation) => recommendation.allCandidates.map((candidate) => candidate.modelId)),
+ )
+ const totalModels = uniqueModels.size
+
+ // Default sort on the mapped copy, then take the last element (undefined when empty).
+ // NOTE(review): assumes lastUpdated values sort chronologically as plain strings (e.g. ISO-8601) — confirm.
+ const lastUpdated = recommendations
+ .map((r) => r.lastUpdated)
+ .sort()
+ .pop()
+
+ return (
+
+
+
+ )
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx
new file mode 100644
index 00000000000..5965c7f8ec6
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx
@@ -0,0 +1,1202 @@
+"use client"
+
+import { useCallback, useMemo } from "react"
+import { motion } from "framer-motion"
+import { ArrowRight, FlaskConical, Beaker } from "lucide-react"
+import Link from "next/link"
+import { usePathname, useRouter, useSearchParams } from "next/navigation"
+import { ScatterChart, Scatter, XAxis, YAxis, ZAxis, Tooltip, ResponsiveContainer, Cell, ReferenceLine } from "recharts"
+
+import type { EngineerRole, RoleRecommendation } from "@/lib/mock-recommendations"
+import { TASKS_PER_DAY, MODEL_TIMELINE } from "@/lib/mock-recommendations"
+import { EVAL_OUTCOMES, isEvalOutcomeId, type EvalOutcomeId } from "@/lib/eval-outcomes"
+import { pickObjectiveDefaultModelV1 } from "@/lib/objective-default-models-v1"
+
+// ── Outcome Layer: Optimization Modes ──────────────────────────────────────
+
+type EvalOptimizationMode = "best" | "fastest" | "cost"
+
+// Selectable optimization modes, in display order: id (also the `mode` query
+// value), short pill label, and a one-sentence description.
+const OPTIMIZATION_MODES: Array<{
+ id: EvalOptimizationMode
+ label: string
+ description: string
+}> = [
+ { id: "best", label: "Quality", description: "Maximize pass rate and overall quality across our eval suite." },
+ { id: "fastest", label: "Speed", description: "Lower latency per task when speed matters." },
+ { id: "cost", label: "Cost", description: "Lower cost per task for high-volume work." },
+]
+
+/** Type guard: true only for the three known optimization mode ids. */
+function isEvalOptimizationMode(value: string): value is EvalOptimizationMode {
+	switch (value) {
+		case "best":
+		case "fastest":
+		case "cost":
+			return true
+		default:
+			return false
+	}
+}
+
+/**
+ * Picks the candidate to feature for a role under the given optimization mode.
+ * Speed and cost modes prefer their dedicated hire and fall back to the first
+ * entry of `best`; returns null when there is no recommendation or no candidate.
+ */
+function getModeCandidate(rec: RoleRecommendation | undefined, mode: EvalOptimizationMode) {
+	if (!rec) {
+		return null
+	}
+	switch (mode) {
+		case "fastest":
+			return rec.speedHire ?? rec.best[0] ?? null
+		case "cost":
+			return rec.budgetHire ?? rec.best[0] ?? null
+		default:
+			return rec.best[0] ?? null
+	}
+}
+
+/**
+ * Returns the short display label for an optimization mode.
+ *
+ * The label is read from OPTIMIZATION_MODES, the same list that renders the
+ * mode pills, so the two cannot drift apart. "Quality" is the fallback when
+ * no entry matches.
+ */
+function getModeLabel(mode: EvalOptimizationMode) {
+	return OPTIMIZATION_MODES.find((entry) => entry.id === mode)?.label ?? "Quality"
+}
+
+/**
+ * Turns a raw model id into a short label for the UI.
+ *
+ * - `claude-opus-<major>[-<minor>][-<YYYYMMDD>]` becomes `Opus <major>[.<minor>]`.
+ * - `kimi-k2-0905` becomes `Kimi K2`.
+ * - Anything else is returned unchanged.
+ *
+ * @param modelId - Provider model identifier.
+ * @returns Human-readable label, or `modelId` itself when no rule applies.
+ */
+function formatModelIdForUi(modelId: string) {
+	const opusPrefix = "claude-opus-"
+	if (modelId.startsWith(opusPrefix)) {
+		// Drop empty segments and 8-digit snapshot stamps: a date suffix is not
+		// a version component ("claude-opus-4-20250514" is Opus 4, not 4.20250514).
+		const versionParts = modelId
+			.slice(opusPrefix.length)
+			.split("-")
+			.filter((part) => part.length > 0 && !/^\d{8}$/.test(part))
+		if (versionParts.length >= 2) return `Opus ${versionParts[0]}.${versionParts[1]}`
+		if (versionParts.length === 1) return `Opus ${versionParts[0]}`
+	}
+	if (modelId === "kimi-k2-0905") return "Kimi K2"
+	return modelId
+}
+
+// ── Framer Motion Variants ──────────────────────────────────────────────────
+
+// Parent container: fades in and staggers its children (0.15s apart, after a 0.2s delay).
+const containerVariants = {
+ hidden: { opacity: 0 },
+ visible: {
+ opacity: 1,
+ transition: {
+ staggerChildren: 0.15,
+ delayChildren: 0.2,
+ },
+ },
+}
+
+// Cards: fade in while sliding up 30px over 0.6s with a custom cubic-bezier ease.
+const cardVariants = {
+ hidden: { opacity: 0, y: 30 },
+ visible: {
+ opacity: 1,
+ y: 0,
+ transition: {
+ duration: 0.6,
+ ease: [0.21, 0.45, 0.27, 0.9] as const,
+ },
+ },
+}
+
+// Same timing and ease as cardVariants, with a shorter 20px slide.
+const fadeUpVariants = {
+ hidden: { opacity: 0, y: 20 },
+ visible: {
+ opacity: 1,
+ y: 0,
+ transition: {
+ duration: 0.6,
+ ease: [0.21, 0.45, 0.27, 0.9] as const,
+ },
+ },
+}
+
+// Background layer: plain 1.2s fade-in.
+const backgroundVariants = {
+ hidden: { opacity: 0 },
+ visible: {
+ opacity: 1,
+ transition: {
+ duration: 1.2,
+ ease: "easeOut" as const,
+ },
+ },
+}
+
+// ── Provider Colors ─────────────────────────────────────────────────────────
+
+// Hex color per provider id. Lookups with other ids yield undefined, so
+// callers supply their own fallback color.
+const PROVIDER_COLORS: Record<string, string> = {
+	anthropic: "#fb923c", // orange-400
+	openai: "#4ade80", // green-400
+	google: "#60a5fa", // blue-400
+	xai: "#c084fc", // purple-400
+	deepseek: "#22d3ee", // cyan-400
+	moonshot: "#f472b6", // pink-400
+}
+
+// Display name per provider id, keyed the same way as PROVIDER_COLORS.
+const PROVIDER_DISPLAY: Record<string, string> = {
+	anthropic: "Anthropic",
+	openai: "OpenAI",
+	google: "Google",
+	xai: "xAI",
+	deepseek: "DeepSeek",
+	moonshot: "Moonshot",
+}
+
+// ── Timeline Tooltip ────────────────────────────────────────────────────────
+
+/**
+ * Tooltip content for the timeline scatter chart.
+ * Renders nothing unless the tooltip is active and the first payload entry
+ * carries a data point; otherwise shows the point's model name, provider,
+ * release label, eval score and cost per run.
+ */
+function TimelineTooltip({
+ active,
+ payload,
+}: {
+ active?: boolean
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ payload?: any[]
+}) {
+ // Inactive tooltip, or no payload entries: nothing to show.
+ if (!active || !payload || !payload.length) return null
+
+ // The hovered point's own fields sit under `.payload` of the first entry;
+ // all are treated as optional because the payload is untyped.
+ const data = payload[0]?.payload as
+ | {
+ modelName?: string
+ provider?: string
+ score?: number
+ costPerRun?: number
+ dateLabel?: string
+ }
+ | undefined
+
+ if (!data) return null
+
+ return (
+
+
{data.modelName}
+
+
+
+ Provider:
+ {PROVIDER_DISPLAY[data.provider ?? ""] ?? data.provider}
+
+
+
+ Release:
+ {data.dateLabel}
+
+
+
+ Eval Score:
+ {data.score}%
+
+
+
+ Cost per Run:
+ ${data.costPerRun?.toFixed(2)}
+
+
+
+ )
+}
+
+// ── Sub-Components ──────────────────────────────────────────────────────────
+
+// ── Main Content Component ──────────────────────────────────────────────────
+
+/** Props for the WorkersContent client component. */
+type WorkersContentProps = {
+ // All roles; indexed by id inside the component.
+ roles: EngineerRole[]
+ // Per-role recommendations; indexed by roleId inside the component.
+ recommendations: RoleRecommendation[]
+ // Shown as the run count in the snapshot and footer.
+ totalEvalRuns: number
+ // NOTE(review): totalExercises and totalModels are bound to `_`-prefixed
+ // names in the component and appear unused there — confirm.
+ totalExercises: number
+ totalModels: number
+ // Date string formatted for the footer; "N/A" is shown when missing.
+ lastUpdated: string | undefined
+ // Defaults to "/evals/recommendations" in the component signature.
+ workersRootPath?: string
+ /**
+ * Base path for role detail routes, without the role id.
+ * Examples: `/evals/workers`, `/evals/recommendations/roles`.
+ * Defaults to `workersRootPath`.
+ */
+ roleBasePath?: string
+}
+
+// Outcomes-first is canonical. Baseline/V1 is removed from the UI.
+const ENABLE_OUTCOME_LAYER = true
+
+export function WorkersContent({
+ roles,
+ recommendations,
+ totalEvalRuns,
+ totalExercises: _totalExercises,
+ totalModels: _totalModels,
+ lastUpdated,
+ workersRootPath = "/evals/recommendations",
+ roleBasePath = workersRootPath,
+}: WorkersContentProps) {
+ const enableOutcomeLayer = ENABLE_OUTCOME_LAYER
+ const router = useRouter()
+ const pathname = usePathname()
+ const searchParams = useSearchParams()
+
+ const selectedOutcomeId = useMemo(() => {
+ const outcome = searchParams.get("outcome")
+ if (!outcome) return null
+ return isEvalOutcomeId(outcome) ? outcome : null
+ }, [searchParams])
+ const effectiveOutcomeId = useMemo(() => {
+ if (selectedOutcomeId) return selectedOutcomeId
+ return EVAL_OUTCOMES[0]?.id ?? null
+ }, [selectedOutcomeId])
+
+ const selectedMode = useMemo((): EvalOptimizationMode => {
+ const mode = searchParams.get("mode")
+ if (!mode) return "best"
+ return isEvalOptimizationMode(mode) ? mode : "best"
+ }, [searchParams])
+
+ const setOutcome = useCallback(
+ (nextOutcomeId: EvalOutcomeId | null) => {
+ const params = new URLSearchParams(searchParams.toString())
+ if (nextOutcomeId) params.set("outcome", nextOutcomeId)
+ else params.delete("outcome")
+
+ const query = params.toString()
+ router.replace(query ? `${pathname}?${query}` : pathname, { scroll: false })
+ },
+ [pathname, router, searchParams],
+ )
+
+ const setMode = useCallback(
+ (nextMode: EvalOptimizationMode) => {
+ const params = new URLSearchParams(searchParams.toString())
+ params.set("mode", nextMode)
+
+ const query = params.toString()
+ router.replace(query ? `${pathname}?${query}` : pathname, { scroll: false })
+ },
+ [pathname, router, searchParams],
+ )
+
+ const scrollToOutcomes = useCallback(() => {
+ if (typeof document === "undefined") return
+ const el = document.getElementById("outcomes")
+ if (!el) return
+ el.scrollIntoView({ behavior: "smooth", block: "start" })
+ }, [])
+
+	// Recommendation lookup by role id; memoized so the map is not rebuilt on every render.
+	const recByRole = useMemo(() => new Map(recommendations.map((r) => [r.roleId, r])), [recommendations])
+ const roleById = useMemo(() => new Map(roles.map((r) => [r.id, r])), [roles])
+
+ const selectedOutcome = useMemo(() => {
+ if (!effectiveOutcomeId) return null
+ return EVAL_OUTCOMES.find((o) => o.id === effectiveOutcomeId) ?? null
+ }, [effectiveOutcomeId])
+
+ const setupQuery = useMemo(() => {
+ if (!effectiveOutcomeId) return ""
+ const params = new URLSearchParams()
+ params.set("outcome", effectiveOutcomeId)
+ params.set("mode", selectedMode)
+ const query = params.toString()
+ return query ? `?${query}` : ""
+ }, [effectiveOutcomeId, selectedMode])
+
+ const isProfileView = useMemo(() => {
+ return searchParams.get("view") === "profile"
+ }, [searchParams])
+
+ const profileDescription =
+ selectedOutcome?.builderProfile?.description ??
+ "A default setup built from our eval signals. It’s a baseline, not a guarantee."
+ const profileHowItWorks = selectedOutcome?.builderProfile?.howItWorks ?? selectedOutcome?.whyItWorks ?? []
+ const objectiveDefaultModel = useMemo(() => {
+ if (!effectiveOutcomeId) return null
+ return pickObjectiveDefaultModelV1(effectiveOutcomeId, selectedMode)
+ }, [effectiveOutcomeId, selectedMode])
+ const objectiveDefaultModelLabel = useMemo(() => {
+ if (!objectiveDefaultModel?.modelId) return "—"
+ return formatModelIdForUi(objectiveDefaultModel.modelId)
+ }, [objectiveDefaultModel])
+ const selectedModeLabel = getModeLabel(selectedMode)
+ const examplePrompt = selectedOutcome?.builderProfile?.examplePrompt ?? ""
+ const cloudSetupHref = useMemo(() => {
+ if (!effectiveOutcomeId) return "/cloud-agents/setup"
+ const params = new URLSearchParams()
+ params.set("outcome", effectiveOutcomeId)
+ params.set("mode", selectedMode)
+ if (examplePrompt) params.set("prompt", examplePrompt)
+ return `/cloud-agents/setup?${params.toString()}`
+ }, [examplePrompt, selectedMode, effectiveOutcomeId])
+
+ const profileCapabilities = useMemo(() => {
+ if (!selectedOutcome) return []
+ const fromProfile = selectedOutcome.builderProfile?.capabilities
+ if (fromProfile && fromProfile.length > 0) return fromProfile
+ return selectedOutcome.recommendedRoleIds.map((roleId) => {
+ const role = roleById.get(roleId)
+ return {
+ id: roleId,
+ name: role?.name ?? roleId,
+ description: role?.salaryRange ?? "",
+ roleId,
+ }
+ })
+ }, [selectedOutcome, roleById])
+
+ const agentCapabilities = useMemo(() => profileCapabilities.filter((c) => Boolean(c.roleId)), [profileCapabilities])
+
+ const skillCapabilities = useMemo(() => profileCapabilities.filter((c) => !c.roleId), [profileCapabilities])
+
+ // ── Timeline scatter data ──────────────────────────────────────────────
+	// One scatter point per MODEL_TIMELINE entry, ordered by release date.
+	const timelineData = useMemo(() => {
+		// Floor at 0 so an empty timeline yields 0 instead of -Infinity.
+		const maxCost = Math.max(0, ...MODEL_TIMELINE.map((m) => m.costPerRun))
+		return MODEL_TIMELINE.map((m) => {
+			const date = new Date(m.releaseDate)
+			// Cost relative to the most expensive run; guarded so an all-zero
+			// timeline does not divide 0 by 0 and produce NaN dot sizes.
+			const costShare = maxCost > 0 ? m.costPerRun / maxCost : 0
+			return {
+				modelName: m.modelName,
+				provider: m.provider,
+				score: m.score,
+				costPerRun: m.costPerRun,
+				// numeric X for scatter: milliseconds since epoch
+				dateNum: date.getTime(),
+				dateLabel: date.toLocaleDateString("en-US", { month: "short", year: "numeric", timeZone: "UTC" }),
+				// Dot size: inversely proportional to cost (cheaper = bigger dot), 60–400
+				dotSize: Math.round(60 + (1 - costShare) * 340),
+			}
+		}).sort((a, b) => a.dateNum - b.dateNum)
+	}, [])
+
+ // Trend line endpoints for the timeline
+ const trendLine = useMemo(() => {
+ if (timelineData.length < 2) return null
+ const first = timelineData[0]!
+ const last = timelineData[timelineData.length - 1]!
+ return { x1: first.dateNum, y1: first.score, x2: last.dateNum, y2: last.score }
+ }, [timelineData])
+
+ return (
+ <>
+ {/* ── Hero Section ───────────────────────────────────────────── */}
+
+ {/* Atmospheric blur background */}
+
+
+
+
+ {/* Blueprint grid overlay */}
+
+
+ {/* Gradient fade from hero atmosphere to cards */}
+
+
+
+
+ {enableOutcomeLayer ? (
+
+
+
+ {/* Badge */}
+
+
+
+
+ How we run evals
+
+
+
+
+
+
+ Outcomes over artifacts
+
+
+ {/* Heading */}
+
+ You’re the Builder
+
+ Ship{" "}
+
+ Real Code
+
+
+
+
+ {/* Subheading */}
+
+ Pick an objective. We’ll suggest an agent lineup and default model
+ based on eval results. Treat it as a baseline for your repo.
+
+
+
+
+ Get started with your objective
+
+
+
+
+
+ ) : null}
+
+
+
+
+ {/* ── Outcomes Overlay ───────────────────────────────────────── */}
+ {enableOutcomeLayer ? (
+
+
+
+ {isProfileView ? (
+
+ {selectedOutcome ? (
+ <>
+
+
+
+ Profile
+
+
+ {selectedOutcome.name}
+
+
+ {profileDescription}
+
+
+
+
+ {examplePrompt ? (
+
+
+ Example prompt
+
+
+ {examplePrompt}
+
+
+ ) : null}
+
+ Start in Roo Code Cloud
+
+
+
+ Back to objectives
+
+
+
+
+
+
+
+
+
+ Optimized for
+
+
+ {selectedModeLabel}
+
+
+
+
+ Default model
+
+
+ {objectiveDefaultModelLabel}
+
+
+
+
+ Agents
+
+
+ {agentCapabilities.length}
+
+
+
+
+ Skills
+
+
+ {skillCapabilities.length}
+
+
+
+
+
+
+
+ Optimize for
+
+
+ {OPTIMIZATION_MODES.map((mode) => {
+ const isSelected = mode.id === selectedMode
+ return (
+ setMode(mode.id)}
+ className={[
+ "rounded-full px-3 py-1.5 text-xs font-semibold transition-colors",
+ isSelected
+ ? "bg-foreground/10 text-foreground"
+ : "text-muted-foreground hover:text-foreground",
+ ].join(" ")}>
+ {mode.label}
+
+ )
+ })}
+
+
+
+
+
+
+ Agent lineup
+
+
+ {agentCapabilities.map((capability) => {
+ const roleId = capability.roleId!
+ const rec = recByRole.get(roleId)
+ const candidate = getModeCandidate(rec, selectedMode)
+
+ const providerColor = candidate
+ ? (PROVIDER_COLORS[candidate.provider] ?? "#94a3b8")
+ : "#94a3b8"
+
+ return (
+
+
+
+ {capability.name}
+
+
+
+ {candidate ? (
+
+ {candidate.displayName}
+
+ ) : (
+
+ View models
+
+ )}
+
+
+
+
+ )
+ })}
+
+
+
+
+
+ Skills included
+
+
+ {skillCapabilities.length > 0 ? (
+ skillCapabilities.map((capability) => (
+
+
+ {capability.name}
+
+
+ {capability.description}
+
+
+ ))
+ ) : (
+
+ No skills listed for this profile yet.
+
+ )}
+
+
+
+
+ {profileHowItWorks.length > 0 || selectedOutcome.whyItWorks.length > 0 ? (
+
+
+ Rationale
+
+
+
+
+ {selectedOutcome.builderProfile
+ ? "How it works"
+ : "Why it works"}
+
+
+ {profileHowItWorks.map((line) => (
+
+
+ {line}
+
+ ))}
+
+
+
+
+ Why it works
+
+
+ {selectedOutcome.whyItWorks.map((line) => (
+
+
+ {line}
+
+ ))}
+
+
+
+
+ ) : null}
+ >
+ ) : (
+
+
+
+ No objective selected
+
+
+ Pick an objective first, then open the profile view.
+
+
+
+ Back to objectives
+
+
+
+ )}
+
+ ) : null}
+
+
+ {/* Left rail: objective + mode */}
+
+
+
+ Select your objective
+
+
+
+
+
+ Optimize for
+
+
+ {OPTIMIZATION_MODES.map((mode) => {
+ const isSelected = mode.id === selectedMode
+ return (
+ setMode(mode.id)}
+ className={[
+ "rounded-full px-3 py-1.5 text-xs font-semibold transition-colors",
+ isSelected
+ ? "bg-foreground/10 text-foreground"
+ : "text-muted-foreground hover:text-foreground",
+ ].join(" ")}>
+ {mode.label}
+
+ )
+ })}
+
+
+
+
+ {EVAL_OUTCOMES.map((outcome) => {
+ const Icon = outcome.icon
+ const isSelected = outcome.id === effectiveOutcomeId
+
+ return (
+ setOutcome(isSelected ? null : outcome.id)}
+ className={[
+ "group w-full rounded-2xl border bg-card/35 p-4 text-left backdrop-blur-sm transition-all duration-200 hover:bg-card/55",
+ isSelected
+ ? "border-foreground/20 ring-1 ring-foreground/15"
+ : "border-border/50 hover:border-border",
+ ].join(" ")}>
+
+
+
+
+
+
+
+ {outcome.description}
+
+
+
+
+ )
+ })}
+
+
+
+ {/* Right rail: profile snapshot */}
+
+
+
+
+
+
+
+
+
+
+ Profile snapshot
+
+
+ {selectedOutcome
+ ? selectedOutcome.name
+ : "Pick an objective"}
+
+
+ {selectedOutcome
+ ? profileDescription
+ : "Select an objective to see the suggested lineup and default model."}
+
+
+
+
+
+ Optimized for: {selectedModeLabel}
+
+
+
+
+
+
+
+
+ Signal
+
+
+ {totalEvalRuns.toLocaleString()} runs
+
+
+
+
+ Agents
+
+
+ {agentCapabilities.length}
+
+
+
+
+ Default model
+
+
+ {objectiveDefaultModelLabel}
+
+
+
+
+
+
+
+
+
+ Agent lineup
+
+
+ Open candidates & settings
+
+
+
+
+ {selectedOutcome ? (
+
+ {agentCapabilities.map((capability) => {
+ const roleId = capability.roleId!
+ const rec = recByRole.get(roleId)
+ const candidate = getModeCandidate(rec, selectedMode)
+
+ const providerColor = candidate
+ ? (PROVIDER_COLORS[candidate.provider] ?? "#94a3b8")
+ : "#94a3b8"
+
+ return (
+
+
+
+
+
+ {capability.name}
+
+
+
+ {candidate ? (
+
+ {candidate.displayName}
+
+ ) : (
+
+ View models
+
+ )}
+
+
+
+
+
+ )
+ })}
+
+ ) : (
+
+
+ No objective selected
+
+
+ Pick an objective to see the recommended agent lineup.
+
+
+ )}
+
+
+ {selectedOutcome ? (
+ <>
+ {examplePrompt ? (
+
+
+ Example prompt
+
+
+ {examplePrompt}
+
+
+ ) : null}
+
+
+ Start in Roo Code Cloud
+
+
+
+ Learn more about this profile
+
+
+
+ >
+ ) : null}
+
+
+
+
+
+
+
+
+ ) : null}
+
+ {/* ── AI Coding Capability Over Time ─────────────────────────── */}
+
+ {/* Subtle atmospheric background */}
+
+
+
+
+
+
+ {/* Section header */}
+
+
+ AI Coding Capability{" "}
+
+ Over Time
+
+
+
+ Pass rates on our eval suite by model release date. Several current models hit 100% on
+ this suite.
+
+
+
+ {/* Chart container */}
+
+ {/* Provider legend */}
+
+ {Object.entries(PROVIDER_COLORS)
+ .filter(([provider]) => MODEL_TIMELINE.some((m) => m.provider === provider))
+ .map(([provider, color]) => (
+
+
+ {PROVIDER_DISPLAY[provider] ?? provider}
+
+ ))}
+
+ ●
+ Bigger dot = lower cost
+
+
+
+
+
+
+ {
+ const d = new Date(v)
+ return d.toLocaleDateString("en-US", {
+ month: "short",
+ year: "2-digit",
+ timeZone: "UTC",
+ })
+ }}
+ stroke="hsl(var(--muted-foreground))"
+ strokeOpacity={0.3}
+ tick={{ fontSize: 11, fill: "hsl(var(--muted-foreground))" }}
+ axisLine={false}
+ label={{
+ value: "Release Date",
+ position: "insideBottom",
+ offset: -10,
+ style: { fontSize: 11, fill: "hsl(var(--muted-foreground))" },
+ }}
+ />
+ `${v}%`}
+ stroke="hsl(var(--muted-foreground))"
+ strokeOpacity={0.3}
+ tick={{ fontSize: 11, fill: "hsl(var(--muted-foreground))" }}
+ axisLine={false}
+ label={{
+ value: "Eval Score (%)",
+ angle: -90,
+ position: "insideLeft",
+ offset: 10,
+ style: { fontSize: 11, fill: "hsl(var(--muted-foreground))" },
+ }}
+ />
+
+ {/* Trend line: dashed line from first to last */}
+ {trendLine && (
+
+ )}
+ {/* 100% reference line */}
+
+ }
+ cursor={{
+ strokeDasharray: "3 3",
+ stroke: "hsl(var(--muted-foreground))",
+ strokeOpacity: 0.3,
+ }}
+ />
+
+ {timelineData.map((entry, index) => (
+ |
+ ))}
+
+
+
+
+
+
+
+
+
+ {/* ── Footer / Methodology Section ───────────────────────────── */}
+
+
+
+ {/* Stats summary */}
+
+
+ {totalEvalRuns.toLocaleString()}+
+ {" "}
+ eval runs
+ •
+ 5 languages
+ •
+ Last updated:{" "}
+
+ {lastUpdated
+ ? new Date(lastUpdated).toLocaleDateString("en-US", {
+ year: "numeric",
+ month: "long",
+ day: "numeric",
+ timeZone: "UTC",
+ })
+ : "N/A"}
+
+
+
+ {/* Assumption note */}
+
+ Daily costs assume ~{TASKS_PER_DAY} tasks per agent per day (~6 productive hours including
+ overhead).
+
+
+ {/* Links */}
+
+
+
+ Our methodology
+
+
+
+ Raw eval data
+
+
+
+
+
+
+ >
+ )
+}
diff --git a/apps/web-roo-code/src/lib/eval-outcomes.ts b/apps/web-roo-code/src/lib/eval-outcomes.ts
new file mode 100644
index 00000000000..f6616f97466
--- /dev/null
+++ b/apps/web-roo-code/src/lib/eval-outcomes.ts
@@ -0,0 +1,461 @@
+import type { LucideIcon } from "lucide-react"
+import { Bug, CheckCircle2, GitPullRequest, Sparkles, Workflow } from "lucide-react"
+
+export type EvalOutcomeId = // ids do not always mirror the display name; each member is annotated with its `name` in EVAL_OUTCOMES
+ | "prototype_to_pr" // "Prototype → PR"
+ | "paper_cuts" // "Paper Cuts → Shipped"
+ | "sentry_triage" // "Customer Escalation → Resolved"
+ | "repro_to_fix" // "Bug Report → Fix"
+ | "review_guardrails" // "Idea → Prototype"
+ | "issue_to_pr" // "Issue → PR"
+
+export type EvalOutcomeCapability = { // one capability entry listed in an EvalOutcomeProfile
+ id: string // not unique across outcomes: e.g. "validation_loop" and "pr_ready_output" recur in several profiles
+ name: string
+ description: string
+ /**
+ * Optional roleId for capabilities that map directly to a role page.
+ * Non-role capabilities represent Roo Code Cloud behaviors (validation, PR packaging, etc.).
+ */
+ roleId?: string
+}
+
+export type EvalOutcomeProfile = { // detail payload attached to an outcome through EvalOutcome.builderProfile
+ title: string
+ description: string
+ /**
+ * Starter prompt shown in the UI to help users understand what to ask for.
+ * This is product copy, not an eval artifact.
+ */
+ examplePrompt?: string
+ capabilities: EvalOutcomeCapability[]
+ howItWorks: string[] // the entries in EVAL_OUTCOMES read as sequential steps, so order presumably matters when rendering
+}
+
+export type EvalOutcome = { // shape of one entry in the EVAL_OUTCOMES catalog
+ id: EvalOutcomeId
+ /** URL slug used for objective deep dives: /evals/recommendations/{slug} (looked up by getEvalOutcomeBySlug) */
+ slug: string
+ name: string
+ description: string
+ icon: LucideIcon
+ /**
+ * Ordered list of roleIds to suggest as a "setup".
+ * Keep roleIds stable even if display names evolve.
+ */
+ recommendedRoleIds: string[]
+ whyItWorks: string[]
+ /**
+ * Optional profile details used to render a more comprehensive “exoskeleton”
+ * for an outcome. Start with the most important outcomes and expand over time.
+ */
+ builderProfile?: EvalOutcomeProfile
+}
+
+export const EVAL_OUTCOMES: EvalOutcome[] = [ // outcome catalog; searched by isEvalOutcomeId (by id) and getEvalOutcomeBySlug (by slug)
+ {
+ id: "review_guardrails", // NOTE(review): id does not match this entry's slug/name ("Idea → Prototype") — confirm the mapping is intentional
+ slug: "idea-prototype",
+ name: "Idea → Prototype",
+ description: "Turn a vague idea into a working demo in your real codebase.",
+ icon: Sparkles,
+ recommendedRoleIds: ["autonomous", "senior"],
+ whyItWorks: [
+ "Optimizes for momentum: map the codebase fast, then build a working slice.",
+ "Senior builder keeps the prototype grounded in production constraints.",
+ ],
+ builderProfile: {
+ title: "Your Builder Profile",
+ description: "For turning an idea into a working demo in your repo.",
+ examplePrompt: `Objective: Idea → Prototype
+
+In this repo, turn this idea into a working demo: .
+
+Constraints:
+- Keep scope small and demo-first.
+- Use the existing stack and patterns in this codebase.
+
+Deliver:
+- A reviewable PR
+- A short walkthrough (how to run it, what works, what’s next)`,
+ capabilities: [
+ {
+ id: "autonomous_researcher",
+ name: "Autonomous Researcher",
+ description: "Maps the codebase, constraints, and best path forward before implementation starts.",
+ roleId: "autonomous",
+ },
+ {
+ id: "multi_file_builder",
+ name: "Senior Builder",
+ description: "Builds a working prototype directly in your repo across the files it touches.",
+ roleId: "senior",
+ },
+ {
+ id: "discovery_loop",
+ name: "Discovery loop",
+ description:
+ "Maps the codebase and constraints before making changes (so the prototype fits reality).",
+ },
+ {
+ id: "prototype_scaffold",
+ name: "Prototype scaffold",
+ description: "Creates the smallest working slice you can demo and build on.",
+ },
+ {
+ id: "demo_ready_output",
+ name: "Demo-ready output",
+ description:
+ "Delivers a reviewable diff plus a clear walkthrough of what’s working and what’s next.",
+ },
+ ],
+ howItWorks: [
+ "Clarify the objective and success criteria.",
+ "Explore the codebase and pick the smallest viable implementation path.",
+ "Build the prototype directly in the repo (no throwaway export/import step).",
+ "Deliver a demo-ready diff with notes for the next iteration.",
+ ],
+ },
+ },
+ {
+ id: "prototype_to_pr",
+ slug: "prototype-pr",
+ name: "Prototype → PR",
+ description: "Build a working prototype on the production codebase, then turn it into a reviewable diff.",
+ icon: Sparkles,
+ recommendedRoleIds: ["senior", "reviewer"],
+ whyItWorks: [
+ "Multi-file changes with a reviewer pass for coherence and edge cases.",
+ "Optimizes for shipping, not slides.",
+ ],
+ builderProfile: {
+ title: "Your Builder Profile",
+ description: "For turning a prototype into a reviewable PR on the production codebase.",
+ examplePrompt: `Objective: Prototype → PR
+
+Take the current prototype implementation and turn it into a reviewable PR.
+
+Do:
+- Tighten scope to the smallest shippable diff
+- Add/adjust tests, lint, and typechecks as needed
+
+Deliver:
+- A PR-ready diff with a plain-English summary and review notes`,
+ capabilities: [
+ {
+ id: "multi_file_builder",
+ name: "Multi-file Builder",
+ description: "Builds the prototype directly in your repo across the files it touches.",
+ roleId: "senior",
+ },
+ {
+ id: "reviewer_guardrails",
+ name: "Reviewer & Guardrails",
+ description: "Reviews the diff for correctness, edge cases, and coherence before you merge.",
+ roleId: "reviewer",
+ },
+ {
+ id: "environment_setup",
+ name: "Environment setup",
+ description:
+ "Bootstraps a working dev environment and runs the workflow without you fighting Git, installs, or tests.",
+ },
+ {
+ id: "validation_loop",
+ name: "Validation loop",
+ description: "Runs tests/lint/typechecks and iterates until it’s clean (or flags what’s blocked).",
+ },
+ {
+ id: "pr_ready_output",
+ name: "PR-ready output",
+ description: "Produces a focused diff plus a plain-English summary and review notes.",
+ },
+ {
+ id: "straight_line_merge",
+ name: "Straight-line to merge",
+ description:
+ "No export/import step: the work is already on the production codebase, so merge is a straight line.",
+ },
+ {
+ id: "scope_control",
+ name: "Scope control",
+ description: "Keeps diffs tight: smaller review surface, fewer surprises, and easier merges.",
+ },
+ ],
+ howItWorks: [
+ "Build a working prototype directly in the production codebase.",
+ "Convert the prototype into a tight diff (tests, cleanup, and safeguards).",
+ "Run a reviewer pass to catch edge cases and improve merge confidence.",
+ "Deliver a PR-ready result with context and next steps.",
+ ],
+ },
+ },
+ {
+ id: "issue_to_pr",
+ slug: "issue-pr",
+ name: "Issue → PR",
+ description: "Run end-to-end work in the background and come back to a reviewable result.",
+ icon: GitPullRequest,
+ recommendedRoleIds: ["autonomous", "reviewer"],
+ whyItWorks: [
+ "Handles out-of-band work while humans stay on the roadmap.",
+ "Pairs autonomy with guardrails for merge safety.",
+ ],
+ builderProfile: {
+ title: "Your Builder Profile",
+ description: "For turning an issue into a reviewable PR.",
+ examplePrompt: `Objective: Issue → PR
+
+Fix this issue in the repo: .
+
+Requirements:
+- Define “done” in 2-3 acceptance criteria
+- Implement the fix and validate it (tests/lint/typechecks)
+
+Deliver:
+- A reviewable PR with context and any follow-ups`,
+ capabilities: [
+ {
+ id: "autonomous_executor",
+ name: "Autonomous Executor",
+ description: "Runs the full loop (investigate → implement → validate) while you stay unblocked.",
+ roleId: "autonomous",
+ },
+ {
+ id: "reviewer_guardrails",
+ name: "Reviewer & Guardrails",
+ description: "Reviews the diff for correctness, edge cases, and merge safety.",
+ roleId: "reviewer",
+ },
+ {
+ id: "issue_intake",
+ name: "Issue intake",
+ description:
+ "Translates a request into scoped tasks, acceptance criteria, and a safe plan of attack.",
+ },
+ {
+ id: "validation_loop",
+ name: "Validation loop",
+ description: "Runs tests/lint/typechecks and iterates until it’s clean (or flags what’s blocked).",
+ },
+ {
+ id: "pr_ready_output",
+ name: "PR-ready output",
+ description: "Produces a focused diff plus a plain-English summary and review notes.",
+ },
+ ],
+ howItWorks: [
+ "Clarify the issue and define what “done” means.",
+ "Implement in the background with frequent validation checkpoints.",
+ "Run a reviewer pass to reduce merge risk.",
+ "Deliver a PR-ready result with context and next steps.",
+ ],
+ },
+ },
+ {
+ id: "sentry_triage", // NOTE(review): id does not match this entry's slug/name ("Customer Escalation → Resolved") — confirm the mapping is intentional
+ slug: "customer-escalation-resolved",
+ name: "Customer Escalation → Resolved",
+ description: "Triage a customer-blocking issue and ship the smallest safe fix.",
+ icon: Bug,
+ recommendedRoleIds: ["autonomous", "senior", "reviewer"],
+ whyItWorks: [
+ "Autonomous runs handle multi-step investigation and iteration.",
+ "Senior builder makes the final fix precise and production-safe.",
+ "Reviewer focuses on safety, correctness, and “does this hold up?”.",
+ ],
+ builderProfile: {
+ title: "Your Builder Profile",
+ description: "For resolving a customer escalation quickly and safely.",
+ examplePrompt: `Objective: Customer Escalation → Resolved
+
+We have a customer-blocking escalation:
+- Symptoms:
+- Context:
+
+Do:
+- Find the smallest safe fix with a clear blast-radius assessment
+- Add guardrails/tests where it makes sense
+
+Deliver:
+- A PR with the fix and a short “risk + rollout” note`,
+ capabilities: [
+ {
+ id: "autonomous_triage",
+ name: "Autonomous Triage",
+ description: "Investigates logs, context, and repro steps to converge on a fix quickly.",
+ roleId: "autonomous",
+ },
+ {
+ id: "senior_fixer",
+ name: "Senior Builder",
+ description: "Implements the smallest production-safe fix when the blast radius is unclear.",
+ roleId: "senior",
+ },
+ {
+ id: "reviewer_guardrails",
+ name: "Reviewer & Guardrails",
+ description: "Double-checks safety and correctness so speed doesn’t create regressions.",
+ roleId: "reviewer",
+ },
+ {
+ id: "repro_first",
+ name: "Repro-first",
+ description: "Prioritizes a minimal reproduction so we know the fix actually fixes the issue.",
+ },
+ {
+ id: "minimal_fix",
+ name: "Minimal safe fix",
+ description: "Ships the smallest change that unblocks customers, with a clear rollback story.",
+ },
+ {
+ id: "verification_artifacts",
+ name: "Verification artifacts",
+ description: "Provides proof (tests/logs/steps) that the fix works and what it covers.",
+ },
+ ],
+ howItWorks: [
+ "Gather context and reproduce the customer issue.",
+ "Implement the smallest safe fix with verification.",
+ "Run a reviewer pass to catch edge cases.",
+ "Deliver a PR-ready result plus rollout notes.",
+ ],
+ },
+ },
+ {
+ id: "repro_to_fix",
+ slug: "bug-report-fix",
+ name: "Bug Report → Fix",
+ description: "Reproduce, isolate, patch, and validate in one loop.",
+ icon: Workflow,
+ recommendedRoleIds: ["senior", "reviewer"],
+ whyItWorks: [
+ "Good default for ambiguous bugs that touch a few files.",
+ "Reviewer helps catch cross-team assumptions early.",
+ ],
+ builderProfile: {
+ title: "Your Builder Profile",
+ description: "For turning a bug report into a verified fix.",
+ examplePrompt: `Objective: Bug Report → Fix
+
+Fix this bug:
+- Report:
+- Expected vs actual:
+
+Do:
+- Reproduce if possible, then implement the fix
+- Validate with tests/lint/typechecks (or explain what’s blocked)
+
+Deliver:
+- A PR with the fix and verification notes`,
+ capabilities: [
+ {
+ id: "bug_fixer",
+ name: "Bug Fixer",
+ description: "Reproduces and fixes bugs efficiently across the files involved.",
+ roleId: "senior",
+ },
+ {
+ id: "reviewer_guardrails",
+ name: "Reviewer & Guardrails",
+ description: "Reviews the diff for correctness and regression risk before it ships.",
+ roleId: "reviewer",
+ },
+ {
+ id: "repro_harness",
+ name: "Repro harness",
+ description:
+ "Creates a minimal reproduction path (tests or steps) to prevent “can’t repro” stalls.",
+ },
+ {
+ id: "fix_with_tests",
+ name: "Fix with tests",
+ description: "Pairs the fix with verification so it doesn’t regress on the next change.",
+ },
+ {
+ id: "validation_loop",
+ name: "Validation loop",
+ description: "Runs tests/lint/typechecks and iterates until it’s clean (or flags what’s blocked).",
+ },
+ ],
+ howItWorks: [
+ "Reproduce the issue and isolate the root cause.",
+ "Implement a targeted fix with verification.",
+ "Run a reviewer pass to reduce regression risk.",
+ "Deliver a PR-ready result with steps to validate.",
+ ],
+ },
+ },
+ {
+ id: "paper_cuts",
+ slug: "paper-cuts-shipped",
+ name: "Paper Cuts → Shipped",
+ description: "Fix the small stuff without dragging engineers off big projects.",
+ icon: CheckCircle2,
+ recommendedRoleIds: ["junior", "reviewer"],
+ whyItWorks: [
+ "Small diffs are high-leverage when the work is well-scoped.",
+ "Reviewer keeps the quality bar and reduces surprise.",
+ ],
+ builderProfile: {
+ title: "Your Builder Profile",
+ description: "For shipping small fixes quickly, cleanly, and safely.",
+ examplePrompt: `Objective: Paper Cuts → Shipped
+
+Ship these small fixes in this repo:
+-
+-
+-
+
+Constraints:
+- Keep diffs small and easy to review
+- Don’t change behavior unless it’s clearly a bug
+
+Deliver:
+- A PR with grouped, well-scoped commits and a short summary`,
+ capabilities: [
+ {
+ id: "small_diff_builder",
+ name: "Small-diff Builder",
+ description: "Ships focused fixes with low review surface area and minimal risk.",
+ roleId: "junior",
+ },
+ {
+ id: "reviewer_guardrails",
+ name: "Reviewer & Guardrails",
+ description: "Catches edge cases and keeps changes aligned with team conventions.",
+ roleId: "reviewer",
+ },
+ {
+ id: "scope_control",
+ name: "Scope control",
+ description: "Keeps changes tight: fewer surprises, faster reviews, easier merges.",
+ },
+ {
+ id: "quick_validation",
+ name: "Quick validation",
+ description: "Runs the relevant checks and flags what’s safe to skip (and what’s not).",
+ },
+ {
+ id: "pr_ready_output",
+ name: "PR-ready output",
+ description: "Produces a focused diff plus a plain-English summary and review notes.",
+ },
+ ],
+ howItWorks: [
+ "Pick the smallest fix that moves the needle.",
+ "Implement with tight scope control.",
+ "Validate quickly and review for conventions.",
+ "Deliver a PR-ready result you can merge confidently.",
+ ],
+ },
+ },
+]
+
+export function isEvalOutcomeId(value: string): value is EvalOutcomeId { // type guard: true when `value` is the id of some EVAL_OUTCOMES entry
+ return EVAL_OUTCOMES.findIndex(({ id }) => id === value) !== -1
+}
+
+export function getEvalOutcomeBySlug(slug: string): EvalOutcome | undefined { // first EVAL_OUTCOMES entry whose slug equals `slug`, else undefined
+ return EVAL_OUTCOMES.find(({ slug: candidateSlug }) => candidateSlug === slug)
+}
diff --git a/apps/web-roo-code/src/lib/mock-recommendations.ts b/apps/web-roo-code/src/lib/mock-recommendations.ts
new file mode 100644
index 00000000000..2e30b5ec0b9
--- /dev/null
+++ b/apps/web-roo-code/src/lib/mock-recommendations.ts
@@ -0,0 +1,890 @@
+// ---------------------------------------------------------------------------
+// Eval Recommendations: Types + Mock Data (S1.1a)
+// ---------------------------------------------------------------------------
+// This file defines the API contract for the /evals/workers recommendation pages.
+// The backend (Sprint 3-4) will produce data matching these exact types.
+// ---------------------------------------------------------------------------
+
+// ── Constants ──────────────────────────────────────────────────────────────
+
+/**
+ * Estimated tasks per agent per day.
+ * Assumes ~6 productive hours with overhead for setup, review, and iteration.
+ * One human engineer typically manages 2-3 agents throughout a workday.
+ */
+export const TASKS_PER_DAY = 80 // multiplier behind every `estimatedDailyCost` in the candidate lists; the "$N/day" figures in caveat strings are hand-written from it
+
+// ── Types ──────────────────────────────────────────────────────────────────
+
+/** Engineer role definition: maps task complexity to a recommendation tier. */
+export type EngineerRole = {
+ id: string // ENGINEER_ROLES uses "junior" | "senior" | "staff" | "reviewer" | "autonomous"; presumably what roleId fields refer to — confirm
+ name: string
+ /** Short descriptor shown under the profile name (scope, mode, etc.). Despite the field name, values are labels like "Scope: single-file", not a salary. */
+ salaryRange: string
+ description: string
+ bestFor: string[]
+ strengths: string[]
+ weaknesses: string[]
+ icon: string // icon name as a plain string (e.g. "Code", "Bot"); presumably resolved to a component by the UI — confirm
+}
+
+/** Language-specific eval scores (0–100), one per evaluated language; caveat strings quote them as percentages. */
+export type LanguageScores = {
+ go: number
+ java: number
+ javascript: number
+ python: number
+ rust: number
+}
+
+/** Model inference settings used during evaluation. */
+export type ModelSettings = {
+ temperature: number // every candidate visible in this file uses 0
+ reasoningEffort?: string // values seen in the candidate data: "low", "medium"; omitted for most models
+}
+
+/** A model candidate evaluated for a specific role. */
+export type ModelCandidate = {
+ provider: string
+ modelId: string
+ displayName: string
+ compositeScore: number // weighted score; the weights are role-specific, so the same model can score differently per role
+ tier: "best" | "recommended" | "situational" | "not-recommended"
+ tags: string[] // values seen in the candidate lists include "best-value", "budget-hire", "speed-hire", "top-performer"
+ successRate: number // percent (caveats quote it as e.g. "94%")
+ avgCostPerTask: number // USD per task (caveats quote it as e.g. "$0.48/task")
+ /** Estimated daily cost: avgCostPerTask × TASKS_PER_DAY */
+ estimatedDailyCost: number
+ avgTimePerTask: number // seconds (260.0 is described in a caveat as "4.3 min/task")
+ languageScores: LanguageScores
+ settings: ModelSettings
+ caveats?: string[]
+}
+
+/** Full recommendation payload for a single role. */
+export type RoleRecommendation = {
+ roleId: string
+ role: EngineerRole
+ lastUpdated: string // presumably a date string parseable by `new Date()` — TODO confirm the exact format
+ totalEvalRuns: number
+ totalExercises: number
+ best: ModelCandidate[]
+ budgetHire: ModelCandidate | null // presumably the candidate tagged "budget-hire", null when there is none — confirm
+ speedHire: ModelCandidate | null // presumably the candidate tagged "speed-hire", null when there is none — confirm
+ allCandidates: ModelCandidate[]
+}
+
+// ── Engineer Role Configs ──────────────────────────────────────────────────
+
+const ENGINEER_ROLES: EngineerRole[] = [ // module-private role configs; ids: "junior", "senior", "staff", "reviewer", "autonomous"
+ {
+ id: "junior",
+ name: "Single-file Builder",
+ salaryRange: "Scope: single-file",
+ description:
+ "Best for tight diffs: boilerplate, small fixes, and test updates. Great when the work is clear and bounded.",
+ bestFor: ["Small fixes", "Boilerplate", "Test updates", "Simple implementations"],
+ strengths: ["Fast iteration", "Stays close to the requested change", "Great for well-scoped diffs"],
+ weaknesses: ["Not ideal for cross-cutting work", "Can miss edge cases in complex systems"],
+ icon: "Code",
+ },
+ {
+ id: "senior",
+ name: "Multi-file Builder",
+ salaryRange: "Scope: multi-file",
+ description:
+ "For most day-to-day shipping: feature work across a few files, refactors, and debugging with solid consistency.",
+ bestFor: ["Feature work", "Multi-file refactors", "Debugging", "Integrations"],
+ strengths: [
+ "Reliable for common product work",
+ "Handles multi-file changes and dependencies",
+ "Consistent across all five languages",
+ ],
+ weaknesses: ["Overkill for trivial diffs", "May need help on cross-cutting architecture"],
+ icon: "GitBranch",
+ },
+ {
+ id: "staff",
+ name: "Architecture & Refactor",
+ salaryRange: "Scope: cross-cutting",
+ description:
+ "For ambiguity and cross-cutting changes: architecture decisions, complex refactors, and work where correctness matters more than speed.",
+ bestFor: ["Complex refactors", "Architecture changes", "Ambiguous requirements", "System design"],
+ strengths: [
+ "Strong multi-step reasoning",
+ "Good at navigating bigger codebases",
+ "Better at making safe, coherent changes",
+ ],
+ weaknesses: ["Overkill for simple diffs", "Still needs human review before merge"],
+ icon: "Building2",
+ },
+ {
+ id: "reviewer",
+ name: "Reviewer & Guardrails",
+ salaryRange: "Mode: review",
+ description:
+ "For PR feedback, security review, and design critique. Use this to improve quality and reduce surprises before merge.",
+ bestFor: ["Code review", "PR feedback", "Security analysis", "Design critique", "Refactor guidance"],
+ strengths: [
+ "Catches subtle bugs and logic errors",
+ "Provides actionable suggestions with context",
+ "Understands cross-file impact of changes",
+ ],
+ weaknesses: [
+ "Not for writing features end-to-end",
+ "Not a replacement for CI and linters",
+ "Review quality varies by codebase size",
+ ],
+ icon: "Search",
+ },
+ {
+ id: "autonomous",
+ name: "Autonomous Delivery",
+ salaryRange: "Mode: end-to-end",
+ description:
+ "For issue-to-PR workflows and long-running tasks. Best when you want an agent to run, iterate, and bring back a reviewable result.",
+ bestFor: [
+ "Issue-to-PR workflows",
+ "Multi-step debugging",
+ "Feature implementation from spec",
+ "Long-running tasks",
+ "Batch operations",
+ ],
+ strengths: [
+ "Completes tasks end-to-end with minimal guidance",
+ "Recovers from errors and retries automatically",
+ "Handles ambiguous requirements independently",
+ ],
+ weaknesses: [
+ "Higher cost per completed task due to retries",
+ "May take unexpected approaches without oversight",
+ "Results need review before merging",
+ ],
+ icon: "Bot",
+ },
+]
+
+// ── Model Candidates (derived from roocode.com/evals data) ─────────────────
+// Cost per task = total run cost ÷ 120 exercises
+// Time per task = total duration (seconds) ÷ 120 exercises
+// Composite scores computed using role-specific weights:
+// Junior: success 50%, speed 20%, cost 25%, quality 5%
+// Senior: success 40%, quality 25%, cost 20%, speed 15%
+// Staff: success 40%, quality 35%, cost 15%, speed 10%
+// Quality = consistency across languages (lower variance → higher score)
+
+// --- Junior Role Candidates -------------------------------------------------
+
+const juniorCandidates: ModelCandidate[] = [ // candidates for the "junior" role, listed by descending compositeScore
+ {
+ provider: "xai",
+ modelId: "grok-4-fast",
+ displayName: "Grok 4 Fast",
+ compositeScore: 94,
+ tier: "best",
+ tags: ["best-value"],
+ successRate: 97,
+ avgCostPerTask: 0.029,
+ estimatedDailyCost: 0.029 * TASKS_PER_DAY,
+ avgTimePerTask: 144.0,
+ languageScores: { go: 97, java: 96, javascript: 98, python: 100, rust: 97 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-mini",
+ displayName: "GPT-5 Mini",
+ compositeScore: 92,
+ tier: "best",
+ tags: [],
+ successRate: 99,
+ avgCostPerTask: 0.028,
+ estimatedDailyCost: 0.028 * TASKS_PER_DAY,
+ avgTimePerTask: 173.0,
+ languageScores: { go: 100, java: 98, javascript: 100, python: 100, rust: 97 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "xai",
+ modelId: "grok-code-fast-1",
+ displayName: "Grok Code Fast 1",
+ compositeScore: 85,
+ tier: "best",
+ tags: [],
+ successRate: 90,
+ avgCostPerTask: 0.057,
+ estimatedDailyCost: 0.057 * TASKS_PER_DAY,
+ avgTimePerTask: 146.0,
+ languageScores: { go: 92, java: 91, javascript: 88, python: 94, rust: 83 },
+ settings: { temperature: 0 },
+ caveats: ["Weaker on Rust (83%): consider alternatives for Rust-heavy tasks"],
+ },
+ {
+ provider: "google",
+ modelId: "gemini-2.5-flash",
+ displayName: "Gemini 2.5 Flash",
+ compositeScore: 82,
+ tier: "recommended",
+ tags: ["speed-hire"],
+ successRate: 90,
+ avgCostPerTask: 0.118,
+ estimatedDailyCost: 0.118 * TASKS_PER_DAY,
+ avgTimePerTask: 109.5,
+ languageScores: { go: 89, java: 91, javascript: 92, python: 85, rust: 90 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1-mini",
+ displayName: "GPT-4.1 Mini",
+ compositeScore: 77,
+ tier: "recommended",
+ tags: [],
+ successRate: 83,
+ avgCostPerTask: 0.073,
+ estimatedDailyCost: 0.073 * TASKS_PER_DAY,
+ avgTimePerTask: 158.5,
+ languageScores: { go: 81, java: 84, javascript: 94, python: 76, rust: 70 },
+ settings: { temperature: 0 },
+ caveats: ["Inconsistent across languages: Rust (70%) to JavaScript (94%)"], // range quotes the min/max of languageScores above
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-haiku-4-5",
+ displayName: "Claude Haiku 4.5",
+ compositeScore: 77,
+ tier: "recommended",
+ tags: [],
+ successRate: 95,
+ avgCostPerTask: 0.159,
+ estimatedDailyCost: 0.159 * TASKS_PER_DAY,
+ avgTimePerTask: 139.0,
+ languageScores: { go: 92, java: 93, javascript: 94, python: 97, rust: 100 },
+ settings: { temperature: 0 },
+ caveats: ["Most expensive in junior tier. Consider Grok 4 Fast for better cost-to-quality ratio."],
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-nano",
+ displayName: "GPT-5 Nano",
+ compositeScore: 73,
+ tier: "situational",
+ tags: ["budget-hire"],
+ successRate: 78,
+ avgCostPerTask: 0.013,
+ estimatedDailyCost: 0.013 * TASKS_PER_DAY,
+ avgTimePerTask: 276.5,
+ languageScores: { go: 86, java: 73, javascript: 76, python: 79, rust: 77 },
+ settings: { temperature: 0 },
+ caveats: ["Cheapest option but slowest: 4.6 min/task average"],
+ },
+ {
+ provider: "deepseek",
+ modelId: "deepseek-v3",
+ displayName: "DeepSeek V3",
+ compositeScore: 66,
+ tier: "situational",
+ tags: [],
+ successRate: 77,
+ avgCostPerTask: 0.107,
+ estimatedDailyCost: 0.107 * TASKS_PER_DAY,
+ avgTimePerTask: 216.0,
+ languageScores: { go: 83, java: 76, javascript: 82, python: 76, rust: 67 },
+ settings: { temperature: 0 },
+ caveats: ["Weakest on Rust (67%)", "Open-source model, self-hostable"],
+ },
+]
+
+// --- Senior Role Candidates -------------------------------------------------
+
+const seniorCandidates: ModelCandidate[] = [ // candidates for the "senior" role, listed by descending compositeScore
+ {
+ provider: "moonshot",
+ modelId: "kimi-k2-0905",
+ displayName: "Kimi K2",
+ compositeScore: 95,
+ tier: "best",
+ tags: ["budget-hire", "best-value"],
+ successRate: 94,
+ avgCostPerTask: 0.127,
+ estimatedDailyCost: 0.127 * TASKS_PER_DAY,
+ avgTimePerTask: 112.0,
+ languageScores: { go: 94, java: 91, javascript: 96, python: 97, rust: 93 },
+ settings: { temperature: 0 },
+ caveats: ["Tested via Groq; latency may vary by provider"],
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1",
+ displayName: "GPT-4.1",
+ compositeScore: 87,
+ tier: "best",
+ tags: [],
+ successRate: 91,
+ avgCostPerTask: 0.322,
+ estimatedDailyCost: 0.322 * TASKS_PER_DAY,
+ avgTimePerTask: 139.5,
+ languageScores: { go: 92, java: 91, javascript: 90, python: 94, rust: 90 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-sonnet-4",
+ displayName: "Claude Sonnet 4",
+ compositeScore: 84,
+ tier: "best",
+ tags: ["top-performer"],
+ successRate: 98,
+ avgCostPerTask: 0.33,
+ estimatedDailyCost: 0.33 * TASKS_PER_DAY,
+ avgTimePerTask: 167.5,
+ languageScores: { go: 94, java: 100, javascript: 98, python: 100, rust: 97 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-medium",
+ displayName: "GPT-5 (Medium)",
+ compositeScore: 81,
+ tier: "recommended",
+ tags: [],
+ successRate: 98,
+ avgCostPerTask: 0.193,
+ estimatedDailyCost: 0.193 * TASKS_PER_DAY,
+ avgTimePerTask: 260.0,
+ languageScores: { go: 97, java: 98, javascript: 100, python: 100, rust: 93 },
+ settings: { temperature: 0, reasoningEffort: "medium" },
+ caveats: ["Slowest in tier: 4.3 min/task average"],
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-3.7-sonnet",
+ displayName: "Claude 3.7 Sonnet",
+ compositeScore: 79,
+ tier: "recommended",
+ tags: [],
+ successRate: 95,
+ avgCostPerTask: 0.313,
+ estimatedDailyCost: 0.313 * TASKS_PER_DAY,
+ avgTimePerTask: 176.5,
+ languageScores: { go: 92, java: 98, javascript: 94, python: 100, rust: 93 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-3.5-sonnet",
+ displayName: "Claude 3.5 Sonnet",
+ compositeScore: 78,
+ tier: "recommended",
+ tags: ["speed-hire"],
+ successRate: 90,
+ avgCostPerTask: 0.208,
+ estimatedDailyCost: 0.208 * TASKS_PER_DAY,
+ avgTimePerTask: 108.5,
+ languageScores: { go: 94, java: 91, javascript: 92, python: 88, rust: 80 },
+ settings: { temperature: 0 },
+ caveats: ["Previous generation; weaker on Rust (80%)"],
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-low",
+ displayName: "GPT-5 (Low)",
+ compositeScore: 76,
+ tier: "situational",
+ tags: [],
+ successRate: 95,
+ avgCostPerTask: 0.135,
+ estimatedDailyCost: 0.135 * TASKS_PER_DAY,
+ avgTimePerTask: 175.0,
+ languageScores: { go: 100, java: 96, javascript: 86, python: 100, rust: 100 },
+ settings: { temperature: 0, reasoningEffort: "low" },
+ caveats: ["Weak on JavaScript (86%) compared to other languages"],
+ },
+ {
+ provider: "google",
+ modelId: "gemini-2.5-pro",
+ displayName: "Gemini 2.5 Pro",
+ compositeScore: 73,
+ tier: "situational",
+ tags: [],
+ successRate: 96,
+ avgCostPerTask: 0.482,
+ estimatedDailyCost: 0.482 * TASKS_PER_DAY,
+ avgTimePerTask: 188.5,
+ languageScores: { go: 97, java: 91, javascript: 96, python: 100, rust: 97 },
+ settings: { temperature: 0 },
+ caveats: ["Most expensive in this tier: $39/day ($0.48/task)"], // dollar figures are hard-coded text: 0.482 × 80 ≈ 38.56
+ },
+]
+
+// --- Staff Role Candidates --------------------------------------------------
+
+const staffCandidates: ModelCandidate[] = [ // candidates for the "staff" role, listed by descending compositeScore
+ {
+ provider: "openai",
+ modelId: "gpt-5.2-med",
+ displayName: "GPT 5.2 (Med)",
+ compositeScore: 99,
+ tier: "best",
+ tags: ["budget-hire", "best-value"],
+ successRate: 100,
+ avgCostPerTask: 0.104,
+ estimatedDailyCost: 0.104 * TASKS_PER_DAY,
+ avgTimePerTask: 105.5,
+ languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 },
+ settings: { temperature: 0, reasoningEffort: "medium" },
+ caveats: ["100% pass rate at $8/day ($0.10/task): best cost-to-quality ratio in this role"], // dollar figures are hard-coded text: 0.104 × 80 ≈ 8.32
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-opus-4-6",
+ displayName: "Opus 4.6",
+ compositeScore: 98,
+ tier: "best",
+ tags: ["speed-hire", "top-performer"],
+ successRate: 100,
+ avgCostPerTask: 0.412,
+ estimatedDailyCost: 0.412 * TASKS_PER_DAY,
+ avgTimePerTask: 76.5,
+ languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-sonnet-4-5",
+ displayName: "Claude Sonnet 4.5",
+ compositeScore: 97,
+ tier: "best",
+ tags: [],
+ successRate: 100,
+ avgCostPerTask: 0.32,
+ estimatedDailyCost: 0.32 * TASKS_PER_DAY,
+ avgTimePerTask: 103.0,
+ languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-opus-4-5",
+ displayName: "Opus 4.5",
+ compositeScore: 96,
+ tier: "recommended",
+ tags: [],
+ successRate: 100,
+ avgCostPerTask: 0.419,
+ estimatedDailyCost: 0.419 * TASKS_PER_DAY,
+ avgTimePerTask: 124.0,
+ languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "google",
+ modelId: "gemini-3-pro-preview",
+ displayName: "Gemini 3 Pro Preview",
+ compositeScore: 95,
+ tier: "recommended",
+ tags: [],
+ successRate: 100,
+ avgCostPerTask: 0.276,
+ estimatedDailyCost: 0.276 * TASKS_PER_DAY,
+ avgTimePerTask: 164.0,
+ languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-opus-4-1",
+ displayName: "Opus 4.1",
+ compositeScore: 73,
+ tier: "situational",
+ tags: [],
+ successRate: 98,
+ avgCostPerTask: 1.168,
+ estimatedDailyCost: 1.168 * TASKS_PER_DAY,
+ avgTimePerTask: 211.5,
+ languageScores: { go: 97, java: 96, javascript: 98, python: 100, rust: 100 },
+ settings: { temperature: 0 },
+ caveats: ["$93/day ($1.17/task), 11× the cost of the top pick"], // "top pick" = first entry above (0.104/task): 1.168 / 0.104 ≈ 11.2
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-medium", // also listed under the senior role with the same raw metrics but a different compositeScore and tier
+ displayName: "GPT-5 (Medium)",
+ compositeScore: 71,
+ tier: "situational",
+ tags: [],
+ successRate: 98,
+ avgCostPerTask: 0.193,
+ estimatedDailyCost: 0.193 * TASKS_PER_DAY,
+ avgTimePerTask: 260.0,
+ languageScores: { go: 97, java: 98, javascript: 100, python: 100, rust: 93 },
+ settings: { temperature: 0, reasoningEffort: "medium" },
+ caveats: ["Slowest in tier: 4.3 min/task average"],
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-opus-4",
+ displayName: "Opus 4",
+ compositeScore: 57,
+ tier: "not-recommended",
+ tags: [],
+ successRate: 94,
+ avgCostPerTask: 1.436,
+ estimatedDailyCost: 1.436 * TASKS_PER_DAY,
+ avgTimePerTask: 235.0,
+ languageScores: { go: 92, java: 91, javascript: 94, python: 94, rust: 100 },
+ settings: { temperature: 0 },
+ caveats: [
+ "Most expensive model tested: $115/day ($1.44/task)",
+ "Lower success rate (94%) despite highest cost",
+ ],
+ },
+]
+
+// --- Architecture Reviewer Candidates ---------------------------------------
+// Composite scoring: quality 50%, success 30%, cost 15%, speed 5%
+// Quality = consistency across languages (lower variance → higher score)
+
+const reviewerCandidates: ModelCandidate[] = [ // Listed highest compositeScore first; buildRecommendation re-sorts by compositeScore regardless.
+ {
+ provider: "openai",
+ modelId: "gpt-5.2-med",
+ displayName: "GPT 5.2 (Med)",
+ compositeScore: 98,
+ tier: "best",
+ tags: ["budget-hire", "best-value"],
+ successRate: 100,
+ avgCostPerTask: 0.104,
+ estimatedDailyCost: 0.104 * TASKS_PER_DAY,
+ avgTimePerTask: 105.5,
+ languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 },
+ settings: { temperature: 0, reasoningEffort: "medium" },
+ caveats: ["100% consistency across all languages: ideal reviewer at $8/day"],
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-opus-4-6",
+ displayName: "Opus 4.6",
+ compositeScore: 95,
+ tier: "best",
+ tags: ["speed-hire", "top-performer"],
+ successRate: 100,
+ avgCostPerTask: 0.412,
+ estimatedDailyCost: 0.412 * TASKS_PER_DAY,
+ avgTimePerTask: 76.5,
+ languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-sonnet-4-5",
+ displayName: "Claude Sonnet 4.5",
+ compositeScore: 94,
+ tier: "best",
+ tags: [],
+ successRate: 100,
+ avgCostPerTask: 0.32,
+ estimatedDailyCost: 0.32 * TASKS_PER_DAY,
+ avgTimePerTask: 103.0,
+ languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-sonnet-4",
+ displayName: "Claude Sonnet 4",
+ compositeScore: 90,
+ tier: "recommended",
+ tags: ["top-performer"],
+ successRate: 98,
+ avgCostPerTask: 0.33,
+ estimatedDailyCost: 0.33 * TASKS_PER_DAY,
+ avgTimePerTask: 167.5,
+ languageScores: { go: 94, java: 100, javascript: 98, python: 100, rust: 97 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-haiku-4-5",
+ displayName: "Claude Haiku 4.5",
+ compositeScore: 88,
+ tier: "recommended",
+ tags: [],
+ successRate: 95,
+ avgCostPerTask: 0.159,
+ estimatedDailyCost: 0.159 * TASKS_PER_DAY,
+ avgTimePerTask: 139.0,
+ languageScores: { go: 92, java: 93, javascript: 94, python: 97, rust: 100 },
+ settings: { temperature: 0 },
+ caveats: ["Budget reviewer option: good consistency at lower cost"],
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-3.7-sonnet",
+ displayName: "Claude 3.7 Sonnet",
+ compositeScore: 86,
+ tier: "recommended",
+ tags: [],
+ successRate: 95,
+ avgCostPerTask: 0.313,
+ estimatedDailyCost: 0.313 * TASKS_PER_DAY,
+ avgTimePerTask: 176.5,
+ languageScores: { go: 92, java: 98, javascript: 94, python: 100, rust: 93 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "google",
+ modelId: "gemini-2.5-pro",
+ displayName: "Gemini 2.5 Pro",
+ compositeScore: 82,
+ tier: "situational",
+ tags: [],
+ successRate: 96,
+ avgCostPerTask: 0.482,
+ estimatedDailyCost: 0.482 * TASKS_PER_DAY,
+ avgTimePerTask: 188.5,
+ languageScores: { go: 97, java: 91, javascript: 96, python: 100, rust: 97 },
+ settings: { temperature: 0 },
+ caveats: ["Most expensive reviewer: $39/day ($0.48/task)", "More variable across languages than top picks"],
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1",
+ displayName: "GPT-4.1",
+ compositeScore: 80,
+ tier: "situational",
+ tags: [],
+ successRate: 91,
+ avgCostPerTask: 0.322,
+ estimatedDailyCost: 0.322 * TASKS_PER_DAY,
+ avgTimePerTask: 139.5,
+ languageScores: { go: 92, java: 91, javascript: 90, python: 94, rust: 90 },
+ settings: { temperature: 0 },
+ caveats: ["Lower consistency across languages than Anthropic alternatives"],
+ },
+]
+
+// --- Autonomous Agent Candidates --------------------------------------------
+// Composite scoring: success 35%, quality 35%, cost 20%, speed 10%
+// Focused on end-to-end task completion and error recovery
+
+const autonomousCandidates: ModelCandidate[] = [ // Listed highest compositeScore first. avgTimePerTask reads as seconds (260.0 is captioned "4.3 min/task" below).
+ {
+ provider: "openai",
+ modelId: "gpt-5.2-med",
+ displayName: "GPT 5.2 (Med)",
+ compositeScore: 97,
+ tier: "best",
+ tags: ["best-value", "speed-hire"],
+ successRate: 100,
+ avgCostPerTask: 0.104,
+ estimatedDailyCost: 0.104 * TASKS_PER_DAY,
+ avgTimePerTask: 105.5,
+ languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 },
+ settings: { temperature: 0, reasoningEffort: "medium" },
+ caveats: ["Perfect success rate + fast completion: ideal autonomous agent at $8/day"],
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-mini",
+ displayName: "GPT-5 Mini",
+ compositeScore: 93,
+ tier: "best",
+ tags: ["budget-hire"],
+ successRate: 99,
+ avgCostPerTask: 0.028,
+ estimatedDailyCost: 0.028 * TASKS_PER_DAY,
+ avgTimePerTask: 173.0,
+ languageScores: { go: 100, java: 98, javascript: 100, python: 100, rust: 97 },
+ settings: { temperature: 0 },
+ caveats: ["Cheapest autonomous option at $2/day with near-perfect success"],
+ },
+ {
+ provider: "xai",
+ modelId: "grok-4-fast",
+ displayName: "Grok 4 Fast",
+ compositeScore: 92,
+ tier: "best",
+ tags: [],
+ successRate: 97,
+ avgCostPerTask: 0.029,
+ estimatedDailyCost: 0.029 * TASKS_PER_DAY,
+ avgTimePerTask: 144.0,
+ languageScores: { go: 97, java: 96, javascript: 98, python: 100, rust: 97 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-sonnet-4",
+ displayName: "Claude Sonnet 4",
+ compositeScore: 87,
+ tier: "recommended",
+ tags: ["top-performer"],
+ successRate: 98,
+ avgCostPerTask: 0.33,
+ estimatedDailyCost: 0.33 * TASKS_PER_DAY,
+ avgTimePerTask: 167.5,
+ languageScores: { go: 94, java: 100, javascript: 98, python: 100, rust: 97 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "moonshot",
+ modelId: "kimi-k2-0905",
+ displayName: "Kimi K2",
+ compositeScore: 86,
+ tier: "recommended",
+ tags: [],
+ successRate: 94,
+ avgCostPerTask: 0.127,
+ estimatedDailyCost: 0.127 * TASKS_PER_DAY,
+ avgTimePerTask: 112.0,
+ languageScores: { go: 94, java: 91, javascript: 96, python: 97, rust: 93 },
+ settings: { temperature: 0 },
+ caveats: ["Tested via Groq; latency may vary by provider"],
+ },
+ {
+ provider: "anthropic",
+ modelId: "claude-haiku-4-5",
+ displayName: "Claude Haiku 4.5",
+ compositeScore: 85,
+ tier: "recommended",
+ tags: [],
+ successRate: 95,
+ avgCostPerTask: 0.159,
+ estimatedDailyCost: 0.159 * TASKS_PER_DAY,
+ avgTimePerTask: 139.0,
+ languageScores: { go: 92, java: 93, javascript: 94, python: 97, rust: 100 },
+ settings: { temperature: 0 },
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-low",
+ displayName: "GPT-5 (Low)",
+ compositeScore: 82,
+ tier: "situational",
+ tags: [],
+ successRate: 95,
+ avgCostPerTask: 0.135,
+ estimatedDailyCost: 0.135 * TASKS_PER_DAY,
+ avgTimePerTask: 175.0,
+ languageScores: { go: 100, java: 96, javascript: 86, python: 100, rust: 100 },
+ settings: { temperature: 0, reasoningEffort: "low" },
+ caveats: ["Weak on JavaScript (86%) compared to other languages"],
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-medium",
+ displayName: "GPT-5 (Medium)",
+ compositeScore: 80,
+ tier: "situational",
+ tags: [],
+ successRate: 98,
+ avgCostPerTask: 0.193,
+ estimatedDailyCost: 0.193 * TASKS_PER_DAY,
+ avgTimePerTask: 260.0,
+ languageScores: { go: 97, java: 98, javascript: 100, python: 100, rust: 93 },
+ settings: { temperature: 0, reasoningEffort: "medium" },
+ caveats: ["Slowest in tier: 4.3 min/task average"],
+ },
+]
+
+// ── Recommendation Builders ────────────────────────────────────────────────
+
+/** Returns the lowest-cost candidate tagged "budget-hire", or `null` when no candidate carries that tag. */
+function findBudgetHire(candidates: ModelCandidate[]): ModelCandidate | null {
+  const tagged = candidates.filter(({ tags }) => tags.includes("budget-hire"))
+  // Single pass; strict `<` keeps the earliest candidate on equal cost, as a stable ascending sort would.
+  return tagged.reduce<ModelCandidate | null>((min, c) => (!min || c.avgCostPerTask < min.avgCostPerTask ? c : min), null)
+}
+
+/** Returns the candidate with the lowest `avgTimePerTask` whose tier is not "not-recommended" (tags are not consulted), or `null`. */
+function findSpeedHire(candidates: ModelCandidate[]): ModelCandidate | null {
+  const eligible = candidates.filter(({ tier }) => tier !== "not-recommended")
+  // Single pass; strict `<` keeps the earliest candidate on equal time, as a stable ascending sort would.
+  return eligible.reduce<ModelCandidate | null>((min, c) => (!min || c.avgTimePerTask < min.avgTimePerTask ? c : min), null)
+}
+
+/**
+ * Builds the recommendation payload for one role.
+ * The candidates are copied and ranked by `compositeScore`, highest first, so the input array is not
+ * mutated. `best` holds at most the first three ranked candidates whose tier is "best".
+ * NOTE(review): `lastUpdated` is a fixed timestamp literal, not derived from the eval data.
+ */
+function buildRecommendation(
+  role: EngineerRole,
+  candidates: ModelCandidate[],
+  totalEvalRuns: number,
+  totalExercises: number,
+): RoleRecommendation {
+  const allCandidates = candidates.slice().sort((left, right) => right.compositeScore - left.compositeScore)
+  const best = allCandidates.filter(({ tier }) => tier === "best").slice(0, 3)
+  const budgetHire = findBudgetHire(allCandidates)
+  const speedHire = findSpeedHire(allCandidates)
+  const lastUpdated = "2026-02-11T00:00:00Z"
+  return { roleId: role.id, role, lastUpdated, totalEvalRuns, totalExercises, best, budgetHire, speedHire, allCandidates }
+}
+
+// ── Pre-built Recommendations ──────────────────────────────────────────────
+// Keyed by the role id string that getRoleRecommendation receives; every role is built with totalEvalRuns = 27 and totalExercises = 120.
+const RECOMMENDATIONS: Record<string, RoleRecommendation> = {
+  junior: buildRecommendation(ENGINEER_ROLES[0]!, juniorCandidates, 27, 120),
+  senior: buildRecommendation(ENGINEER_ROLES[1]!, seniorCandidates, 27, 120),
+  staff: buildRecommendation(ENGINEER_ROLES[2]!, staffCandidates, 27, 120),
+  reviewer: buildRecommendation(ENGINEER_ROLES[3]!, reviewerCandidates, 27, 120),
+  autonomous: buildRecommendation(ENGINEER_ROLES[4]!, autonomousCandidates, 27, 120),
+}
+
+// ── Public API ─────────────────────────────────────────────────────────────
+
+/** Returns all engineer role configurations (the module's `ENGINEER_ROLES` array itself, not a copy). */
+export function getEngineerRoles(): EngineerRole[] {
+ return ENGINEER_ROLES
+}
+
+/** Looks up one engineer role by its `id`; yields `undefined` when no role in `ENGINEER_ROLES` matches. */
+export function getEngineerRole(roleId: string): EngineerRole | undefined {
+  return ENGINEER_ROLES.find(({ id }) => id === roleId)
+}
+
+/** Returns the full recommendation payload for a role, or `undefined` if not found (inherited names such as "constructor" count as not found). */
+export function getRoleRecommendation(roleId: string): RoleRecommendation | undefined {
+  return Object.prototype.hasOwnProperty.call(RECOMMENDATIONS, roleId) ? RECOMMENDATIONS[roleId] : undefined
+}
+
+/** Returns recommendation payloads for all roles, in the key order of `RECOMMENDATIONS`. */
+export function getAllRecommendations(): RoleRecommendation[] {
+ return Object.values(RECOMMENDATIONS)
+}
+
+/** Generates a Cloud signup URL pre-configured with the candidate's model settings; both the redirect and its nested query are percent-encoded. */
+export function getCloudSetupUrl(candidate: ModelCandidate): string {
+  const { modelId, provider, settings } = candidate
+  const setup = new URLSearchParams({ model: modelId, provider, temperature: String(settings.temperature) })
+  const params = new URLSearchParams({ redirect_url: `/cloud-agents/setup?${setup.toString()}` })
+  return `https://app.roocode.com/sign-up?${params.toString()}`
+}
+
+// ── Model Timeline Data ────────────────────────────────────────────────────
+// Historical model performance over time for the landing page chart.
+
+export type ModelTimelineEntry = {
+ modelName: string // human-readable model name, e.g. "GPT-4.1"
+ provider: string // lowercase provider key, e.g. "anthropic", "openai", "google"
+ releaseDate: string // ISO date (YYYY-MM-DD)
+ score: number // our eval score (total %)
+ costPerRun: number // total cost for the full eval run (presumably USD; confirm)
+}
+
+export const MODEL_TIMELINE: ModelTimelineEntry[] = [ // entries are listed in ascending releaseDate order
+ { modelName: "Claude 3.5 Sonnet", provider: "anthropic", releaseDate: "2025-06-20", score: 90, costPerRun: 24.98 },
+ { modelName: "GPT-4.1", provider: "openai", releaseDate: "2025-08-14", score: 91, costPerRun: 38.64 },
+ { modelName: "Claude 3.7 Sonnet", provider: "anthropic", releaseDate: "2025-09-15", score: 95, costPerRun: 37.58 },
+ { modelName: "Gemini 2.5 Pro", provider: "google", releaseDate: "2025-10-01", score: 96, costPerRun: 57.8 },
+ { modelName: "Claude Sonnet 4", provider: "anthropic", releaseDate: "2025-11-01", score: 98, costPerRun: 39.61 },
+ { modelName: "GPT-5 Mini", provider: "openai", releaseDate: "2025-12-01", score: 99, costPerRun: 3.34 },
+ { modelName: "Claude Sonnet 4.5", provider: "anthropic", releaseDate: "2026-01-15", score: 100, costPerRun: 38.43 },
+ { modelName: "GPT 5.2 (Med)", provider: "openai", releaseDate: "2026-01-20", score: 100, costPerRun: 12.5 },
+ { modelName: "Opus 4.6", provider: "anthropic", releaseDate: "2026-02-01", score: 100, costPerRun: 49.48 },
+ { modelName: "Gemini 3 Pro", provider: "google", releaseDate: "2026-02-05", score: 100, costPerRun: 33.06 },
+]
diff --git a/apps/web-roo-code/src/lib/objective-default-models-v1.ts b/apps/web-roo-code/src/lib/objective-default-models-v1.ts
new file mode 100644
index 00000000000..e60480d908c
--- /dev/null
+++ b/apps/web-roo-code/src/lib/objective-default-models-v1.ts
@@ -0,0 +1,227 @@
+import type { EvalOutcomeId } from "./eval-outcomes"
+
+type ObjectiveMetric = { score: number; costUsd: number; runtimeS: number } // one objective's result; units as the field names suggest (USD, seconds), TODO confirm
+
+type ModelObjectiveMetrics = { // one model's results, one ObjectiveMetric per objective
+ modelId: string
+ issueResolution: ObjectiveMetric
+ frontend: ObjectiveMetric
+ greenfield: ObjectiveMetric
+ testing: ObjectiveMetric
+ infoGathering: ObjectiveMetric
+}
+
+type EvalOptimizationModeV1 = "best" | "fastest" | "cost" // pickByMode: highest score, lowest runtimeS, or lowest costUsd
+
+type ObjectiveWeights = { // multiplier applied to each objective's metrics in getWeightedMetrics
+ issueResolution: number
+ frontend: number
+ greenfield: number
+ testing: number
+ infoGathering: number
+}
+
+type WeightedObjectiveMetrics = { score: number; costUsd: number; runtimeS: number } // weighted sums produced by getWeightedMetrics
+
+export type ObjectiveDefaultModelV1 = { // shape returned by pickObjectiveDefaultModelV1
+ modelId: string
+ weighted: WeightedObjectiveMetrics
+}
+
+const MODEL_METRICS_V1: ModelObjectiveMetrics[] = [ // static v1 table: one row per model, one { score, costUsd, runtimeS } per objective
+ {
+ modelId: "claude-opus-4-6",
+ issueResolution: { score: 74.8, costUsd: 0.56, runtimeS: 178 },
+ frontend: { score: 41.8, costUsd: 2.37, runtimeS: 602 },
+ greenfield: { score: 43.8, costUsd: 2.5, runtimeS: 388 },
+ testing: { score: 78.8, costUsd: 0.43, runtimeS: 138 },
+ infoGathering: { score: 80, costUsd: 1.33, runtimeS: 526 },
+ },
+ {
+ modelId: "GPT-5.2-Codex",
+ issueResolution: { score: 73.8, costUsd: 0.94, runtimeS: 438 },
+ frontend: { score: 35.9, costUsd: 2.97, runtimeS: 1434 },
+ greenfield: { score: 62.5, costUsd: 2.5, runtimeS: 838 },
+ testing: { score: 62.5, costUsd: 0.66, runtimeS: 343 },
+ infoGathering: { score: 70.9, costUsd: 1.66, runtimeS: 799 },
+ },
+ {
+ modelId: "claude-opus-4-5",
+ issueResolution: { score: 76.6, costUsd: 1.82, runtimeS: 325 },
+ frontend: { score: 41.2, costUsd: 2.54, runtimeS: 671 },
+ greenfield: { score: 37.5, costUsd: 4.65, runtimeS: 495 },
+ testing: { score: 78.5, costUsd: 1.38, runtimeS: 268 },
+ infoGathering: { score: 69.1, costUsd: 0.55, runtimeS: 97 },
+ },
+ {
+ modelId: "MiniMax-M2.5",
+ issueResolution: { score: 72.6, costUsd: 0.1, runtimeS: 455 },
+ frontend: { score: 25, costUsd: 0.15, runtimeS: 611 },
+ greenfield: { score: 50, costUsd: 0.16, runtimeS: 376 },
+ testing: { score: 68.1, costUsd: 0.07, runtimeS: 389 },
+ infoGathering: { score: 47.9, costUsd: 0.06, runtimeS: 716 },
+ },
+ {
+ modelId: "GPT-5.2",
+ issueResolution: { score: 74.6, costUsd: 0.86, runtimeS: 476 },
+ frontend: { score: 30.9, costUsd: 2.77, runtimeS: 1571 },
+ greenfield: { score: 18.8, costUsd: 0.71, runtimeS: 397 },
+ testing: { score: 73.2, costUsd: 0.56, runtimeS: 347 },
+ infoGathering: { score: 65.5, costUsd: 0.48, runtimeS: 189 },
+ },
+ {
+ modelId: "claude-sonnet-4-5",
+ issueResolution: { score: 74.2, costUsd: 1.19, runtimeS: 534 },
+ frontend: { score: 36.8, costUsd: 1.89, runtimeS: 787 },
+ greenfield: { score: 12.5, costUsd: 2.65, runtimeS: 744 },
+ testing: { score: 68.8, costUsd: 0.98, runtimeS: 488 },
+ infoGathering: { score: 58.8, costUsd: 0.38, runtimeS: 126 },
+ },
+ {
+ modelId: "Kimi-K2.5",
+ issueResolution: { score: 68.8, costUsd: 0.48, runtimeS: 707 },
+ frontend: { score: 32.8, costUsd: 1.58, runtimeS: 921 },
+ greenfield: { score: 18.8, costUsd: 0.96, runtimeS: 814 },
+ testing: { score: 61.9, costUsd: 0.42, runtimeS: 385 },
+ infoGathering: { score: 63.6, costUsd: 0.39, runtimeS: 602 },
+ },
+ {
+ modelId: "Gemini-3-Flash",
+ issueResolution: { score: 74.6, costUsd: 0.42, runtimeS: 343 },
+ frontend: { score: 22.1, costUsd: 0.8, runtimeS: 1152 },
+ greenfield: { score: 18.8, costUsd: 0.82, runtimeS: 399 },
+ testing: { score: 70.7, costUsd: 0.3, runtimeS: 213 },
+ infoGathering: { score: 58.8, costUsd: 0.38, runtimeS: 398 },
+ },
+ {
+ modelId: "DeepSeek-V3.2-Reasoner",
+ issueResolution: { score: 71.6, costUsd: 0.16, runtimeS: 1429 },
+ frontend: { score: 27.9, costUsd: 0.19, runtimeS: 1515 },
+ greenfield: { score: 31.2, costUsd: 0.12, runtimeS: 1411 },
+ testing: { score: 53.6, costUsd: 0.12, runtimeS: 1215 },
+ infoGathering: { score: 50.3, costUsd: 0.06, runtimeS: 427 },
+ },
+ {
+ modelId: "Gemini-3-Pro",
+ issueResolution: { score: 70.6, costUsd: 0.95, runtimeS: 343 },
+ frontend: { score: 36.8, costUsd: 1.46, runtimeS: 710 },
+ greenfield: { score: 12.5, costUsd: 2.68, runtimeS: 554 },
+ testing: { score: 68.6, costUsd: 1.01, runtimeS: 386 },
+ infoGathering: { score: 44.2, costUsd: 1.5, runtimeS: 1775 },
+ },
+ {
+ modelId: "MiniMax-M2.1",
+ issueResolution: { score: 68.8, costUsd: 0.14, runtimeS: 579 },
+ frontend: { score: 16.2, costUsd: 0.21, runtimeS: 1417 },
+ greenfield: { score: 25, costUsd: 0.33, runtimeS: 826 },
+ testing: { score: 61.4, costUsd: 0.11, runtimeS: 473 },
+ infoGathering: { score: 40.6, costUsd: 0.06, runtimeS: 641 },
+ },
+ {
+ modelId: "GLM-4.7",
+ issueResolution: { score: 73.4, costUsd: 0.56, runtimeS: 1007 },
+ frontend: { score: 22.1, costUsd: 0.66, runtimeS: 1519 },
+ greenfield: { score: 12.5, costUsd: 0.54, runtimeS: 578 },
+ testing: { score: 49.4, costUsd: 0.37, runtimeS: 744 },
+ infoGathering: { score: 53.9, costUsd: 0.46, runtimeS: 1138 },
+ },
+ {
+ modelId: "Kimi-K2-Thinking",
+ issueResolution: { score: 69.2, costUsd: 2, runtimeS: 1325 },
+ frontend: { score: 32.4, costUsd: 2.31, runtimeS: 1641 },
+ greenfield: { score: 18.8, costUsd: 6.78, runtimeS: 2314 },
+ testing: { score: 47.3, costUsd: 1.39, runtimeS: 1253 },
+ infoGathering: { score: 43.6, costUsd: 0.65, runtimeS: 279 },
+ },
+ {
+ modelId: "Qwen3-Coder-480B",
+ issueResolution: { score: 62.4, costUsd: 1.26, runtimeS: 680 },
+ frontend: { score: 23.5, costUsd: 2.09, runtimeS: 1006 },
+ greenfield: { score: 0, costUsd: 1.79, runtimeS: 924 },
+ testing: { score: 34.9, costUsd: 0.97, runtimeS: 626 },
+ infoGathering: { score: 33.9, costUsd: 0.28, runtimeS: 197 },
+ },
+]
+
+/**
+ * Returns how much each benchmark objective counts toward the given outcome; every weight set sums to 1.
+ * The values are intentionally opinionated: they exist to make the prototype feel realistic before
+ * real Roo Code Cloud evals are wired in.
+ * @throws Error when `outcomeId` is not one of the outcomes handled below.
+ */
+function getOutcomeWeights(outcomeId: EvalOutcomeId): ObjectiveWeights {
+  switch (outcomeId) {
+    case "review_guardrails": // Idea → Prototype
+      return { greenfield: 0.5, infoGathering: 0.35, frontend: 0.1, testing: 0.05, issueResolution: 0 }
+    case "prototype_to_pr": // Prototype → PR
+      return { greenfield: 0.35, testing: 0.35, issueResolution: 0.2, frontend: 0.1, infoGathering: 0 }
+    case "issue_to_pr": // Issue → PR
+      return { issueResolution: 0.4, testing: 0.3, infoGathering: 0.2, frontend: 0.1, greenfield: 0 }
+    case "sentry_triage": // Customer Escalation → Resolved
+      return { issueResolution: 0.55, infoGathering: 0.25, testing: 0.2, frontend: 0, greenfield: 0 }
+    case "repro_to_fix": // Bug Report → Fix
+      return { issueResolution: 0.45, testing: 0.4, infoGathering: 0.15, frontend: 0, greenfield: 0 }
+    case "paper_cuts": // Paper Cuts → Shipped
+      return { frontend: 0.6, issueResolution: 0.2, testing: 0.2, greenfield: 0, infoGathering: 0 }
+    default: // fail loudly here rather than return undefined and crash later inside getWeightedMetrics
+      throw new Error(`Unknown eval outcome: ${String(outcomeId)}`)
+  }
+}
+
+/**
+ * Collapses a model's five per-objective results into one weighted result.
+ * Each of `score`, `costUsd` and `runtimeS` is the sum over objectives of (objective value × objective weight).
+ * Nothing is normalized here, so the output is a weighted average only when the weights sum to 1.
+ * @param row - per-objective results for one model
+ * @param weights - multiplier for each objective
+ * @returns the weighted `score`, `costUsd` and `runtimeS`
+ */
+function getWeightedMetrics(row: ModelObjectiveMetrics, weights: ObjectiveWeights): WeightedObjectiveMetrics {
+  // Fixed order, so the floating-point additions run in the same sequence for every field.
+  const objectives = ["issueResolution", "frontend", "greenfield", "testing", "infoGathering"] as const
+  // Weighted sum of a single metric field across all objectives (no seed value: first term starts the sum).
+  const blend = (field: keyof WeightedObjectiveMetrics): number =>
+    objectives.map((name) => row[name][field] * weights[name]).reduce((sum, term) => sum + term)
+  return {
+    score: blend("score"),
+    costUsd: blend("costUsd"),
+    runtimeS: blend("runtimeS"),
+  }
+}
+
+
+/**
+ * Chooses one row according to the optimization mode.
+ * "fastest" and "cost" return the lowest runtime / cost among rows scoring at least QUALITY_FLOOR × the top
+ * score, so a very cheap or fast model that badly underperforms is not recommended. Any other mode returns
+ * the row with the highest weighted score. Ties keep the earliest row.
+ * `rows` must be non-empty: the reduce calls below have no initial value and throw on an empty array.
+ */
+function pickByMode(
+  rows: Array<{ modelId: string; weighted: WeightedObjectiveMetrics }>,
+  mode: EvalOptimizationModeV1,
+): { modelId: string; weighted: WeightedObjectiveMetrics } {
+  const QUALITY_FLOOR = 0.85
+  const topQuality = rows.reduce((lead, row) => (row.weighted.score > lead.weighted.score ? row : lead))
+  if (mode !== "fastest" && mode !== "cost") return topQuality
+
+  const minScore = topQuality.weighted.score * QUALITY_FLOOR
+  const eligible = rows.filter((row) => row.weighted.score >= minScore)
+  // Fall back to every row if the quality gate filters all of them out.
+  const pool = eligible.length === 0 ? rows : eligible
+  const metric = mode === "fastest" ? "runtimeS" : "costUsd"
+  return pool.reduce((lead, row) => (row.weighted[metric] < lead.weighted[metric] ? row : lead))
+}
+
+/** Weights every MODEL_METRICS_V1 row for the outcome, then selects one by `mode`; `null` only when the table has no rows. */
+export function pickObjectiveDefaultModelV1(
+  outcomeId: EvalOutcomeId,
+  mode: EvalOptimizationModeV1,
+): ObjectiveDefaultModelV1 | null {
+  const outcomeWeights = getOutcomeWeights(outcomeId)
+  const scored = MODEL_METRICS_V1.map((metrics) => {
+    const weighted = getWeightedMetrics(metrics, outcomeWeights)
+    return { modelId: metrics.modelId, weighted }
+  })
+  return scored.length === 0 ? null : pickByMode(scored, mode)
+}